diff options
Diffstat (limited to 'fs')
185 files changed, 4132 insertions, 4636 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 42e102e2e74a..85ff859d3af5 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -859,8 +859,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, static int v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned flags, umode_t mode, - int *opened) + struct file *file, unsigned flags, umode_t mode) { int err; u32 perm; @@ -917,7 +916,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, v9inode->writeback_fid = (void *) inode_fid; } mutex_unlock(&v9inode->v_mutex); - err = finish_open(file, dentry, generic_file_open, opened); + err = finish_open(file, dentry, generic_file_open); if (err) goto error; @@ -925,7 +924,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(d_inode(dentry), file); - *opened |= FILE_CREATED; + file->f_mode |= FMODE_CREATED; out: dput(res); return err; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 7f6ae21a27b3..4823e1c46999 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -241,8 +241,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, static int v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned flags, umode_t omode, - int *opened) + struct file *file, unsigned flags, umode_t omode) { int err = 0; kgid_t gid; @@ -352,13 +351,13 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, } mutex_unlock(&v9inode->v_mutex); /* Since we are opening a file, assign the open fid to the file */ - err = finish_open(file, dentry, generic_file_open, opened); + err = finish_open(file, dentry, generic_file_open); if (err) goto err_clunk_old_fid; file->private_data = ofid; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(inode, file); - 
*opened |= FILE_CREATED; + file->f_mode |= FMODE_CREATED; out: v9fs_put_acl(dacl, pacl); dput(res); diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index c836c425ca94..e91028d4340a 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -287,7 +287,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj) ADFS_I(inode)->mmu_private = inode->i_size; } - insert_inode_hash(inode); + inode_fake_hash(inode); out: return inode; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 71fa525d63a0..7e099a7a4eb1 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -291,6 +291,7 @@ static void destroy_inodecache(void) static const struct super_operations adfs_sops = { .alloc_inode = adfs_alloc_inode, .destroy_inode = adfs_destroy_inode, + .drop_inode = generic_delete_inode, .write_inode = adfs_write_inode, .put_super = adfs_put_super, .statfs = adfs_statfs, diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 7d623008157f..855bf2b79fed 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -822,6 +822,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, { struct afs_vnode *dvnode = AFS_FS_I(dir); struct inode *inode; + struct dentry *d; struct key *key; int ret; @@ -862,43 +863,17 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, afs_stat_v(dvnode, n_lookup); inode = afs_do_lookup(dir, dentry, key); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - if (ret == -ENOENT) { - inode = afs_try_auto_mntpt(dentry, dir); - if (!IS_ERR(inode)) { - key_put(key); - goto success; - } - - ret = PTR_ERR(inode); - } - - key_put(key); - if (ret == -ENOENT) { - d_add(dentry, NULL); - _leave(" = NULL [negative]"); - return NULL; - } - _leave(" = %d [do]", ret); - return ERR_PTR(ret); - } - dentry->d_fsdata = (void *)(unsigned long)dvnode->status.data_version; - - /* instantiate the dentry */ key_put(key); - if (IS_ERR(inode)) { - _leave(" = %ld", PTR_ERR(inode)); - return ERR_CAST(inode); + if (inode == ERR_PTR(-ENOENT)) { + inode = 
afs_try_auto_mntpt(dentry, dir); + } else { + dentry->d_fsdata = + (void *)(unsigned long)dvnode->status.data_version; } - -success: - d_add(dentry, inode); - _leave(" = 0 { ino=%lu v=%u }", - d_inode(dentry)->i_ino, - d_inode(dentry)->i_generation); - - return NULL; + d = d_splice_alias(inode, dentry); + if (!IS_ERR_OR_NULL(d)) + d->d_fsdata = dentry->d_fsdata; + return d; } /* diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 174e843f0633..1cde710a8013 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -83,7 +83,7 @@ struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir) out: _leave("= %d", ret); - return ERR_PTR(ret); + return ret == -ENOENT ? NULL : ERR_PTR(ret); } /* @@ -141,12 +141,6 @@ out_p: static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct afs_vnode *vnode; - struct inode *inode; - int ret; - - vnode = AFS_FS_I(dir); - _enter("%pd", dentry); ASSERTCMP(d_inode(dentry), ==, NULL); @@ -160,22 +154,7 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr memcmp(dentry->d_name.name, "@cell", 5) == 0) return afs_lookup_atcell(dentry); - inode = afs_try_auto_mntpt(dentry, dir); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - if (ret == -ENOENT) { - d_add(dentry, NULL); - _leave(" = NULL [negative]"); - return NULL; - } - _leave(" = %d [do]", ret); - return ERR_PTR(ret); - } - - d_add(dentry, inode); - _leave(" = 0 { ino=%lu v=%u }", - d_inode(dentry)->i_ino, d_inode(dentry)->i_generation); - return NULL; + return d_splice_alias(afs_try_auto_mntpt(dentry, dir), dentry); } const struct inode_operations afs_dynroot_inode_operations = { diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index a1b18082991b..183cc5418722 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -648,7 +648,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, trace_afs_notify_call(rxcall, call); call->need_attention = true; - u = 
__atomic_add_unless(&call->usage, 1, 0); + u = atomic_fetch_add_unless(&call->usage, 1, 0); if (u != 0) { trace_afs_call(call, afs_call_trace_wake, u, atomic_read(&call->net->nr_outstanding_calls), @@ -19,6 +19,7 @@ #include <linux/export.h> #include <linux/syscalls.h> #include <linux/backing-dev.h> +#include <linux/refcount.h> #include <linux/uio.h> #include <linux/sched/signal.h> @@ -167,13 +168,12 @@ struct fsync_iocb { struct poll_iocb { struct file *file; - __poll_t events; struct wait_queue_head *head; - - union { - struct wait_queue_entry wait; - struct work_struct work; - }; + __poll_t events; + bool woken; + bool cancelled; + struct wait_queue_entry wait; + struct work_struct work; }; struct aio_kiocb { @@ -191,6 +191,7 @@ struct aio_kiocb { struct list_head ki_list; /* the aio core uses this * for cancellation */ + refcount_t ki_refcnt; /* * If the aio_resfd field of the userspace iocb is not zero, @@ -215,9 +216,7 @@ static const struct address_space_operations aio_ctx_aops; static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) { - struct qstr this = QSTR_INIT("[aio]", 5); struct file *file; - struct path path; struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); @@ -226,31 +225,17 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) inode->i_mapping->private_data = ctx; inode->i_size = PAGE_SIZE * nr_pages; - path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this); - if (!path.dentry) { + file = alloc_file_pseudo(inode, aio_mnt, "[aio]", + O_RDWR, &aio_ring_fops); + if (IS_ERR(file)) iput(inode); - return ERR_PTR(-ENOMEM); - } - path.mnt = mntget(aio_mnt); - - d_instantiate(path.dentry, inode); - file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops); - if (IS_ERR(file)) { - path_put(&path); - return file; - } - - file->f_flags = O_RDWR; return file; } static struct dentry *aio_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void 
*data) { - static const struct dentry_operations ops = { - .d_dname = simple_dname, - }; - struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops, + struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, NULL, AIO_RING_MAGIC); if (!IS_ERR(root)) @@ -1028,6 +1013,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) percpu_ref_get(&ctx->reqs); INIT_LIST_HEAD(&req->ki_list); + refcount_set(&req->ki_refcnt, 0); req->ki_ctx = ctx; return req; out_put: @@ -1062,6 +1048,15 @@ out: return ret; } +static inline void iocb_put(struct aio_kiocb *iocb) +{ + if (refcount_read(&iocb->ki_refcnt) == 0 || + refcount_dec_and_test(&iocb->ki_refcnt)) { + percpu_ref_put(&iocb->ki_ctx->reqs); + kmem_cache_free(kiocb_cachep, iocb); + } +} + /* aio_complete * Called when the io request on the given iocb is complete. */ @@ -1131,8 +1126,6 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) eventfd_ctx_put(iocb->ki_eventfd); } - kmem_cache_free(kiocb_cachep, iocb); - /* * We have to order our ring_info tail store above and test * of the wait list below outside the wait lock. 
This is @@ -1143,8 +1136,7 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - - percpu_ref_put(&ctx->reqs); + iocb_put(iocb); } /* aio_read_events_ring @@ -1590,6 +1582,7 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)) return -EINVAL; + req->file = fget(iocb->aio_fildes); if (unlikely(!req->file)) return -EBADF; @@ -1604,46 +1597,58 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) return 0; } -/* need to use list_del_init so we can check if item was present */ -static inline bool __aio_poll_remove(struct poll_iocb *req) +static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) { - if (list_empty(&req->wait.entry)) - return false; - list_del_init(&req->wait.entry); - return true; -} + struct file *file = iocb->poll.file; -static inline void __aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) -{ - fput(iocb->poll.file); aio_complete(iocb, mangle_poll(mask), 0); + fput(file); } -static void aio_poll_work(struct work_struct *work) +static void aio_poll_complete_work(struct work_struct *work) { - struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, poll.work); + struct poll_iocb *req = container_of(work, struct poll_iocb, work); + struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); + struct poll_table_struct pt = { ._key = req->events }; + struct kioctx *ctx = iocb->ki_ctx; + __poll_t mask = 0; - if (!list_empty_careful(&iocb->ki_list)) - aio_remove_iocb(iocb); - __aio_poll_complete(iocb, iocb->poll.events); + if (!READ_ONCE(req->cancelled)) + mask = vfs_poll(req->file, &pt) & req->events; + + /* + * Note that ->ki_cancel callers also delete iocb from active_reqs after + * calling ->ki_cancel. We need the ctx_lock roundtrip here to + * synchronize with them. 
In the cancellation case the list_del_init + * itself is not actually needed, but harmless so we keep it in to + * avoid further branches in the fast path. + */ + spin_lock_irq(&ctx->ctx_lock); + if (!mask && !READ_ONCE(req->cancelled)) { + add_wait_queue(req->head, &req->wait); + spin_unlock_irq(&ctx->ctx_lock); + return; + } + list_del_init(&iocb->ki_list); + spin_unlock_irq(&ctx->ctx_lock); + + aio_poll_complete(iocb, mask); } +/* assumes we are called with irqs disabled */ static int aio_poll_cancel(struct kiocb *iocb) { struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); struct poll_iocb *req = &aiocb->poll; - struct wait_queue_head *head = req->head; - bool found = false; - - spin_lock(&head->lock); - found = __aio_poll_remove(req); - spin_unlock(&head->lock); - if (found) { - req->events = 0; - INIT_WORK(&req->work, aio_poll_work); - schedule_work(&req->work); + spin_lock(&req->head->lock); + WRITE_ONCE(req->cancelled, true); + if (!list_empty(&req->wait.entry)) { + list_del_init(&req->wait.entry); + schedule_work(&aiocb->poll.work); } + spin_unlock(&req->head->lock); + return 0; } @@ -1652,44 +1657,59 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, { struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); - struct file *file = req->file; __poll_t mask = key_to_poll(key); - assert_spin_locked(&req->head->lock); + req->woken = true; /* for instances that support it check for an event match first: */ - if (mask && !(mask & req->events)) - return 0; + if (mask) { + if (!(mask & req->events)) + return 0; + + /* try to complete the iocb inline if we can: */ + if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { + list_del(&iocb->ki_list); + spin_unlock(&iocb->ki_ctx->ctx_lock); + + list_del_init(&req->wait.entry); + aio_poll_complete(iocb, mask); + return 1; + } + } - mask = file->f_op->poll_mask(file, req->events) & req->events; - if 
(!mask) - return 0; + list_del_init(&req->wait.entry); + schedule_work(&req->work); + return 1; +} - __aio_poll_remove(req); +struct aio_poll_table { + struct poll_table_struct pt; + struct aio_kiocb *iocb; + int error; +}; - /* - * Try completing without a context switch if we can acquire ctx_lock - * without spinning. Otherwise we need to defer to a workqueue to - * avoid a deadlock due to the lock order. - */ - if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { - list_del_init(&iocb->ki_list); - spin_unlock(&iocb->ki_ctx->ctx_lock); +static void +aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt); - __aio_poll_complete(iocb, mask); - } else { - req->events = mask; - INIT_WORK(&req->work, aio_poll_work); - schedule_work(&req->work); + /* multiple wait queues per file are not supported */ + if (unlikely(pt->iocb->poll.head)) { + pt->error = -EINVAL; + return; } - return 1; + pt->error = 0; + pt->iocb->poll.head = head; + add_wait_queue(head, &pt->iocb->poll.wait); } static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) { struct kioctx *ctx = aiocb->ki_ctx; struct poll_iocb *req = &aiocb->poll; + struct aio_poll_table apt; __poll_t mask; /* reject any unknown events outside the normal event mask. 
*/ @@ -1699,40 +1719,58 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags) return -EINVAL; + INIT_WORK(&req->work, aio_poll_complete_work); req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; req->file = fget(iocb->aio_fildes); if (unlikely(!req->file)) return -EBADF; - if (!file_has_poll_mask(req->file)) - goto out_fail; - - req->head = req->file->f_op->get_poll_head(req->file, req->events); - if (!req->head) - goto out_fail; - if (IS_ERR(req->head)) { - mask = EPOLLERR; - goto done; - } + apt.pt._qproc = aio_poll_queue_proc; + apt.pt._key = req->events; + apt.iocb = aiocb; + apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ + + /* initialized the list so that we can do list_empty checks */ + INIT_LIST_HEAD(&req->wait.entry); init_waitqueue_func_entry(&req->wait, aio_poll_wake); - aiocb->ki_cancel = aio_poll_cancel; + + /* one for removal from waitqueue, one for this function */ + refcount_set(&aiocb->ki_refcnt, 2); + + mask = vfs_poll(req->file, &apt.pt) & req->events; + if (unlikely(!req->head)) { + /* we did not manage to set up a waitqueue, done */ + goto out; + } spin_lock_irq(&ctx->ctx_lock); spin_lock(&req->head->lock); - mask = req->file->f_op->poll_mask(req->file, req->events) & req->events; - if (!mask) { - __add_wait_queue(req->head, &req->wait); + if (req->woken) { + /* wake_up context handles the rest */ + mask = 0; + apt.error = 0; + } else if (mask || apt.error) { + /* if we get an error or a mask we are done */ + WARN_ON_ONCE(list_empty(&req->wait.entry)); + list_del_init(&req->wait.entry); + } else { + /* actually waiting for an event */ list_add_tail(&aiocb->ki_list, &ctx->active_reqs); + aiocb->ki_cancel = aio_poll_cancel; } spin_unlock(&req->head->lock); spin_unlock_irq(&ctx->ctx_lock); -done: + +out: + if (unlikely(apt.error)) { + fput(req->file); + return apt.error; + } + if (mask) - __aio_poll_complete(aiocb, mask); + 
aio_poll_complete(aiocb, mask); + iocb_put(aiocb); return 0; -out_fail: - fput(req->file); - return -EINVAL; /* same as no support for IOCB_CMD_POLL */ } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, @@ -2042,6 +2080,11 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, return ret; } +struct __aio_sigset { + const sigset_t __user *sigmask; + size_t sigsetsize; +}; + SYSCALL_DEFINE6(io_pgetevents, aio_context_t, ctx_id, long, min_nr, diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 3168ee4e77f4..91262c34b797 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -71,8 +71,6 @@ struct file *anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags) { - struct qstr this; - struct path path; struct file *file; if (IS_ERR(anon_inode_inode)) @@ -82,39 +80,23 @@ struct file *anon_inode_getfile(const char *name, return ERR_PTR(-ENOENT); /* - * Link the inode to a directory entry by creating a unique name - * using the inode sequence number. - */ - file = ERR_PTR(-ENOMEM); - this.name = name; - this.len = strlen(name); - this.hash = 0; - path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); - if (!path.dentry) - goto err_module; - - path.mnt = mntget(anon_inode_mnt); - /* * We know the anon_inode inode count is always greater than zero, * so ihold() is safe. 
*/ ihold(anon_inode_inode); - - d_instantiate(path.dentry, anon_inode_inode); - - file = alloc_file(&path, OPEN_FMODE(flags), fops); + file = alloc_file_pseudo(anon_inode_inode, anon_inode_mnt, name, + flags & (O_ACCMODE | O_NONBLOCK), fops); if (IS_ERR(file)) - goto err_dput; + goto err; + file->f_mapping = anon_inode_inode->i_mapping; - file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); file->private_data = priv; return file; -err_dput: - path_put(&path); -err_module: +err: + iput(anon_inode_inode); module_put(fops->owner); return file; } diff --git a/fs/attr.c b/fs/attr.c index e3d53bf12240..d22e8187477f 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -120,7 +120,6 @@ EXPORT_SYMBOL(setattr_prepare); * inode_newsize_ok - may this inode be truncated to a given size * @inode: the inode to be truncated * @offset: the new size to assign to the inode - * @Returns: 0 on success, -ve errno on failure * * inode_newsize_ok must be called with i_mutex held. * @@ -130,6 +129,8 @@ EXPORT_SYMBOL(setattr_prepare); * returned. @inode must be a file (not directory), with appropriate * permissions to allow truncate (inode_newsize_ok does NOT check these * conditions). + * + * Return: 0 on success, -ve errno on failure */ int inode_newsize_ok(const struct inode *inode, loff_t offset) { @@ -205,7 +206,7 @@ EXPORT_SYMBOL(setattr_copy); /** * notify_change - modify attributes of a filesytem object * @dentry: object affected - * @iattr: new attributes + * @attr: new attributes * @delegated_inode: returns inode, if the inode is delegated * * The caller must hold the i_mutex on the affected object. diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile index 43fedde15c26..1f85d35ec8b7 100644 --- a/fs/autofs/Makefile +++ b/fs/autofs/Makefile @@ -2,6 +2,6 @@ # Makefile for the linux autofs-filesystem routines. 
# -obj-$(CONFIG_AUTOFS_FS) += autofs.o +obj-$(CONFIG_AUTOFS_FS) += autofs4.o -autofs-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o +autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index ea4ca1445ab7..86eafda4a652 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -135,6 +135,15 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) cmd); goto out; } + } else { + unsigned int inr = _IOC_NR(cmd); + + if (inr == AUTOFS_DEV_IOCTL_OPENMOUNT_CMD || + inr == AUTOFS_DEV_IOCTL_REQUESTER_CMD || + inr == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) { + err = -EINVAL; + goto out; + } } err = 0; @@ -271,7 +280,8 @@ static int autofs_dev_ioctl_openmount(struct file *fp, dev_t devid; int err, fd; - /* param->path has already been checked */ + /* param->path has been checked in validate_dev_ioctl() */ + if (!param->openmount.devid) return -EINVAL; @@ -433,10 +443,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, dev_t devid; int err = -ENOENT; - if (param->size <= AUTOFS_DEV_IOCTL_SIZE) { - err = -EINVAL; - goto out; - } + /* param->path has been checked in validate_dev_ioctl() */ devid = sbi->sb->s_dev; @@ -521,10 +528,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, unsigned int devid, magic; int err = -ENOENT; - if (param->size <= AUTOFS_DEV_IOCTL_SIZE) { - err = -EINVAL; - goto out; - } + /* param->path has been checked in validate_dev_ioctl() */ name = param->path; type = param->ismountpoint.in.type; diff --git a/fs/autofs/init.c b/fs/autofs/init.c index cc9447e1903f..79ae07d9592f 100644 --- a/fs/autofs/init.c +++ b/fs/autofs/init.c @@ -23,7 +23,7 @@ static struct file_system_type autofs_fs_type = { .kill_sb = autofs_kill_sb, }; MODULE_ALIAS_FS("autofs"); -MODULE_ALIAS("autofs4"); +MODULE_ALIAS("autofs"); static int __init init_autofs_fs(void) { diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 
125e8bbd22a2..8035d2a44561 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -134,7 +134,7 @@ static int bad_inode_update_time(struct inode *inode, struct timespec64 *time, static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, struct file *file, unsigned int open_flag, - umode_t create_mode, int *opened) + umode_t create_mode) { return -EIO; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0ac456b52bdd..efae2fb0930a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1259,9 +1259,8 @@ static int load_elf_library(struct file *file) goto out_free_ph; } - len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + - ELF_MIN_ALIGN - 1); - bss = eppnt->p_memsz + eppnt->p_vaddr; + len = ELF_PAGEALIGN(eppnt->p_filesz + eppnt->p_vaddr); + bss = ELF_PAGEALIGN(eppnt->p_memsz + eppnt->p_vaddr); if (bss > len) { error = vm_brk(len, bss - len); if (error) @@ -1752,7 +1751,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, const struct user_regset *regset = &view->regsets[i]; do_thread_regset_writeback(t->task, regset); if (regset->core_note_type && regset->get && - (!regset->active || regset->active(t->task, regset))) { + (!regset->active || regset->active(t->task, regset) > 0)) { int ret; size_t size = regset_size(t->task, regset); void *data = kmalloc(size, GFP_KERNEL); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 4b5fff31ef27..aa4a7a23ff99 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -205,7 +205,7 @@ static int load_misc_binary(struct linux_binprm *bprm) goto error; if (fmt->flags & MISC_FMT_OPEN_FILE) { - interp_file = filp_clone_open(fmt->interp_file); + interp_file = file_clone_open(fmt->interp_file); if (!IS_ERR(interp_file)) deny_write_access(interp_file); } else { diff --git a/fs/block_dev.c b/fs/block_dev.c index 0dd87aaeb39a..aba25414231a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -221,7 +221,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, ret = 
bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) - return ret; + goto out; ret = bio.bi_iter.bi_size; if (iov_iter_rw(iter) == READ) { @@ -250,12 +250,13 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, put_page(bvec->bv_page); } - if (vecs != inline_vecs) - kfree(vecs); - if (unlikely(bio.bi_status)) ret = blk_status_to_errno(bio.bi_status); +out: + if (vecs != inline_vecs) + kfree(vecs); + bio_uninit(&bio); return ret; diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 15e1dfef56a5..3b66c957ea6f 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -30,23 +30,22 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: - BUG(); + return ERR_PTR(-EINVAL); } - size = btrfs_getxattr(inode, name, "", 0); + size = btrfs_getxattr(inode, name, NULL, 0); if (size > 0) { value = kzalloc(size, GFP_KERNEL); if (!value) return ERR_PTR(-ENOMEM); size = btrfs_getxattr(inode, name, value, size); } - if (size > 0) { + if (size > 0) acl = posix_acl_from_xattr(&init_user_ns, value, size); - } else if (size == -ERANGE || size == -ENODATA || size == 0) { + else if (size == -ENODATA || size == 0) acl = NULL; - } else { - acl = ERR_PTR(-EIO); - } + else + acl = ERR_PTR(size); kfree(value); return acl; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 0a8e2e29a66b..ae750b1574a2 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -925,7 +925,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); if (type == BTRFS_REF_TYPE_INVALID) - return -EINVAL; + return -EUCLEAN; offset = btrfs_extent_inline_ref_offset(leaf, iref); @@ -1793,7 +1793,7 @@ static int get_extent_inline_ref(unsigned long *ptr, *out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref, BTRFS_REF_TYPE_ANY); if (*out_type == BTRFS_REF_TYPE_INVALID) - return -EINVAL; + return -EUCLEAN; *ptr += btrfs_extent_inline_ref_size(*out_type); 
WARN_ON(*ptr > end); @@ -2225,7 +2225,7 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, fspath = init_data_container(total_bytes); if (IS_ERR(fspath)) - return (void *)fspath; + return ERR_CAST(fspath); ifp = kmalloc(sizeof(*ifp), GFP_KERNEL); if (!ifp) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7e075343daa5..1343ac57b438 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -178,7 +178,7 @@ struct btrfs_inode { struct btrfs_delayed_node *delayed_node; /* File creation time. */ - struct timespec i_otime; + struct timespec64 i_otime; /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index a3fdb4fe967d..833cf3c35b4d 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1539,7 +1539,12 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, } device = multi->stripes[0].dev; - block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev->bd_dev); + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || + !device->bdev || !device->name) + block_ctx_out->dev = NULL; + else + block_ctx_out->dev = btrfsic_dev_state_lookup( + device->bdev->bd_dev); block_ctx_out->dev_bytenr = multi->stripes[0].physical; block_ctx_out->start = bytenr; block_ctx_out->len = len; @@ -1624,7 +1629,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, bio = btrfs_io_bio_alloc(num_pages - i); bio_set_dev(bio, block_ctx->dev->bdev); bio->bi_iter.bi_sector = dev_bytenr >> 9; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_opf = REQ_OP_READ; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d3e447b45bf7..9bfa66592aa7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -5,7 +5,6 @@ #include <linux/kernel.h> #include <linux/bio.h> -#include <linux/buffer_head.h> 
#include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -14,10 +13,7 @@ #include <linux/init.h> #include <linux/string.h> #include <linux/backing-dev.h> -#include <linux/mpage.h> -#include <linux/swap.h> #include <linux/writeback.h> -#include <linux/bit_spinlock.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/log2.h> @@ -303,7 +299,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, struct bio *bio = NULL; struct compressed_bio *cb; unsigned long bytes_left; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; int pg_index = 0; struct page *page; u64 first_byte = disk_start; @@ -342,9 +337,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - submit = io_tree->ops->merge_bio_hook(page, 0, - PAGE_SIZE, - bio, 0); + submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); page->mapping = NULL; if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < @@ -613,7 +606,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->len = bio->bi_iter.bi_size; comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); - bio_set_op_attrs (comp_bio, REQ_OP_READ, 0); + comp_bio->bi_opf = REQ_OP_READ; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; refcount_set(&cb->pending_bios, 1); @@ -626,9 +619,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - submit = tree->ops->merge_bio_hook(page, 0, - PAGE_SIZE, - comp_bio, 0); + submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, + comp_bio, 0); page->mapping = NULL; if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < @@ -660,7 +652,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); - 
bio_set_op_attrs(comp_bio, REQ_OP_READ, 0); + comp_bio->bi_opf = REQ_OP_READ; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4bc326df472e..d436fb4c002e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -888,11 +888,7 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, btrfs_root_last_snapshot(&root->root_item) || btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) return 1; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) - return 1; -#endif + return 0; } @@ -3128,8 +3124,7 @@ again: * higher levels * */ -static void fixup_low_keys(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, +static void fixup_low_keys(struct btrfs_path *path, struct btrfs_disk_key *key, int level) { int i; @@ -3181,7 +3176,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, btrfs_set_item_key(eb, &disk_key, slot); btrfs_mark_buffer_dirty(eb); if (slot == 0) - fixup_low_keys(fs_info, path, &disk_key, 1); + fixup_low_keys(path, &disk_key, 1); } /* @@ -3359,17 +3354,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, root_add_used(root, fs_info->nodesize); - memzero_extent_buffer(c, 0, sizeof(struct btrfs_header)); btrfs_set_header_nritems(c, 1); - btrfs_set_header_level(c, level); - btrfs_set_header_bytenr(c, c->start); - btrfs_set_header_generation(c, trans->transid); - btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(c, root->root_key.objectid); - - write_extent_buffer_fsid(c, fs_info->fsid); - write_extent_buffer_chunk_tree_uuid(c, fs_info->chunk_tree_uuid); - btrfs_set_node_key(c, &lower_key, 0); btrfs_set_node_blockptr(c, 0, lower->start); lower_gen = btrfs_header_generation(lower); @@ -3498,15 +3483,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, return PTR_ERR(split); root_add_used(root, 
fs_info->nodesize); - - memzero_extent_buffer(split, 0, sizeof(struct btrfs_header)); - btrfs_set_header_level(split, btrfs_header_level(c)); - btrfs_set_header_bytenr(split, split->start); - btrfs_set_header_generation(split, trans->transid); - btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(split, root->root_key.objectid); - write_extent_buffer_fsid(split, fs_info->fsid); - write_extent_buffer_chunk_tree_uuid(split, fs_info->chunk_tree_uuid); + ASSERT(btrfs_header_level(c) == level); ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid); if (ret) { @@ -3945,7 +3922,7 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, clean_tree_block(fs_info, right); btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(fs_info, path, &disk_key, 1); + fixup_low_keys(path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -4292,15 +4269,6 @@ again: root_add_used(root, fs_info->nodesize); - memzero_extent_buffer(right, 0, sizeof(struct btrfs_header)); - btrfs_set_header_bytenr(right, right->start); - btrfs_set_header_generation(right, trans->transid); - btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(right, root->root_key.objectid); - btrfs_set_header_level(right, 0); - write_extent_buffer_fsid(right, fs_info->fsid); - write_extent_buffer_chunk_tree_uuid(right, fs_info->chunk_tree_uuid); - if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); @@ -4320,7 +4288,7 @@ again: path->nodes[0] = right; path->slots[0] = 0; if (path->slots[1] == 0) - fixup_low_keys(fs_info, path, &disk_key, 1); + fixup_low_keys(path, &disk_key, 1); } /* * We create a new leaf 'right' for the required ins_len and @@ -4642,7 +4610,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(fs_info, 
path, &disk_key, 1); + fixup_low_keys(path, &disk_key, 1); } item = btrfs_item_nr(slot); @@ -4744,7 +4712,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(fs_info, path, &disk_key, 1); + fixup_low_keys(path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@ -4886,7 +4854,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *parent = path->nodes[level]; u32 nritems; int ret; @@ -4919,7 +4886,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(fs_info, path, &disk_key, level + 1); + fixup_low_keys(path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); } @@ -5022,7 +4989,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(fs_info, path, &disk_key, 1); + fixup_low_keys(path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 118346aceea9..318be7864072 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -55,8 +55,6 @@ struct btrfs_ordered_sum; #define BTRFS_OLDEST_GENERATION 0ULL -#define BTRFS_COMPAT_EXTENT_TREE_V0 - /* * the max metadata block size. This limit is somewhat artificial, * but the memmove costs go through the roof for larger blocks. @@ -86,6 +84,14 @@ static const int btrfs_csum_sizes[] = { 4 }; #define BTRFS_DIRTY_METADATA_THRESH SZ_32M +/* + * Use large batch size to reduce overhead of metadata updates. 
On the reader + * side, we only read it when we are close to ENOSPC and the read overhead is + * mostly related to the number of CPUs, so it is OK to use arbitrary large + * value here. + */ +#define BTRFS_TOTAL_BYTES_PINNED_BATCH SZ_128M + #define BTRFS_MAX_EXTENT_SIZE SZ_128M @@ -342,8 +348,8 @@ struct btrfs_path { sizeof(struct btrfs_item)) struct btrfs_dev_replace { u64 replace_state; /* see #define above */ - u64 time_started; /* seconds since 1-Jan-1970 */ - u64 time_stopped; /* seconds since 1-Jan-1970 */ + time64_t time_started; /* seconds since 1-Jan-1970 */ + time64_t time_stopped; /* seconds since 1-Jan-1970 */ atomic64_t num_write_errors; atomic64_t num_uncorrectable_read_errors; @@ -359,8 +365,6 @@ struct btrfs_dev_replace { struct btrfs_device *srcdev; struct btrfs_device *tgtdev; - pid_t lock_owner; - atomic_t nesting_level; struct mutex lock_finishing_cancel_unmount; rwlock_t lock; atomic_t read_locks; @@ -1213,7 +1217,6 @@ struct btrfs_root { u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; - char *name; /* the dirty list is only used by non-reference counted roots */ struct list_head dirty_list; @@ -2428,32 +2431,6 @@ static inline u32 btrfs_file_extent_inline_item_len( return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } -/* this returns the number of file bytes represented by the inline item. 
- * If an item is compressed, this is the uncompressed size - */ -static inline u32 btrfs_file_extent_inline_len(const struct extent_buffer *eb, - int slot, - const struct btrfs_file_extent_item *fi) -{ - struct btrfs_map_token token; - - btrfs_init_map_token(&token); - /* - * return the space used on disk if this item isn't - * compressed or encoded - */ - if (btrfs_token_file_extent_compression(eb, fi, &token) == 0 && - btrfs_token_file_extent_encryption(eb, fi, &token) == 0 && - btrfs_token_file_extent_other_encoding(eb, fi, &token) == 0) { - return btrfs_file_extent_inline_item_len(eb, - btrfs_item_nr(slot)); - } - - /* otherwise use the ram bytes field */ - return btrfs_token_file_extent_ram_bytes(eb, fi, &token); -} - - /* btrfs_dev_stats_item */ static inline u64 btrfs_dev_stats_value(const struct extent_buffer *eb, const struct btrfs_dev_stats_item *ptr, @@ -2676,7 +2653,6 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 offset, u64 ram_bytes, struct btrfs_key *ins); int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, @@ -2716,15 +2692,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_fs_info *info); int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_make_block_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytes_used, - u64 type, u64 chunk_offset, u64 size); + u64 bytes_used, u64 type, u64 chunk_offset, + u64 size); void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 group_start, - struct extent_map 
*em); + u64 group_start, struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); @@ -2786,7 +2761,6 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); -void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); @@ -2803,8 +2777,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache); +int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); @@ -2812,8 +2785,7 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); -int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 type); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); @@ -2822,10 +2794,10 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int btrfs_start_write_no_snapshotting(struct btrfs_root *root); void btrfs_end_write_no_snapshotting(struct btrfs_root *root); void 
btrfs_wait_for_snapshot_creation(struct btrfs_root *root); -void check_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, const u64 type); +void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); u64 add_new_free_space(struct btrfs_block_group_cache *block_group, u64 start, u64 end); +void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, @@ -3011,16 +2983,14 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ -int btrfs_add_root_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - u64 root_id, u64 ref_id, u64 dirid, u64 sequence, - const char *name, int name_len); -int btrfs_del_root_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, - const char *name, int name_len); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 sequence, const char *name, + int name_len); +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 *sequence, const char *name, + int name_len); int btrfs_del_root(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, const struct btrfs_key *key); + const struct btrfs_key *key); int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_root_item *item); @@ -3196,7 +3166,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); -void btrfs_set_range_writeback(void *private_data, u64 start, u64 end); +void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault 
*vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); @@ -3452,7 +3422,7 @@ do { \ #ifdef CONFIG_BTRFS_ASSERT __cold -static inline void assfail(char *expr, char *file, int line) +static inline void assfail(const char *expr, const char *file, int line) { pr_err("assertion failed: %s, file: %s, line: %d\n", expr, file, line); @@ -3465,6 +3435,13 @@ static inline void assfail(char *expr, char *file, int line) #define ASSERT(expr) ((void)0) #endif +__cold +static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info) +{ + btrfs_err(fs_info, +"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); +} + __printf(5, 6) __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index fe6caa7e698b..f51b509f2d9b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1222,7 +1222,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); struct btrfs_path *path; @@ -1418,7 +1418,6 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) /* Will return 0 or -ENOMEM */ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, const char *name, int name_len, struct btrfs_inode *dir, struct btrfs_disk_key *disk_key, u8 type, @@ -1458,11 +1457,10 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, */ BUG_ON(ret); - mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { - btrfs_err(fs_info, + 
btrfs_err(trans->fs_info, "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", name_len, name, delayed_node->root->objectid, delayed_node->inode_id, ret); @@ -1495,7 +1493,6 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, } int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_inode *dir, u64 index) { struct btrfs_delayed_node *node; @@ -1511,7 +1508,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, item_key.type = BTRFS_DIR_INDEX_KEY; item_key.offset = index; - ret = btrfs_delete_delayed_insertion_item(fs_info, node, &item_key); + ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, + &item_key); if (!ret) goto end; @@ -1533,7 +1531,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&node->mutex); ret = __btrfs_add_delayed_deletion_item(node, item); if (unlikely(ret)) { - btrfs_err(fs_info, + btrfs_err(trans->fs_info, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", index, node->root->objectid, node->inode_id, ret); BUG(); @@ -1837,7 +1835,7 @@ release_node: int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; /* diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index ca7a97f3ab6b..33536cd681d4 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -86,14 +86,12 @@ static inline void btrfs_init_delayed_root( } int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, const char *name, int name_len, struct btrfs_inode *dir, struct btrfs_disk_key *disk_key, u8 type, u64 index); int 
btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_inode *dir, u64 index); int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 03dec673d12a..62ff545ba1f7 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -709,13 +709,13 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, * to make sure the delayed ref is eventually processed before this * transaction commits. */ -int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, struct btrfs_delayed_extent_op *extent_op, int *old_ref_mod, int *new_ref_mod) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -730,27 +730,33 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, if (!ref) return -ENOMEM; + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); + if (!head_ref) { + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + return -ENOMEM; + } + + if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && + is_fstree(ref_root)) { + record = kmalloc(sizeof(*record), GFP_NOFS); + if (!record) { + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + return -ENOMEM; + } + } + if (parent) ref_type = BTRFS_SHARED_BLOCK_REF_KEY; else ref_type = BTRFS_TREE_BLOCK_REF_KEY; + init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, ref_root, action, ref_type); ref->root = ref_root; ref->parent = parent; ref->level = level; - head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) - goto free_ref; - - if 
(test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - is_fstree(ref_root)) { - record = kmalloc(sizeof(*record), GFP_NOFS); - if (!record) - goto free_head_ref; - } - init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root, 0, action, false, is_system); head_ref->extent_op = extent_op; @@ -779,25 +785,18 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, btrfs_qgroup_trace_extent_post(fs_info, record); return 0; - -free_head_ref: - kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); -free_ref: - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - - return -ENOMEM; } /* * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. */ -int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, u64 reserved, int action, int *old_ref_mod, int *new_ref_mod) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index ea1aecb6a50d..d9f2a4ebd5db 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -234,14 +234,12 @@ static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *hea kmem_cache_free(btrfs_delayed_ref_head_cachep, head); } -int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, struct btrfs_delayed_extent_op *extent_op, int *old_ref_mod, int *new_ref_mod); -int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 
parent, u64 ref_root, u64 owner, u64 offset, u64 reserved, int action, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e2ba0419297a..dec01970d8c5 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -6,14 +6,9 @@ #include <linux/sched.h> #include <linux/bio.h> #include <linux/slab.h> -#include <linux/buffer_head.h> #include <linux/blkdev.h> -#include <linux/random.h> -#include <linux/iocontext.h> -#include <linux/capability.h> #include <linux/kthread.h> #include <linux/math64.h> -#include <asm/div64.h> #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -465,7 +460,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, * go to the tgtdev as well (refer to btrfs_map_block()). */ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; - dev_replace->time_started = get_seconds(); + dev_replace->time_started = ktime_get_real_seconds(); dev_replace->cursor_left = 0; dev_replace->committed_cursor_left = 0; dev_replace->cursor_left_last_write_of_item = 0; @@ -511,7 +506,7 @@ leave: dev_replace->srcdev = NULL; dev_replace->tgtdev = NULL; btrfs_dev_replace_write_unlock(dev_replace); - btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + btrfs_destroy_dev_replace_tgtdev(tgt_device); return ret; } @@ -618,7 +613,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; - dev_replace->time_stopped = get_seconds(); + dev_replace->time_stopped = ktime_get_real_seconds(); dev_replace->item_needs_writeback = 1; /* replace old device with new one in mapping tree */ @@ -637,7 +632,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + btrfs_destroy_dev_replace_tgtdev(tgt_device); 
btrfs_rm_dev_replace_unblocked(fs_info); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); @@ -663,7 +658,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->commit_bytes_used = src_device->bytes_used; - btrfs_assign_next_active_device(fs_info, src_device, tgt_device); + btrfs_assign_next_active_device(src_device, tgt_device); list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; @@ -672,11 +667,17 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_rm_dev_replace_blocked(fs_info); - btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device); + btrfs_rm_dev_replace_remove_srcdev(src_device); btrfs_rm_dev_replace_unblocked(fs_info); /* + * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will + * update on-disk dev stats value during commit transaction + */ + atomic_inc(&tgt_device->dev_stats_ccnt); + + /* * this is again a consistent state where no dev_replace procedure * is running, the target device is part of the filesystem, the * source device is not part of the filesystem anymore and its 1st @@ -807,7 +808,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) break; } dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; - dev_replace->time_stopped = get_seconds(); + dev_replace->time_stopped = ktime_get_real_seconds(); dev_replace->item_needs_writeback = 1; btrfs_dev_replace_write_unlock(dev_replace); btrfs_scrub_cancel(fs_info); @@ -826,7 +827,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) btrfs_dev_name(tgt_device)); if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + btrfs_destroy_dev_replace_tgtdev(tgt_device); leave: mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); @@ -848,7 +849,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) case 
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; - dev_replace->time_stopped = get_seconds(); + dev_replace->time_stopped = ktime_get_real_seconds(); dev_replace->item_needs_writeback = 1; btrfs_info(fs_info, "suspending dev_replace for unmount"); break; diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 39e9766d1cbd..a678b07fcf01 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -160,8 +160,8 @@ second_insert: } btrfs_release_path(path); - ret2 = btrfs_insert_delayed_dir_index(trans, root->fs_info, name, - name_len, dir, &disk_key, type, index); + ret2 = btrfs_insert_delayed_dir_index(trans, name, name_len, dir, + &disk_key, type, index); out_free: btrfs_free_path(path); if (ret) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 205092dc9390..5124c15705ce 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -5,8 +5,6 @@ #include <linux/fs.h> #include <linux/blkdev.h> -#include <linux/scatterlist.h> -#include <linux/swap.h> #include <linux/radix-tree.h> #include <linux/writeback.h> #include <linux/buffer_head.h> @@ -54,7 +52,6 @@ static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); -static void free_fs_root(struct btrfs_root *root); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); @@ -108,12 +105,9 @@ void __cold btrfs_end_io_wq_exit(void) */ struct async_submit_bio { void *private_data; - struct btrfs_fs_info *fs_info; struct bio *bio; extent_submit_bio_start_t *submit_bio_start; - extent_submit_bio_done_t *submit_bio_done; int mirror_num; - unsigned long bio_flags; /* * bio_offset is optional, can be used if the pages in the bio * can't tell us where in the file the bio should go @@ -212,7 +206,7 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode, struct page *page, size_t 
pg_offset, u64 start, u64 len, int create) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; @@ -615,8 +609,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - btrfs_err_rl(fs_info, "bad tree block start %llu %llu", - found_start, eb->start); + btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu", + eb->start, found_start); ret = -EIO; goto err; } @@ -628,8 +622,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "bad tree block level %d", - (int)btrfs_header_level(eb)); + btrfs_err(fs_info, "bad tree block level %d on %llu", + (int)btrfs_header_level(eb), eb->start); ret = -EIO; goto err; } @@ -779,7 +773,7 @@ static void run_one_async_done(struct btrfs_work *work) return; } - async->submit_bio_done(async->private_data, async->bio, async->mirror_num); + btrfs_submit_bio_done(async->private_data, async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -793,8 +787,7 @@ static void run_one_async_free(struct btrfs_work *work) blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, void *private_data, - extent_submit_bio_start_t *submit_bio_start, - extent_submit_bio_done_t *submit_bio_done) + extent_submit_bio_start_t *submit_bio_start) { struct async_submit_bio *async; @@ -803,16 +796,13 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, return BLK_STS_RESOURCE; async->private_data = private_data; - async->fs_info = fs_info; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; - 
async->submit_bio_done = submit_bio_done; btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start, run_one_async_done, run_one_async_free); - async->bio_flags = bio_flags; async->bio_offset = bio_offset; async->status = 0; @@ -851,24 +841,6 @@ static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, return btree_csum_one_bio(bio); } -static blk_status_t btree_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num) -{ - struct inode *inode = private_data; - blk_status_t ret; - - /* - * when we're called for a write, we're already in the async - * submission context. Just jump into btrfs_map_bio - */ - ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1); - if (ret) { - bio->bi_status = ret; - bio_endio(bio); - } - return ret; -} - static int check_async_write(struct btrfs_inode *bi) { if (atomic_read(&bi->sync_writers)) @@ -911,8 +883,7 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, bio_offset, private_data, - btree_submit_bio_start, - btree_submit_bio_done); + btree_submit_bio_start); } if (ret) @@ -961,8 +932,9 @@ static int btree_writepages(struct address_space *mapping, fs_info = BTRFS_I(mapping->host)->root->fs_info; /* this is a bit racy, but that's ok */ - ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, - BTRFS_DIRTY_METADATA_THRESH); + ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH, + fs_info->dirty_metadata_batch); if (ret < 0) return 0; } @@ -1181,7 +1153,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->highest_objectid = 0; root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; - root->name = NULL; root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); root->block_rsv = NULL; @@ -1292,15 +1263,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, 
goto fail; } - memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header)); - btrfs_set_header_bytenr(leaf, leaf->start); - btrfs_set_header_generation(leaf, trans->transid); - btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(leaf, objectid); root->node = leaf; - - write_extent_buffer_fsid(leaf, fs_info->fsid); - write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid); btrfs_mark_buffer_dirty(leaf); root->commit_root = btrfs_root_node(root); @@ -1374,14 +1337,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, return ERR_CAST(leaf); } - memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header)); - btrfs_set_header_bytenr(leaf, leaf->start); - btrfs_set_header_generation(leaf, trans->transid); - btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); root->node = leaf; - write_extent_buffer_fsid(root->node, fs_info->fsid); btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); return root; @@ -1546,7 +1503,7 @@ int btrfs_init_fs_root(struct btrfs_root *root) return 0; fail: - /* the caller is responsible to call free_fs_root */ + /* The caller is responsible to call btrfs_free_fs_root */ return ret; } @@ -1651,14 +1608,14 @@ again: ret = btrfs_insert_fs_root(fs_info, root); if (ret) { if (ret == -EEXIST) { - free_fs_root(root); + btrfs_free_fs_root(root); goto again; } goto fail; } return root; fail: - free_fs_root(root); + btrfs_free_fs_root(root); return ERR_PTR(ret); } @@ -1803,7 +1760,7 @@ static int transaction_kthread(void *arg) struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; u64 transid; - unsigned long now; + time64_t now; unsigned long delay; bool cannot_commit; @@ -1819,7 +1776,7 @@ static int transaction_kthread(void *arg) goto sleep; } - now = get_seconds(); + now = ktime_get_seconds(); if (cur->state < TRANS_STATE_BLOCKED && !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) && (now 
< cur->start_time || @@ -2196,8 +2153,6 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) { - fs_info->dev_replace.lock_owner = 0; - atomic_set(&fs_info->dev_replace.nesting_level, 0); mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); rwlock_init(&fs_info->dev_replace.lock); atomic_set(&fs_info->dev_replace.read_locks, 0); @@ -3075,6 +3030,13 @@ retry_root_backup: fs_info->generation = generation; fs_info->last_trans_committed = generation; + ret = btrfs_verify_dev_extents(fs_info); + if (ret) { + btrfs_err(fs_info, + "failed to verify dev extents against chunks: %d", + ret); + goto fail_block_groups; + } ret = btrfs_recover_balance(fs_info); if (ret) { btrfs_err(fs_info, "failed to recover balance: %d", ret); @@ -3875,10 +3837,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, __btrfs_remove_free_space_cache(root->free_ino_pinned); if (root->free_ino_ctl) __btrfs_remove_free_space_cache(root->free_ino_ctl); - free_fs_root(root); + btrfs_free_fs_root(root); } -static void free_fs_root(struct btrfs_root *root) +void btrfs_free_fs_root(struct btrfs_root *root) { iput(root->ino_cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); @@ -3890,15 +3852,9 @@ static void free_fs_root(struct btrfs_root *root) free_extent_buffer(root->commit_root); kfree(root->free_ino_ctl); kfree(root->free_ino_pinned); - kfree(root->name); btrfs_put_fs_root(root); } -void btrfs_free_fs_root(struct btrfs_root *root) -{ - free_fs_root(root); -} - int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { u64 root_objectid = 0; @@ -4104,10 +4060,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * This is a fast path so only do this check if we have sanity tests - * enabled. Normal people shouldn't be marking dummy buffers as dirty + * enabled. 
Normal people shouldn't be using umapped buffers as dirty * outside of the sanity tests. */ - if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags))) + if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) return; #endif root = BTRFS_I(buf->pages[0]->mapping->host)->root; @@ -4150,8 +4106,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info, if (flush_delayed) btrfs_balance_delayed_items(fs_info); - ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, - BTRFS_DIRTY_METADATA_THRESH); + ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH, + fs_info->dirty_metadata_batch); if (ret > 0) { balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping); } @@ -4563,21 +4520,11 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } -static struct btrfs_fs_info *btree_fs_info(void *private_data) -{ - struct inode *inode = private_data; - return btrfs_sb(inode->i_sb); -} - static const struct extent_io_ops btree_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, - /* note we're sharing with inode.c for the merge bio hook */ - .merge_bio_hook = btrfs_merge_bio_hook, .readpage_io_failed_hook = btree_io_failed_hook, - .set_range_writeback = btrfs_set_range_writeback, - .tree_fs_info = btree_fs_info, /* optional callbacks */ }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 1a3d277b027b..4cccba22640f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -120,8 +120,9 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, void *private_data, - extent_submit_bio_start_t *submit_bio_start, - extent_submit_bio_done_t *submit_bio_done); + extent_submit_bio_start_t *submit_bio_start); +blk_status_t 
btrfs_submit_bio_done(void *private_data, struct bio *bio, + int mirror_num); int btrfs_write_tree_block(struct extent_buffer *buf); void btrfs_wait_tree_block_writeback(struct extent_buffer *buf); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d9fe58c0080..de6f75f5547b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -52,24 +52,21 @@ enum { }; static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op); + struct btrfs_delayed_ref_node *node, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); -static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 flags, +static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, int force); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); @@ -220,9 +217,9 @@ static int add_excluded_extent(struct btrfs_fs_info *fs_info, return 0; } -static void free_excluded_extents(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache) +static void free_excluded_extents(struct btrfs_block_group_cache *cache) { + struct btrfs_fs_info *fs_info = cache->fs_info; u64 
start, end; start = cache->key.objectid; @@ -234,9 +231,9 @@ static void free_excluded_extents(struct btrfs_fs_info *fs_info, start, end, EXTENT_UPTODATE); } -static int exclude_super_stripes(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache) +static int exclude_super_stripes(struct btrfs_block_group_cache *cache) { + struct btrfs_fs_info *fs_info = cache->fs_info; u64 bytenr; u64 *logical; int stripe_len; @@ -558,7 +555,7 @@ static noinline void caching_thread(struct btrfs_work *work) caching_ctl->progress = (u64)-1; up_read(&fs_info->commit_root_sem); - free_excluded_extents(fs_info, block_group); + free_excluded_extents(block_group); mutex_unlock(&caching_ctl->mutex); wake_up(&caching_ctl->wait); @@ -666,7 +663,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, wake_up(&caching_ctl->wait); if (ret == 1) { put_caching_control(caching_ctl); - free_excluded_extents(fs_info, cache); + free_excluded_extents(cache); return 0; } } else { @@ -758,7 +755,8 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, space_info = __find_space_info(fs_info, flags); ASSERT(space_info); - percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); + percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); } /* @@ -870,18 +868,16 @@ search_again: num_refs = btrfs_extent_refs(leaf, ei); extent_flags = btrfs_extent_flags(leaf, ei); } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - struct btrfs_extent_item_v0 *ei0; - BUG_ON(item_size != sizeof(*ei0)); - ei0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item_v0); - num_refs = btrfs_extent_refs_v0(leaf, ei0); - /* FIXME: this isn't correct for data */ - extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; -#else - BUG(); -#endif + ret = -EINVAL; + btrfs_print_v0_err(fs_info); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); + + goto out_free; } + BUG_ON(num_refs 
== 0); } else { num_refs = 0; @@ -1039,89 +1035,6 @@ out_free: * tree block info structure. */ -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static int convert_extent_item_v0(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - u64 owner, u32 extra_size) -{ - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_extent_item *item; - struct btrfs_extent_item_v0 *ei0; - struct btrfs_extent_ref_v0 *ref0; - struct btrfs_tree_block_info *bi; - struct extent_buffer *leaf; - struct btrfs_key key; - struct btrfs_key found_key; - u32 new_size = sizeof(*item); - u64 refs; - int ret; - - leaf = path->nodes[0]; - BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - ei0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item_v0); - refs = btrfs_extent_refs_v0(leaf, ei0); - - if (owner == (u64)-1) { - while (1) { - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ret; - BUG_ON(ret > 0); /* Corruption */ - leaf = path->nodes[0]; - } - btrfs_item_key_to_cpu(leaf, &found_key, - path->slots[0]); - BUG_ON(key.objectid != found_key.objectid); - if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { - path->slots[0]++; - continue; - } - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - owner = btrfs_ref_objectid_v0(leaf, ref0); - break; - } - } - btrfs_release_path(path); - - if (owner < BTRFS_FIRST_FREE_OBJECTID) - new_size += sizeof(*bi); - - new_size -= sizeof(*ei0); - ret = btrfs_search_slot(trans, root, &key, path, - new_size + extra_size, 1); - if (ret < 0) - return ret; - BUG_ON(ret); /* Corruption */ - - btrfs_extend_item(fs_info, path, new_size); - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - btrfs_set_extent_refs(leaf, item, refs); - /* FIXME: get real generation */ - btrfs_set_extent_generation(leaf, item, 0); - if 
(owner < BTRFS_FIRST_FREE_OBJECTID) { - btrfs_set_extent_flags(leaf, item, - BTRFS_EXTENT_FLAG_TREE_BLOCK | - BTRFS_BLOCK_FLAG_FULL_BACKREF); - bi = (struct btrfs_tree_block_info *)(item + 1); - /* FIXME: get first key of the block */ - memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi)); - btrfs_set_tree_block_level(leaf, bi, (int)owner); - } else { - btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); - } - btrfs_mark_buffer_dirty(leaf); - return 0; -} -#endif - /* * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, * is_data == BTRFS_REF_TYPE_DATA, data type is requried, @@ -1216,13 +1129,12 @@ static int match_extent_data_ref(struct extent_buffer *leaf, } static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid, u64 owner, u64 offset) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = trans->fs_info->extent_root; struct btrfs_key key; struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; @@ -1251,17 +1163,6 @@ again: if (parent) { if (!ret) return 0; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - key.type = BTRFS_EXTENT_REF_V0_KEY; - btrfs_release_path(path); - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto fail; - } - if (!ret) - return 0; -#endif goto fail; } @@ -1304,13 +1205,12 @@ fail: } static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = trans->fs_info->extent_root; struct btrfs_key key; struct extent_buffer *leaf; u32 size; @@ -1384,7 +1284,6 @@ fail: } static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, int 
refs_to_drop, int *last_ref) { @@ -1406,13 +1305,10 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, ref2 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); num_refs = btrfs_shared_data_ref_count(leaf, ref2); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - num_refs = btrfs_ref_count_v0(leaf, ref0); -#endif + } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { + btrfs_print_v0_err(trans->fs_info); + btrfs_abort_transaction(trans, -EINVAL); + return -EINVAL; } else { BUG(); } @@ -1421,21 +1317,13 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, num_refs -= refs_to_drop; if (num_refs == 0) { - ret = btrfs_del_item(trans, fs_info->extent_root, path); + ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); else if (key.type == BTRFS_SHARED_DATA_REF_KEY) btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - else { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - btrfs_set_ref_count_v0(leaf, ref0, num_refs); - } -#endif btrfs_mark_buffer_dirty(leaf); } return ret; @@ -1453,6 +1341,8 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); if (iref) { /* * If type is invalid, we should have bailed out earlier than @@ -1475,13 +1365,6 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path, ref2 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); num_refs = btrfs_shared_data_ref_count(leaf, ref2); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - } 
else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - num_refs = btrfs_ref_count_v0(leaf, ref0); -#endif } else { WARN_ON(1); } @@ -1489,12 +1372,11 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path, } static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = trans->fs_info->extent_root; struct btrfs_key key; int ret; @@ -1510,20 +1392,10 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) ret = -ENOENT; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (ret == -ENOENT && parent) { - btrfs_release_path(path); - key.type = BTRFS_EXTENT_REF_V0_KEY; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -ENOENT; - } -#endif return ret; } static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid) @@ -1540,7 +1412,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, key.offset = root_objectid; } - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, + ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root, path, &key, 0); btrfs_release_path(path); return ret; @@ -1599,13 +1471,13 @@ static int find_next_key(struct btrfs_path *path, int level, */ static noinline_for_stack int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_extent_inline_ref **ref_ret, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int insert) { + struct btrfs_fs_info *fs_info = trans->fs_info; 
struct btrfs_root *root = fs_info->extent_root; struct btrfs_key key; struct extent_buffer *leaf; @@ -1635,8 +1507,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, extra_size = -1; /* - * Owner is our parent level, so we can just add one to get the level - * for the block we are interested in. + * Owner is our level, so we can just add one to get the level for the + * block we are interested in. */ if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { key.type = BTRFS_METADATA_ITEM_KEY; @@ -1684,23 +1556,12 @@ again: leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - if (!insert) { - err = -ENOENT; - goto out; - } - ret = convert_extent_item_v0(trans, fs_info, path, owner, - extra_size); - if (ret < 0) { - err = ret; - goto out; - } - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (unlikely(item_size < sizeof(*ei))) { + err = -EINVAL; + btrfs_print_v0_err(fs_info); + btrfs_abort_transaction(trans, err); + goto out; } -#endif - BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); @@ -1727,7 +1588,7 @@ again: iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); if (type == BTRFS_REF_TYPE_INVALID) { - err = -EINVAL; + err = -EUCLEAN; goto out; } @@ -1863,7 +1724,6 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, } static int lookup_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_extent_inline_ref **ref_ret, u64 bytenr, u64 num_bytes, u64 parent, @@ -1871,9 +1731,9 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, { int ret; - ret = lookup_inline_extent_backref(trans, fs_info, path, ref_ret, - bytenr, num_bytes, parent, - root_objectid, owner, offset, 0); + ret = 
lookup_inline_extent_backref(trans, path, ref_ret, bytenr, + num_bytes, parent, root_objectid, + owner, offset, 0); if (ret != -ENOENT) return ret; @@ -1881,12 +1741,11 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, *ref_ret = NULL; if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = lookup_tree_block_ref(trans, fs_info, path, bytenr, - parent, root_objectid); + ret = lookup_tree_block_ref(trans, path, bytenr, parent, + root_objectid); } else { - ret = lookup_extent_data_ref(trans, fs_info, path, bytenr, - parent, root_objectid, owner, - offset); + ret = lookup_extent_data_ref(trans, path, bytenr, parent, + root_objectid, owner, offset); } return ret; } @@ -1895,14 +1754,14 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, * helper to update/remove inline back ref */ static noinline_for_stack -void update_inline_extent_backref(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, +void update_inline_extent_backref(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, struct btrfs_delayed_extent_op *extent_op, int *last_ref) { - struct extent_buffer *leaf; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; struct btrfs_extent_data_ref *dref = NULL; struct btrfs_shared_data_ref *sref = NULL; @@ -1913,7 +1772,6 @@ void update_inline_extent_backref(struct btrfs_fs_info *fs_info, int type; u64 refs; - leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); @@ -1965,7 +1823,6 @@ void update_inline_extent_backref(struct btrfs_fs_info *fs_info, static noinline_for_stack int insert_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, @@ -1975,15 +1832,15 @@ int 
insert_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref; int ret; - ret = lookup_inline_extent_backref(trans, fs_info, path, &iref, - bytenr, num_bytes, parent, - root_objectid, owner, offset, 1); + ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, + num_bytes, parent, root_objectid, + owner, offset, 1); if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - update_inline_extent_backref(fs_info, path, iref, - refs_to_add, extent_op, NULL); + update_inline_extent_backref(path, iref, refs_to_add, + extent_op, NULL); } else if (ret == -ENOENT) { - setup_inline_extent_backref(fs_info, path, iref, parent, + setup_inline_extent_backref(trans->fs_info, path, iref, parent, root_objectid, owner, offset, refs_to_add, extent_op); ret = 0; @@ -1992,7 +1849,6 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, } static int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add) @@ -2000,18 +1856,17 @@ static int insert_extent_backref(struct btrfs_trans_handle *trans, int ret; if (owner < BTRFS_FIRST_FREE_OBJECTID) { BUG_ON(refs_to_add != 1); - ret = insert_tree_block_ref(trans, fs_info, path, bytenr, - parent, root_objectid); + ret = insert_tree_block_ref(trans, path, bytenr, parent, + root_objectid); } else { - ret = insert_extent_data_ref(trans, fs_info, path, bytenr, - parent, root_objectid, - owner, offset, refs_to_add); + ret = insert_extent_data_ref(trans, path, bytenr, parent, + root_objectid, owner, offset, + refs_to_add); } return ret; } static int remove_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_drop, int is_data, int *last_ref) @@ -2020,14 +1875,14 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, BUG_ON(!is_data && 
refs_to_drop != 1); if (iref) { - update_inline_extent_backref(fs_info, path, iref, - -refs_to_drop, NULL, last_ref); + update_inline_extent_backref(path, iref, -refs_to_drop, NULL, + last_ref); } else if (is_data) { - ret = remove_extent_data_ref(trans, fs_info, path, refs_to_drop, + ret = remove_extent_data_ref(trans, path, refs_to_drop, last_ref); } else { *last_ref = 1; - ret = btrfs_del_item(trans, fs_info->extent_root, path); + ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); } return ret; } @@ -2185,13 +2040,13 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, owner, offset, BTRFS_ADD_DELAYED_REF); if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, parent, root_objectid, (int)owner, BTRFS_ADD_DELAYED_REF, NULL, &old_ref_mod, &new_ref_mod); } else { - ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, parent, root_objectid, owner, offset, 0, BTRFS_ADD_DELAYED_REF, @@ -2207,8 +2062,41 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, return ret; } +/* + * __btrfs_inc_extent_ref - insert backreference for a given extent + * + * @trans: Handle of transaction + * + * @node: The delayed ref node used to get the bytenr/length for + * extent whose references are incremented. + * + * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/ + * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical + * bytenr of the parent block. Since new extents are always + * created with indirect references, this will only be the case + * when relocating a shared extent. In that case, root_objectid + * will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must + * be 0 + * + * @root_objectid: The id of the root where this modification has originated, + * this can be either one of the well-known metadata trees or + * the subvolume id which references this extent. 
+ * + * @owner: For data extents it is the inode number of the owning file. + * For metadata extents this parameter holds the level in the + * tree of the extent. + * + * @offset: For metadata extents the offset is ignored and is currently + * always passed as 0. For data extents it is the fileoffset + * this extent belongs to. + * + * @refs_to_add Number of references to add + * + * @extent_op Pointer to a structure, holding information necessary when + * updating a tree block's flags + * + */ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, @@ -2230,10 +2118,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path->reada = READA_FORWARD; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ - ret = insert_inline_extent_backref(trans, fs_info, path, bytenr, - num_bytes, parent, root_objectid, - owner, offset, - refs_to_add, extent_op); + ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, + parent, root_objectid, owner, + offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) goto out; @@ -2256,8 +2143,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ - ret = insert_extent_backref(trans, fs_info, path, bytenr, parent, - root_objectid, owner, offset, refs_to_add); + ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid, + owner, offset, refs_to_add); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -2266,7 +2153,6 @@ out: } static int run_delayed_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, int insert_reserved) @@ -2283,7 +2169,7 @@ static int 
run_delayed_data_ref(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ref = btrfs_delayed_node_to_data_ref(node); - trace_run_delayed_data_ref(fs_info, node, ref, node->action); + trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; @@ -2292,17 +2178,16 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { if (extent_op) flags |= extent_op->flags_to_set; - ret = alloc_reserved_file_extent(trans, fs_info, - parent, ref_root, flags, - ref->objectid, ref->offset, - &ins, node->ref_mod); + ret = alloc_reserved_file_extent(trans, parent, ref_root, + flags, ref->objectid, + ref->offset, &ins, + node->ref_mod); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent, - ref_root, ref->objectid, - ref->offset, node->ref_mod, - extent_op); + ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, + ref->objectid, ref->offset, + node->ref_mod, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, fs_info, node, parent, + ret = __btrfs_free_extent(trans, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, extent_op); @@ -2331,10 +2216,10 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, } static int run_delayed_extent_op(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_extent_op *extent_op) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key key; struct btrfs_path *path; struct btrfs_extent_item *ei; @@ -2400,18 +2285,14 @@ again: leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - ret = convert_extent_item_v0(trans, fs_info, path, (u64)-1, 0); - if (ret < 0) { - 
err = ret; - goto out; - } - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + + if (unlikely(item_size < sizeof(*ei))) { + err = -EINVAL; + btrfs_print_v0_err(fs_info); + btrfs_abort_transaction(trans, err); + goto out; } -#endif - BUG_ON(item_size < sizeof(*ei)); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); @@ -2422,7 +2303,6 @@ out: } static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, int insert_reserved) @@ -2433,14 +2313,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, u64 ref_root = 0; ref = btrfs_delayed_node_to_tree_ref(node); - trace_run_delayed_tree_ref(fs_info, node, ref, node->action); + trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; ref_root = ref->root; if (node->ref_mod != 1) { - btrfs_err(fs_info, + btrfs_err(trans->fs_info, "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", node->bytenr, node->ref_mod, node->action, ref_root, parent); @@ -2450,13 +2330,10 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, BUG_ON(!extent_op || !extent_op->update_flags); ret = alloc_reserved_tree_block(trans, node, extent_op); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, fs_info, node, - parent, ref_root, - ref->level, 0, 1, - extent_op); + ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, + ref->level, 0, 1, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, fs_info, node, - parent, ref_root, + ret = __btrfs_free_extent(trans, node, parent, ref_root, ref->level, 0, 1, extent_op); } else { BUG(); @@ -2466,7 +2343,6 @@ static int run_delayed_tree_ref(struct 
btrfs_trans_handle *trans, /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, int insert_reserved) @@ -2475,18 +2351,18 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, if (trans->aborted) { if (insert_reserved) - btrfs_pin_extent(fs_info, node->bytenr, + btrfs_pin_extent(trans->fs_info, node->bytenr, node->num_bytes, 1); return 0; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) - ret = run_delayed_tree_ref(trans, fs_info, node, extent_op, + ret = run_delayed_tree_ref(trans, node, extent_op, insert_reserved); else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) - ret = run_delayed_data_ref(trans, fs_info, node, extent_op, + ret = run_delayed_data_ref(trans, node, extent_op, insert_reserved); else BUG(); @@ -2528,7 +2404,6 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref } static int cleanup_extent_op(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; @@ -2542,21 +2417,22 @@ static int cleanup_extent_op(struct btrfs_trans_handle *trans, return 0; } spin_unlock(&head->lock); - ret = run_delayed_extent_op(trans, fs_info, head, extent_op); + ret = run_delayed_extent_op(trans, head, extent_op); btrfs_free_delayed_extent_op(extent_op); return ret ? 
ret : 1; } static int cleanup_ref_head(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head) { + + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; int ret; delayed_refs = &trans->transaction->delayed_refs; - ret = cleanup_extent_op(trans, fs_info, head); + ret = cleanup_extent_op(trans, head); if (ret < 0) { unselect_delayed_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); @@ -2598,8 +2474,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, flags = BTRFS_BLOCK_GROUP_METADATA; space_info = __find_space_info(fs_info, flags); ASSERT(space_info); - percpu_counter_add(&space_info->total_bytes_pinned, - -head->num_bytes); + percpu_counter_add_batch(&space_info->total_bytes_pinned, + -head->num_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); if (head->is_data) { spin_lock(&delayed_refs->lock); @@ -2705,7 +2582,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * up and move on to the next ref_head. */ if (!ref) { - ret = cleanup_ref_head(trans, fs_info, locked_ref); + ret = cleanup_ref_head(trans, locked_ref); if (ret > 0 ) { /* We dropped our lock, we need to loop. 
*/ ret = 0; @@ -2752,7 +2629,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, locked_ref->extent_op = NULL; spin_unlock(&locked_ref->lock); - ret = run_one_delayed_ref(trans, fs_info, ref, extent_op, + ret = run_one_delayed_ref(trans, ref, extent_op, must_insert_reserved); btrfs_free_delayed_extent_op(extent_op); @@ -3227,12 +3104,6 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = 1; item_size = btrfs_item_size_nr(leaf, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); - goto out; - } -#endif ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); if (item_size != sizeof(*ei) + @@ -4060,11 +3931,7 @@ static void update_space_info(struct btrfs_fs_info *info, u64 flags, struct btrfs_space_info *found; int factor; - if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; + factor = btrfs_bg_type_to_factor(flags); found = __find_space_info(info, flags); ASSERT(found); @@ -4289,7 +4156,7 @@ again: if (IS_ERR(trans)) return PTR_ERR(trans); - ret = do_chunk_alloc(trans, fs_info, alloc_target, + ret = do_chunk_alloc(trans, alloc_target, CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans); if (ret < 0) { @@ -4309,9 +4176,10 @@ again: * allocation, and no removed chunk in current transaction, * don't bother committing the transaction. 
*/ - have_pinned_space = percpu_counter_compare( + have_pinned_space = __percpu_counter_compare( &data_sinfo->total_bytes_pinned, - used + bytes - data_sinfo->total_bytes); + used + bytes - data_sinfo->total_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ @@ -4358,7 +4226,7 @@ commit_trans: data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); - return ret; + return 0; } int btrfs_check_data_free_space(struct inode *inode, @@ -4511,9 +4379,9 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) * for allocating a chunk, otherwise if it's false, reserve space necessary for * removing a chunk. */ -void check_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 type) +void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *info; u64 left; u64 thresh; @@ -4552,7 +4420,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, * the paths we visit in the chunk tree (they were already COWed * or created in the current transaction for example). */ - ret = btrfs_alloc_chunk(trans, fs_info, flags); + ret = btrfs_alloc_chunk(trans, flags); } if (!ret) { @@ -4573,11 +4441,13 @@ void check_system_chunk(struct btrfs_trans_handle *trans, * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. 
*/ -static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 flags, int force) +static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + int force) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *space_info; - int wait_for_alloc = 0; + bool wait_for_alloc = false; + bool should_alloc = false; int ret = 0; /* Don't re-enter if we're already allocating a chunk */ @@ -4587,45 +4457,44 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, space_info = __find_space_info(fs_info, flags); ASSERT(space_info); -again: - spin_lock(&space_info->lock); - if (force < space_info->force_alloc) - force = space_info->force_alloc; - if (space_info->full) { - if (should_alloc_chunk(fs_info, space_info, force)) - ret = -ENOSPC; - else - ret = 0; - spin_unlock(&space_info->lock); - return ret; - } - - if (!should_alloc_chunk(fs_info, space_info, force)) { - spin_unlock(&space_info->lock); - return 0; - } else if (space_info->chunk_alloc) { - wait_for_alloc = 1; - } else { - space_info->chunk_alloc = 1; - } - - spin_unlock(&space_info->lock); - - mutex_lock(&fs_info->chunk_mutex); + do { + spin_lock(&space_info->lock); + if (force < space_info->force_alloc) + force = space_info->force_alloc; + should_alloc = should_alloc_chunk(fs_info, space_info, force); + if (space_info->full) { + /* No more free physical space */ + if (should_alloc) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&space_info->lock); + return ret; + } else if (!should_alloc) { + spin_unlock(&space_info->lock); + return 0; + } else if (space_info->chunk_alloc) { + /* + * Someone is already allocating, so we need to block + * until this someone is finished and then loop to + * recheck if we should continue with our allocation + * attempt. 
+ */ + wait_for_alloc = true; + spin_unlock(&space_info->lock); + mutex_lock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); + } else { + /* Proceed with allocation */ + space_info->chunk_alloc = 1; + wait_for_alloc = false; + spin_unlock(&space_info->lock); + } - /* - * The chunk_mutex is held throughout the entirety of a chunk - * allocation, so once we've acquired the chunk_mutex we know that the - * other guy is done and we need to recheck and see if we should - * allocate. - */ - if (wait_for_alloc) { - mutex_unlock(&fs_info->chunk_mutex); - wait_for_alloc = 0; cond_resched(); - goto again; - } + } while (wait_for_alloc); + mutex_lock(&fs_info->chunk_mutex); trans->allocating_chunk = true; /* @@ -4651,9 +4520,9 @@ again: * Check if we have enough space in SYSTEM chunk because we may need * to update devices. */ - check_system_chunk(trans, fs_info, flags); + check_system_chunk(trans, flags); - ret = btrfs_alloc_chunk(trans, fs_info, flags); + ret = btrfs_alloc_chunk(trans, flags); trans->allocating_chunk = false; spin_lock(&space_info->lock); @@ -4703,6 +4572,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, u64 space_size; u64 avail; u64 used; + int factor; /* Don't overcommit when in mixed mode. 
*/ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) @@ -4737,10 +4607,8 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, * doesn't include the parity drive, so we don't have to * change the math */ - if (profile & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - avail >>= 1; + factor = btrfs_bg_type_to_factor(profile); + avail = div_u64(avail, factor); /* * If we aren't flushing all things, let us overcommit up to @@ -4912,8 +4780,9 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, return 0; /* See if there is enough pinned space to make this reservation */ - if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes) >= 0) + if (__percpu_counter_compare(&space_info->total_bytes_pinned, + bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) goto commit; /* @@ -4930,8 +4799,9 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, bytes -= delayed_rsv->size; spin_unlock(&delayed_rsv->lock); - if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes) < 0) { + if (__percpu_counter_compare(&space_info->total_bytes_pinned, + bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) { return -ENOSPC; } @@ -4984,7 +4854,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = PTR_ERR(trans); break; } - ret = do_chunk_alloc(trans, fs_info, + ret = do_chunk_alloc(trans, btrfs_metadata_alloc_profile(fs_info), CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans); @@ -5659,11 +5529,6 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, kfree(rsv); } -void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv) -{ - kfree(rsv); -} - int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush) @@ -6019,7 +5884,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) { - struct btrfs_fs_info *fs_info = 
btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned nr_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; @@ -6092,7 +5957,7 @@ out_fail: void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; num_bytes = ALIGN(num_bytes, fs_info->sectorsize); spin_lock(&inode->lock); @@ -6121,7 +5986,7 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned num_extents; spin_lock(&inode->lock); @@ -6219,12 +6084,8 @@ static int update_block_group(struct btrfs_trans_handle *trans, cache = btrfs_lookup_block_group(info, bytenr); if (!cache) return -ENOENT; - if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; + factor = btrfs_bg_type_to_factor(cache->flags); + /* * If this block group has free space cache written out, we * need to make sure to load it if we are removing space. 
This @@ -6268,8 +6129,9 @@ static int update_block_group(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(info, "pinned", cache->space_info->flags, num_bytes, 1); - percpu_counter_add(&cache->space_info->total_bytes_pinned, - num_bytes); + percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, + num_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); set_extent_dirty(info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); @@ -6279,7 +6141,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, if (list_empty(&cache->dirty_list)) { list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs); - trans->transaction->num_dirty_bgs++; + trans->transaction->num_dirty_bgs++; btrfs_get_block_group(cache); } spin_unlock(&trans->transaction->dirty_bgs_lock); @@ -6290,16 +6152,8 @@ static int update_block_group(struct btrfs_trans_handle *trans, * dirty list to avoid races between cleaner kthread and space * cache writeout. */ - if (!alloc && old_val == 0) { - spin_lock(&info->unused_bgs_lock); - if (list_empty(&cache->bg_list)) { - btrfs_get_block_group(cache); - trace_btrfs_add_unused_block_group(cache); - list_add_tail(&cache->bg_list, - &info->unused_bgs); - } - spin_unlock(&info->unused_bgs_lock); - } + if (!alloc && old_val == 0) + btrfs_mark_bg_unused(cache); btrfs_put_block_group(cache); total -= num_bytes; @@ -6347,7 +6201,8 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info, trace_btrfs_space_reservation(fs_info, "pinned", cache->space_info->flags, num_bytes, 1); - percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes); + percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, + num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); set_extent_dirty(fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; @@ -6711,7 +6566,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, trace_btrfs_space_reservation(fs_info, "pinned", space_info->flags, 
len, 0); space_info->max_extent_size = 0; - percpu_counter_add(&space_info->total_bytes_pinned, -len); + percpu_counter_add_batch(&space_info->total_bytes_pinned, + -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; @@ -6815,12 +6671,12 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) } static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *info, - struct btrfs_delayed_ref_node *node, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_ref_node *node, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extent_op) { + struct btrfs_fs_info *info = trans->fs_info; struct btrfs_key key; struct btrfs_path *path; struct btrfs_root *extent_root = info->extent_root; @@ -6852,9 +6708,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (is_data) skinny_metadata = false; - ret = lookup_extent_backref(trans, info, path, &iref, - bytenr, num_bytes, parent, - root_objectid, owner_objectid, + ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, + parent, root_objectid, owner_objectid, owner_offset); if (ret == 0) { extent_slot = path->slots[0]; @@ -6877,14 +6732,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, break; extent_slot--; } -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); - if (found_extent && item_size < sizeof(*ei)) - found_extent = 0; -#endif + if (!found_extent) { BUG_ON(iref); - ret = remove_extent_backref(trans, info, path, NULL, + ret = remove_extent_backref(trans, path, NULL, refs_to_drop, is_data, &last_ref); if (ret) { @@ -6957,42 +6808,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, 
extent_slot); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - BUG_ON(found_extent || extent_slot != path->slots[0]); - ret = convert_extent_item_v0(trans, info, path, owner_objectid, - 0); - if (ret < 0) { - btrfs_abort_transaction(trans, ret); - goto out; - } - - btrfs_release_path(path); - path->leave_spinning = 1; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - - ret = btrfs_search_slot(trans, extent_root, &key, path, - -1, 1); - if (ret) { - btrfs_err(info, - "umm, got %d back from search, was looking for %llu", - ret, bytenr); - btrfs_print_leaf(path->nodes[0]); - } - if (ret < 0) { - btrfs_abort_transaction(trans, ret); - goto out; - } - - extent_slot = path->slots[0]; - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, extent_slot); + if (unlikely(item_size < sizeof(*ei))) { + ret = -EINVAL; + btrfs_print_v0_err(info); + btrfs_abort_transaction(trans, ret); + goto out; } -#endif - BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && @@ -7028,9 +6849,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); } if (found_extent) { - ret = remove_extent_backref(trans, info, path, - iref, refs_to_drop, - is_data, &last_ref); + ret = remove_extent_backref(trans, path, iref, + refs_to_drop, is_data, + &last_ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -7172,7 +6993,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, root->root_key.objectid, btrfs_header_level(buf), 0, BTRFS_DROP_DELAYED_REF); - ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, + ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), @@ -7251,13 +7072,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, old_ref_mod = new_ref_mod = 0; ret = 0; } else if (owner < 
BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, parent, root_objectid, (int)owner, BTRFS_DROP_DELAYED_REF, NULL, &old_ref_mod, &new_ref_mod); } else { - ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, parent, root_objectid, owner, offset, 0, BTRFS_DROP_DELAYED_REF, @@ -7534,7 +7355,7 @@ search: * for the proper type. */ if (!block_group_bits(block_group, flags)) { - u64 extra = BTRFS_BLOCK_GROUP_DUP | + u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | @@ -7738,7 +7559,7 @@ unclustered_alloc: goto loop; } checks: - search_start = ALIGN(offset, fs_info->stripesize); + search_start = round_up(offset, fs_info->stripesize); /* move on to the next group */ if (search_start + num_bytes > @@ -7750,7 +7571,6 @@ checks: if (offset < search_start) btrfs_add_free_space(block_group, offset, search_start - offset); - BUG_ON(offset > search_start); ret = btrfs_add_reserved_bytes(block_group, ram_bytes, num_bytes, delalloc); @@ -7826,8 +7646,7 @@ loop: goto out; } - ret = do_chunk_alloc(trans, fs_info, flags, - CHUNK_ALLOC_FORCE); + ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE); /* * If we can't allocate a new chunk we've already looped @@ -8053,11 +7872,11 @@ int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, } static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod) { + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_extent_item *extent_item; struct btrfs_extent_inline_ref *iref; @@ -8231,7 +8050,6 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { - struct btrfs_fs_info *fs_info 
= root->fs_info; int ret; BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); @@ -8240,7 +8058,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, root->root_key.objectid, owner, offset, BTRFS_ADD_DELAYED_EXTENT); - ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, + ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, 0, root->root_key.objectid, owner, offset, ram_bytes, @@ -8254,10 +8072,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, * space cache bits as well */ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins) { + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_block_group_cache *block_group; struct btrfs_space_info *space_info; @@ -8285,15 +8103,15 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); - ret = alloc_reserved_file_extent(trans, fs_info, 0, root_objectid, - 0, owner, offset, ins, 1); + ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, + offset, ins, 1); btrfs_put_block_group(block_group); return ret; } static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, int level) + u64 bytenr, int level, u64 owner) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; @@ -8302,7 +8120,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (IS_ERR(buf)) return buf; - btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(fs_info, buf); @@ -8311,6 +8128,14 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_lock_blocking(buf); set_extent_buffer_uptodate(buf); + 
memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); + btrfs_set_header_level(buf, level); + btrfs_set_header_bytenr(buf, buf->start); + btrfs_set_header_generation(buf, trans->transid); + btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(buf, owner); + write_extent_buffer_fsid(buf, fs_info->fsid); + write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; /* @@ -8419,7 +8244,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - level); + level, root_objectid); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; @@ -8435,7 +8260,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, if (ret) goto out_unuse; - buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); + buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, + root_objectid); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_free_reserved; @@ -8467,7 +8293,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent, root_objectid, level, 0, BTRFS_ADD_DELAYED_EXTENT); - ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid, + ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, @@ -8499,7 +8325,6 @@ struct walk_control { int keep_locks; int reada_slot; int reada_count; - int for_reloc; }; #define DROP_REFERENCE 1 @@ -8819,7 +8644,7 @@ skip: } if (need_account) { - ret = btrfs_qgroup_trace_subtree(trans, root, next, + ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1); if (ret) { btrfs_err_rl(fs_info, @@ -8919,7 +8744,7 @@ static noinline int walk_up_proc(struct 
btrfs_trans_handle *trans, else ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, eb); + ret = btrfs_qgroup_trace_leaf_items(trans, eb); if (ret) { btrfs_err_rl(fs_info, "error %d accounting leaf items. Quota is out of sync, rescan required.", @@ -9136,7 +8961,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; - wc->for_reloc = for_reloc; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); while (1) { @@ -9199,7 +9023,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, if (err) goto out_end_trans; - ret = btrfs_del_root(trans, fs_info, &root->root_key); + ret = btrfs_del_root(trans, &root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); err = ret; @@ -9302,7 +9126,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; - wc->for_reloc = 1; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); while (1) { @@ -9417,10 +9240,10 @@ out: return ret; } -int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache) +int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache) { + struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_trans_handle *trans; u64 alloc_flags; int ret; @@ -9454,7 +9277,7 @@ again: */ alloc_flags = update_block_group_flags(fs_info, cache->flags); if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, fs_info, alloc_flags, + ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); /* * ENOSPC is allowed here, we may have enough space @@ -9471,8 +9294,7 @@ again: if (!ret) goto out; alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); - ret = do_chunk_alloc(trans, fs_info, alloc_flags, - CHUNK_ALLOC_FORCE); + ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; ret = inc_block_group_ro(cache, 0); @@ 
-9480,7 +9302,7 @@ out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { alloc_flags = update_block_group_flags(fs_info, cache->flags); mutex_lock(&fs_info->chunk_mutex); - check_system_chunk(trans, fs_info, alloc_flags); + check_system_chunk(trans, alloc_flags); mutex_unlock(&fs_info->chunk_mutex); } mutex_unlock(&fs_info->ro_block_group_mutex); @@ -9489,12 +9311,11 @@ out: return ret; } -int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 type) +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) { - u64 alloc_flags = get_alloc_profile(fs_info, type); + u64 alloc_flags = get_alloc_profile(trans->fs_info, type); - return do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE); + return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); } /* @@ -9520,13 +9341,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) continue; } - if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP)) - factor = 2; - else - factor = 1; - + factor = btrfs_bg_type_to_factor(block_group->flags); free_bytes += (block_group->key.offset - btrfs_block_group_used(&block_group->item)) * factor; @@ -9717,6 +9532,8 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, int ret = 0; struct btrfs_key found_key; struct extent_buffer *leaf; + struct btrfs_block_group_item bg; + u64 flags; int slot; ret = btrfs_search_slot(NULL, root, key, path, 0, 0); @@ -9751,8 +9568,32 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, "logical %llu len %llu found bg but no related chunk", found_key.objectid, found_key.offset); ret = -ENOENT; + } else if (em->start != found_key.objectid || + em->len != found_key.offset) { + btrfs_err(fs_info, + "block group %llu len %llu mismatch with chunk %llu len %llu", + found_key.objectid, found_key.offset, + em->start, em->len); + ret = -EUCLEAN; } else { - ret = 0; + read_extent_buffer(leaf, 
&bg, + btrfs_item_ptr_offset(leaf, slot), + sizeof(bg)); + flags = btrfs_block_group_flags(&bg) & + BTRFS_BLOCK_GROUP_TYPE_MASK; + + if (flags != (em->map_lookup->type & + BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(fs_info, +"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", + found_key.objectid, + found_key.offset, flags, + (BTRFS_BLOCK_GROUP_TYPE_MASK & + em->map_lookup->type)); + ret = -EUCLEAN; + } else { + ret = 0; + } } free_extent_map(em); goto out; @@ -9847,7 +9688,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) */ if (block_group->cached == BTRFS_CACHE_NO || block_group->cached == BTRFS_CACHE_ERROR) - free_excluded_extents(info, block_group); + free_excluded_extents(block_group); btrfs_remove_free_space_cache(block_group); ASSERT(block_group->cached != BTRFS_CACHE_STARTED); @@ -10003,6 +9844,62 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, return cache; } + +/* + * Iterate all chunks and verify that each of them has the corresponding block + * group + */ +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) +{ + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct btrfs_block_group_cache *bg; + u64 start = 0; + int ret = 0; + + while (1) { + read_lock(&map_tree->map_tree.lock); + /* + * lookup_extent_mapping will return the first extent map + * intersecting the range, so setting @len to 1 is enough to + * get the first chunk. 
+ */ + em = lookup_extent_mapping(&map_tree->map_tree, start, 1); + read_unlock(&map_tree->map_tree.lock); + if (!em) + break; + + bg = btrfs_lookup_block_group(fs_info, em->start); + if (!bg) { + btrfs_err(fs_info, + "chunk start=%llu len=%llu doesn't have corresponding block group", + em->start, em->len); + ret = -EUCLEAN; + free_extent_map(em); + break; + } + if (bg->key.objectid != em->start || + bg->key.offset != em->len || + (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != + (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(fs_info, +"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", + em->start, em->len, + em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, + bg->key.objectid, bg->key.offset, + bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + ret = -EUCLEAN; + free_extent_map(em); + btrfs_put_block_group(bg); + break; + } + start = em->start + em->len; + free_extent_map(em); + btrfs_put_block_group(bg); + } + return ret; +} + int btrfs_read_block_groups(struct btrfs_fs_info *info) { struct btrfs_path *path; @@ -10089,13 +9986,13 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) * info has super bytes accounted for, otherwise we'll think * we have more space than we actually do. */ - ret = exclude_super_stripes(info, cache); + ret = exclude_super_stripes(cache); if (ret) { /* * We may have excluded something, so call this just in * case. 
*/ - free_excluded_extents(info, cache); + free_excluded_extents(cache); btrfs_put_block_group(cache); goto error; } @@ -10110,14 +10007,14 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) if (found_key.offset == btrfs_block_group_used(&cache->item)) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - free_excluded_extents(info, cache); + free_excluded_extents(cache); } else if (btrfs_block_group_used(&cache->item) == 0) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; add_new_free_space(cache, found_key.objectid, found_key.objectid + found_key.offset); - free_excluded_extents(info, cache); + free_excluded_extents(cache); } ret = btrfs_add_block_group_cache(info, cache); @@ -10140,15 +10037,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) if (btrfs_chunk_readonly(info, cache->key.objectid)) { inc_block_group_ro(cache, 1); } else if (btrfs_block_group_used(&cache->item) == 0) { - spin_lock(&info->unused_bgs_lock); - /* Should always be true but just in case. 
*/ - if (list_empty(&cache->bg_list)) { - btrfs_get_block_group(cache); - trace_btrfs_add_unused_block_group(cache); - list_add_tail(&cache->bg_list, - &info->unused_bgs); - } - spin_unlock(&info->unused_bgs_lock); + ASSERT(list_empty(&cache->bg_list)); + btrfs_mark_bg_unused(cache); } } @@ -10176,7 +10066,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) btrfs_add_raid_kobjects(info); init_global_block_rsv(info); - ret = 0; + ret = check_chunk_block_group_mappings(info); error: btrfs_free_path(path); return ret; @@ -10206,8 +10096,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) sizeof(item)); if (ret) btrfs_abort_transaction(trans, ret); - ret = btrfs_finish_chunk_alloc(trans, fs_info, key.objectid, - key.offset); + ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset); if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group); @@ -10218,10 +10107,10 @@ next: trans->can_flush_pending_bgs = can_flush_pending_bgs; } -int btrfs_make_block_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytes_used, +int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *cache; int ret; @@ -10240,20 +10129,20 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; cache->needs_free_space = 1; - ret = exclude_super_stripes(fs_info, cache); + ret = exclude_super_stripes(cache); if (ret) { /* * We may have excluded something, so call this just in * case. 
*/ - free_excluded_extents(fs_info, cache); + free_excluded_extents(cache); btrfs_put_block_group(cache); return ret; } add_new_free_space(cache, chunk_offset, chunk_offset + size); - free_excluded_extents(fs_info, cache); + free_excluded_extents(cache); #ifdef CONFIG_BTRFS_DEBUG if (btrfs_should_fragment_free_space(cache)) { @@ -10311,9 +10200,9 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 group_start, - struct extent_map *em) + u64 group_start, struct extent_map *em) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->extent_root; struct btrfs_path *path; struct btrfs_block_group_cache *block_group; @@ -10337,18 +10226,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * Free the reserved super bytes from this block group before * remove it. */ - free_excluded_extents(fs_info, block_group); + free_excluded_extents(block_group); btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, block_group->key.offset); memcpy(&key, &block_group->key, sizeof(key)); index = btrfs_bg_flags_to_raid_index(block_group->flags); - if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; + factor = btrfs_bg_type_to_factor(block_group->flags); /* make sure this block group isn't part of an allocation cluster */ cluster = &fs_info->data_alloc_cluster; @@ -10687,7 +10571,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); spin_lock(&block_group->lock); - if (block_group->reserved || + if (block_group->reserved || block_group->pinned || btrfs_block_group_used(&block_group->item) || block_group->ro || list_is_singular(&block_group->list)) { @@ -10764,8 +10648,9 @@ void btrfs_delete_unused_bgs(struct 
btrfs_fs_info *fs_info) space_info->bytes_pinned -= block_group->pinned; space_info->bytes_readonly += block_group->pinned; - percpu_counter_add(&space_info->total_bytes_pinned, - -block_group->pinned); + percpu_counter_add_batch(&space_info->total_bytes_pinned, + -block_group->pinned, + BTRFS_TOTAL_BYTES_PINNED_BATCH); block_group->pinned = 0; spin_unlock(&block_group->lock); @@ -10782,8 +10667,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * Btrfs_remove_chunk will abort the transaction if things go * horribly wrong. */ - ret = btrfs_remove_chunk(trans, fs_info, - block_group->key.objectid); + ret = btrfs_remove_chunk(trans, block_group->key.objectid); if (ret) { if (trimming) @@ -11066,3 +10950,16 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) !atomic_read(&root->will_be_snapshotted)); } } + +void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + trace_btrfs_add_unused_block_group(bg); + list_add_tail(&bg->bg_list, &fs_info->unused_bgs); + } + spin_unlock(&fs_info->unused_bgs_lock); +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cce6087d6880..628f1aef34b0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -140,14 +140,6 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits, static void flush_write_bio(struct extent_page_data *epd); -static inline struct btrfs_fs_info * -tree_fs_info(struct extent_io_tree *tree) -{ - if (tree->ops) - return tree->ops->tree_fs_info(tree->private_data); - return NULL; -} - int __init extent_io_init(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", @@ -564,8 +556,10 @@ alloc_extent_state_atomic(struct extent_state *prealloc) static void extent_io_tree_panic(struct extent_io_tree *tree, int err) { - btrfs_panic(tree_fs_info(tree), err, - "Locking error: Extent 
tree was modified by another thread while locked."); + struct inode *inode = tree->private_data; + + btrfs_panic(btrfs_sb(inode->i_sb), err, + "locking error: extent tree was modified by another thread while locked"); } /* @@ -1386,14 +1380,6 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) } } -/* - * helper function to set both pages and extents in the tree writeback - */ -static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) -{ - tree->ops->set_range_writeback(tree->private_data, start, end); -} - /* find the first state struct with 'bits' set after 'start', and * return it. tree->lock must be held. NULL will returned if * nothing was found after 'start' @@ -2059,7 +2045,7 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int mirror_num) { u64 start = eb->start; - unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); + int i, num_pages = num_extent_pages(eb); int ret = 0; if (sb_rdonly(fs_info->sb)) @@ -2398,7 +2384,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, start - page_offset(page), (int)phy_offset, failed_bio->bi_end_io, NULL); - bio_set_op_attrs(bio, REQ_OP_READ, read_mode); + bio->bi_opf = REQ_OP_READ | read_mode; btrfs_debug(btrfs_sb(inode->i_sb), "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", @@ -2790,8 +2776,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, else contig = bio_end_sector(bio) == sector; - if (tree->ops && tree->ops->merge_bio_hook(page, offset, - page_size, bio, bio_flags)) + if (tree->ops && btrfs_merge_bio_hook(page, offset, page_size, + bio, bio_flags)) can_merge = false; if (prev_bio_flags != bio_flags || !contig || !can_merge || @@ -3422,7 +3408,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, continue; } - set_range_writeback(tree, cur, cur + iosize - 1); + btrfs_set_range_writeback(tree, cur, cur + iosize 
- 1); if (!PageWriteback(page)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "page %lu not writeback, cur %llu end %llu", @@ -3538,7 +3524,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, struct btrfs_fs_info *fs_info, struct extent_page_data *epd) { - unsigned long i, num_pages; + int i, num_pages; int flush = 0; int ret = 0; @@ -3588,7 +3574,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, if (!ret) return ret; - num_pages = num_extent_pages(eb->start, eb->len); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; @@ -3712,13 +3698,13 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; u64 offset = eb->start; u32 nritems; - unsigned long i, num_pages; + int i, num_pages; unsigned long start, end; unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); - num_pages = num_extent_pages(eb->start, eb->len); + num_pages = num_extent_pages(eb); atomic_set(&eb->io_pages, num_pages); /* set btree blocks beyond nritems with 0 to avoid stale content. 
*/ @@ -4238,8 +4224,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) struct extent_map *em; u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; - struct extent_map_tree *map = &BTRFS_I(page->mapping->host)->extent_tree; + struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &btrfs_inode->io_tree; + struct extent_map_tree *map = &btrfs_inode->extent_tree; if (gfpflags_allow_blocking(mask) && page->mapping->host->i_size > SZ_16M) { @@ -4262,6 +4249,8 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) extent_map_end(em) - 1, EXTENT_LOCKED | EXTENT_WRITEBACK, 0, NULL)) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &btrfs_inode->runtime_flags); remove_extent_mapping(map, em); /* once for the rb tree */ free_extent_map(em); @@ -4542,8 +4531,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, offset_in_extent = em_start - em->start; em_end = extent_map_end(em); em_len = em_end - em_start; - disko = em->block_start + offset_in_extent; flags = 0; + if (em->block_start < EXTENT_MAP_LAST_BYTE) + disko = em->block_start + offset_in_extent; + else + disko = 0; /* * bump off for our next call to get_extent @@ -4637,23 +4629,20 @@ int extent_buffer_under_io(struct extent_buffer *eb) } /* - * Helper for releasing extent buffer page. + * Release all pages attached to the extent buffer. 
*/ -static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) +static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) { - unsigned long index; - struct page *page; - int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + int i; + int num_pages; + int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); BUG_ON(extent_buffer_under_io(eb)); - index = num_extent_pages(eb->start, eb->len); - if (index == 0) - return; + num_pages = num_extent_pages(eb); + for (i = 0; i < num_pages; i++) { + struct page *page = eb->pages[i]; - do { - index--; - page = eb->pages[index]; if (!page) continue; if (mapped) @@ -4685,7 +4674,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) /* One for when we allocated the page */ put_page(page); - } while (index != 0); + } } /* @@ -4693,7 +4682,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) */ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { - btrfs_release_extent_buffer_page(eb); + btrfs_release_extent_buffer_pages(eb); __free_extent_buffer(eb); } @@ -4737,10 +4726,10 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) { - unsigned long i; + int i; struct page *p; struct extent_buffer *new; - unsigned long num_pages = num_extent_pages(src->start, src->len); + int num_pages = num_extent_pages(src); new = __alloc_extent_buffer(src->fs_info, src->start, src->len); if (new == NULL) @@ -4760,7 +4749,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) } set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); - set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); + set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); return new; } @@ -4769,15 +4758,14 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len) { struct extent_buffer *eb; - unsigned long num_pages; - unsigned long i; - - 
num_pages = num_extent_pages(start, len); + int num_pages; + int i; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { eb->pages[i] = alloc_page(GFP_NOFS); if (!eb->pages[i]) @@ -4785,7 +4773,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, } set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); - set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; err: @@ -4837,11 +4825,11 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) static void mark_extent_buffer_accessed(struct extent_buffer *eb, struct page *accessed) { - unsigned long num_pages, i; + int num_pages, i; check_buffer_tree_ref(eb); - num_pages = num_extent_pages(eb->start, eb->len); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; @@ -4938,8 +4926,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { unsigned long len = fs_info->nodesize; - unsigned long num_pages = num_extent_pages(start, len); - unsigned long i; + int num_pages; + int i; unsigned long index = start >> PAGE_SHIFT; struct extent_buffer *eb; struct extent_buffer *exists = NULL; @@ -4961,6 +4949,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (!eb) return ERR_PTR(-ENOMEM); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); if (!p) { @@ -5003,8 +4992,11 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, uptodate = 0; /* - * see below about how we avoid a nasty race with release page - * and why we unlock later + * We can't unlock the pages just yet since the extent buffer + * hasn't been properly inserted in the radix tree, this + * opens a race with btree_releasepage which can free a page + * while we are still 
filling in all pages for the buffer and + * we could crash. */ } if (uptodate) @@ -5033,21 +5025,12 @@ again: set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); /* - * there is a race where release page may have - * tried to find this extent buffer in the radix - * but failed. It will tell the VM it is safe to - * reclaim the, and it will clear the page private bit. - * We must make sure to set the page private bit properly - * after the extent buffer is in the radix tree so - * it doesn't get lost + * Now it's safe to unlock the pages because any calls to + * btree_releasepage will correctly detect that a page belongs to a + * live buffer and won't free them prematurely. */ - SetPageChecked(eb->pages[0]); - for (i = 1; i < num_pages; i++) { - p = eb->pages[i]; - ClearPageChecked(p); - unlock_page(p); - } - unlock_page(eb->pages[0]); + for (i = 0; i < num_pages; i++) + unlock_page(eb->pages[i]); return eb; free_eb: @@ -5069,9 +5052,10 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) __free_extent_buffer(eb); } -/* Expects to have eb->eb_lock already held */ static int release_extent_buffer(struct extent_buffer *eb) { + lockdep_assert_held(&eb->refs_lock); + WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { @@ -5088,9 +5072,9 @@ static int release_extent_buffer(struct extent_buffer *eb) } /* Should be safe to release our pages at this point */ - btrfs_release_extent_buffer_page(eb); + btrfs_release_extent_buffer_pages(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) { + if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { __free_extent_buffer(eb); return 1; } @@ -5121,7 +5105,7 @@ void free_extent_buffer(struct extent_buffer *eb) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) == 2 && - test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) + test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) 
atomic_dec(&eb->refs); if (atomic_read(&eb->refs) == 2 && @@ -5153,11 +5137,11 @@ void free_extent_buffer_stale(struct extent_buffer *eb) void clear_extent_buffer_dirty(struct extent_buffer *eb) { - unsigned long i; - unsigned long num_pages; + int i; + int num_pages; struct page *page; - num_pages = num_extent_pages(eb->start, eb->len); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; @@ -5183,15 +5167,15 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) int set_extent_buffer_dirty(struct extent_buffer *eb) { - unsigned long i; - unsigned long num_pages; + int i; + int num_pages; int was_dirty = 0; check_buffer_tree_ref(eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - num_pages = num_extent_pages(eb->start, eb->len); + num_pages = num_extent_pages(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); @@ -5202,12 +5186,12 @@ int set_extent_buffer_dirty(struct extent_buffer *eb) void clear_extent_buffer_uptodate(struct extent_buffer *eb) { - unsigned long i; + int i; struct page *page; - unsigned long num_pages; + int num_pages; clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - num_pages = num_extent_pages(eb->start, eb->len); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (page) @@ -5217,12 +5201,12 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb) void set_extent_buffer_uptodate(struct extent_buffer *eb) { - unsigned long i; + int i; struct page *page; - unsigned long num_pages; + int num_pages; set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - num_pages = num_extent_pages(eb->start, eb->len); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; SetPageUptodate(page); @@ -5232,13 +5216,13 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, int wait, int 
mirror_num) { - unsigned long i; + int i; struct page *page; int err; int ret = 0; int locked_pages = 0; int all_uptodate = 1; - unsigned long num_pages; + int num_pages; unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; @@ -5246,7 +5230,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - num_pages = num_extent_pages(eb->start, eb->len); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (wait == WAIT_NONE) { @@ -5570,11 +5554,11 @@ void copy_extent_buffer_full(struct extent_buffer *dst, struct extent_buffer *src) { int i; - unsigned num_pages; + int num_pages; ASSERT(dst->len == src->len); - num_pages = num_extent_pages(dst->start, dst->len); + num_pages = num_extent_pages(dst); for (i = 0; i < num_pages; i++) copy_page(page_address(dst->pages[i]), page_address(src->pages[i])); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0bfd4aeb822d..b4d03e677e1d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -46,7 +46,7 @@ #define EXTENT_BUFFER_STALE 6 #define EXTENT_BUFFER_WRITEBACK 7 #define EXTENT_BUFFER_READ_ERR 8 /* read IO error */ -#define EXTENT_BUFFER_DUMMY 9 +#define EXTENT_BUFFER_UNMAPPED 9 #define EXTENT_BUFFER_IN_TREE 10 #define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */ @@ -92,9 +92,6 @@ typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio * typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct bio *bio, u64 bio_offset); -typedef blk_status_t (extent_submit_bio_done_t)(void *private_data, - struct bio *bio, int mirror_num); - struct extent_io_ops { /* * The following callbacks must be allways defined, the function @@ -104,12 +101,7 @@ struct extent_io_ops { int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror); - int (*merge_bio_hook)(struct page *page, unsigned 
long offset, - size_t size, struct bio *bio, - unsigned long bio_flags); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - struct btrfs_fs_info *(*tree_fs_info)(void *private_data); - void (*set_range_writeback)(void *private_data, u64 start, u64 end); /* * Optional hooks, called if the pointer is not NULL @@ -440,10 +432,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, int mirror_num); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); -static inline unsigned long num_extent_pages(u64 start, u64 len) +static inline int num_extent_pages(const struct extent_buffer *eb) { - return ((start + len + PAGE_SIZE - 1) >> PAGE_SHIFT) - - (start >> PAGE_SHIFT); + return (round_up(eb->start + eb->len, PAGE_SIZE) >> PAGE_SHIFT) - + (eb->start >> PAGE_SHIFT); } static inline void extent_buffer_get(struct extent_buffer *eb) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index f9dd6d1836a3..ba74827beb32 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -922,7 +922,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const bool new_inline, struct extent_map *em) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct extent_buffer *leaf = path->nodes[0]; const int slot = path->slots[0]; @@ -942,7 +942,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, btrfs_file_extent_num_bytes(leaf, fi); } else if (type == BTRFS_FILE_EXTENT_INLINE) { size_t size; - size = btrfs_file_extent_inline_len(leaf, slot, fi); + size = btrfs_file_extent_ram_bytes(leaf, fi); extent_end = ALIGN(extent_start + size, fs_info->sectorsize); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 51e77d72068a..2be00e873e92 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -5,14 +5,11 @@ #include <linux/fs.h> #include <linux/pagemap.h> -#include <linux/highmem.h> #include <linux/time.h> 
#include <linux/init.h> #include <linux/string.h> #include <linux/backing-dev.h> -#include <linux/mpage.h> #include <linux/falloc.h> -#include <linux/swap.h> #include <linux/writeback.h> #include <linux/compat.h> #include <linux/slab.h> @@ -83,7 +80,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1, static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, struct inode_defrag *defrag) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct inode_defrag *entry; struct rb_node **p; struct rb_node *parent = NULL; @@ -135,8 +132,8 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct inode_defrag *defrag; u64 transid; int ret; @@ -185,7 +182,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode, struct inode_defrag *defrag) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; if (!__need_auto_defrag(fs_info)) @@ -833,8 +830,7 @@ next_slot: btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, - path->slots[0], fi); + btrfs_file_extent_ram_bytes(leaf, fi); } else { /* can't happen */ BUG(); @@ -1133,7 +1129,7 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = inode->root; struct 
extent_buffer *leaf; struct btrfs_path *path; @@ -1470,7 +1466,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, u64 *lockstart, u64 *lockend, struct extent_state **cached_state) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 start_pos; u64 last_pos; int i; @@ -1526,7 +1522,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct btrfs_ordered_extent *ordered; u64 lockstart, lockend; @@ -1569,10 +1565,11 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, return ret; } -static noinline ssize_t __btrfs_buffered_write(struct file *file, - struct iov_iter *i, - loff_t pos) +static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, + struct iov_iter *i) { + struct file *file = iocb->ki_filp; + loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -1804,7 +1801,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - loff_t pos = iocb->ki_pos; + loff_t pos; ssize_t written; ssize_t written_buffered; loff_t endbyte; @@ -1815,8 +1812,8 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) if (written < 0 || !iov_iter_count(from)) return written; - pos += written; - written_buffered = __btrfs_buffered_write(file, from, pos); + pos = iocb->ki_pos; + written_buffered = btrfs_buffered_write(iocb, from); if (written_buffered < 0) { err = written_buffered; goto out; @@ -1953,7 +1950,7 @@ 
static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (iocb->ki_flags & IOCB_DIRECT) { num_written = __btrfs_direct_write(iocb, from); } else { - num_written = __btrfs_buffered_write(file, from, pos); + num_written = btrfs_buffered_write(iocb, from); if (num_written > 0) iocb->ki_pos = pos + num_written; if (clean_page) @@ -2042,7 +2039,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; - bool full_sync = false; u64 len; /* @@ -2066,96 +2062,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) inode_lock(inode); atomic_inc(&root->log_batch); - full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + /* - * We might have have had more pages made dirty after calling - * start_ordered_ops and before acquiring the inode's i_mutex. + * We have to do this here to avoid the priority inversion of waiting on + * IO of a lower priority task while holding a transaciton open. */ - if (full_sync) { - /* - * For a full sync, we need to make sure any ordered operations - * start and finish before we start logging the inode, so that - * all extents are persisted and the respective file extent - * items are in the fs/subvol btree. - */ - ret = btrfs_wait_ordered_range(inode, start, len); - } else { - /* - * Start any new ordered operations before starting to log the - * inode. We will wait for them to finish in btrfs_sync_log(). - * - * Right before acquiring the inode's mutex, we might have new - * writes dirtying pages, which won't immediately start the - * respective ordered operations - that is done through the - * fill_delalloc callbacks invoked from the writepage and - * writepages address space operations. So make sure we start - * all ordered operations before starting to log our inode. 
Not - * doing this means that while logging the inode, writeback - * could start and invoke writepage/writepages, which would call - * the fill_delalloc callbacks (cow_file_range, - * submit_compressed_extents). These callbacks add first an - * extent map to the modified list of extents and then create - * the respective ordered operation, which means in - * tree-log.c:btrfs_log_inode() we might capture all existing - * ordered operations (with btrfs_get_logged_extents()) before - * the fill_delalloc callback adds its ordered operation, and by - * the time we visit the modified list of extent maps (with - * btrfs_log_changed_extents()), we see and process the extent - * map they created. We then use the extent map to construct a - * file extent item for logging without waiting for the - * respective ordered operation to finish - this file extent - * item points to a disk location that might not have yet been - * written to, containing random data - so after a crash a log - * replay will make our inode have file extent items that point - * to disk locations containing invalid data, as we returned - * success to userspace without waiting for the respective - * ordered operation to finish, because it wasn't captured by - * btrfs_get_logged_extents(). - */ - ret = start_ordered_ops(inode, start, end); - } + ret = btrfs_wait_ordered_range(inode, start, len); if (ret) { inode_unlock(inode); goto out; } atomic_inc(&root->log_batch); - /* - * If the last transaction that changed this file was before the current - * transaction and we have the full sync flag set in our inode, we can - * bail out now without any syncing. - * - * Note that we can't bail out if the full sync flag isn't set. 
This is - * because when the full sync flag is set we start all ordered extents - * and wait for them to fully complete - when they complete they update - * the inode's last_trans field through: - * - * btrfs_finish_ordered_io() -> - * btrfs_update_inode_fallback() -> - * btrfs_update_inode() -> - * btrfs_set_inode_last_trans() - * - * So we are sure that last_trans is up to date and can do this check to - * bail out safely. For the fast path, when the full sync flag is not - * set in our inode, we can not do it because we start only our ordered - * extents and don't wait for them to complete (that is when - * btrfs_finish_ordered_io runs), so here at this point their last_trans - * value might be less than or equals to fs_info->last_trans_committed, - * and setting a speculative last_trans for an inode when a buffered - * write is made (such as fs_info->generation + 1 for example) would not - * be reliable since after setting the value and before fsync is called - * any number of transactions can start and commit (transaction kthread - * commits the current transaction periodically), and a transaction - * commit does not start nor waits for ordered extents to complete. 
- */ smp_mb(); if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || - (full_sync && BTRFS_I(inode)->last_trans <= - fs_info->last_trans_committed) || - (!btrfs_have_ordered_extents_in_range(inode, start, len) && - BTRFS_I(inode)->last_trans - <= fs_info->last_trans_committed)) { + BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2239,13 +2160,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } } - if (!full_sync) { - ret = btrfs_wait_ordered_range(inode, start, len); - if (ret) { - btrfs_end_transaction(trans); - goto out; - } - } ret = btrfs_commit_transaction(trans); } else { ret = btrfs_end_transaction(trans); @@ -2310,7 +2224,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, u64 offset, u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d5f80cb300be..0adf38b00fa0 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -71,10 +71,6 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, inode = btrfs_iget(fs_info->sb, &location, root, NULL); if (IS_ERR(inode)) return inode; - if (is_bad_inode(inode)) { - iput(inode); - return ERR_PTR(-ENOENT); - } mapping_set_gfp_mask(inode->i_mapping, mapping_gfp_constraint(inode->i_mapping, @@ -300,9 +296,9 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID) check_crcs = 1; - /* Make sure we can fit our crcs into the first page */ + /* Make sure we can fit our crcs and generation into the first page 
*/ if (write && check_crcs && - (num_pages * sizeof(u32)) >= PAGE_SIZE) + (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) return -ENOSPC; memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); @@ -547,7 +543,7 @@ static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) io_ctl_map_page(io_ctl, 0); } - memcpy(io_ctl->cur, bitmap, PAGE_SIZE); + copy_page(io_ctl->cur, bitmap); io_ctl_set_crc(io_ctl, io_ctl->index - 1); if (io_ctl->index < io_ctl->num_pages) io_ctl_map_page(io_ctl, 0); @@ -607,7 +603,7 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, if (ret) return ret; - memcpy(entry->bitmap, io_ctl->cur, PAGE_SIZE); + copy_page(entry->bitmap, io_ctl->cur); io_ctl_unmap_page(io_ctl); return 0; @@ -655,7 +651,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_free_space_header *header; struct extent_buffer *leaf; struct btrfs_io_ctl io_ctl; @@ -1123,13 +1119,10 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root, { int ret; struct inode *inode = io_ctl->inode; - struct btrfs_fs_info *fs_info; if (!inode) return 0; - fs_info = btrfs_sb(inode->i_sb); - /* Flush the dirty pages in the cache file. 
*/ ret = flush_dirty_cache(inode); if (ret) @@ -1145,7 +1138,7 @@ out: BTRFS_I(inode)->generation = 0; if (block_group) { #ifdef DEBUG - btrfs_err(fs_info, + btrfs_err(root->fs_info, "failed to write free space cache for block group %llu", block_group->key.objectid); #endif diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index b5950aacd697..d6736595ec57 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1236,7 +1236,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) if (ret) goto abort; - ret = btrfs_del_root(trans, fs_info, &free_space_root->root_key); + ret = btrfs_del_root(trans, &free_space_root->root_key); if (ret) goto abort; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 12fcd8897c33..ffca2abf13d0 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -3,7 +3,6 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ -#include <linux/delay.h> #include <linux/kthread.h> #include <linux/pagemap.h> @@ -244,8 +243,6 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) return; while (1) { - bool add_to_ctl = true; - spin_lock(rbroot_lock); n = rb_first(rbroot); if (!n) { @@ -257,15 +254,14 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) BUG_ON(info->bitmap); /* Logic error */ if (info->offset > root->ino_cache_progress) - add_to_ctl = false; - else if (info->offset + info->bytes > root->ino_cache_progress) - count = root->ino_cache_progress - info->offset + 1; + count = 0; else - count = info->bytes; + count = min(root->ino_cache_progress - info->offset + 1, + info->bytes); rb_erase(&info->offset_index, rbroot); spin_unlock(rbroot_lock); - if (add_to_ctl) + if (count) __btrfs_add_free_space(root->fs_info, ctl, info->offset, count); kmem_cache_free(btrfs_free_space_cachep, info); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e9482f0db9d0..9357a19d2bff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -14,17 +14,13 @@ #include <linux/init.h> #include 
<linux/string.h> #include <linux/backing-dev.h> -#include <linux/mpage.h> -#include <linux/swap.h> #include <linux/writeback.h> #include <linux/compat.h> -#include <linux/bit_spinlock.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/falloc.h> #include <linux/slab.h> #include <linux/ratelimit.h> -#include <linux/mount.h> #include <linux/btrfs.h> #include <linux/blkdev.h> #include <linux/posix_acl_xattr.h> @@ -1443,8 +1439,7 @@ next_slot: nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, - path->slots[0], fi); + btrfs_file_extent_ram_bytes(leaf, fi); extent_end = ALIGN(extent_end, fs_info->sectorsize); } else { @@ -1752,7 +1747,7 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root, void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = root->fs_info; if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); @@ -1903,8 +1898,8 @@ static void btrfs_clear_bit_hook(void *private_data, } /* - * extent_io.c merge_bio_hook, this must check the chunk tree to make sure - * we don't create bios that span stripes or chunks + * Merge bio hook, this must check the chunk tree to make sure we don't create + * bios that span stripes or chunks * * return 1 if page cannot be merged to bio * return 0 if page can be merged to bio @@ -1962,7 +1957,7 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, +blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num) { struct inode *inode = private_data; @@ -2035,8 +2030,7 @@ static blk_status_t btrfs_submit_bio_hook(void 
*private_data, struct bio *bio, /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags, bio_offset, inode, - btrfs_submit_bio_start, - btrfs_submit_bio_done); + btrfs_submit_bio_start); goto out; } else if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, 0, 0); @@ -3610,18 +3604,15 @@ static int btrfs_read_locked_inode(struct inode *inode) filled = true; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto make_bad; - } + if (!path) + return -ENOMEM; memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { - if (ret > 0) - ret = -ENOENT; - goto make_bad; + btrfs_free_path(path); + return ret; } leaf = path->nodes[0]; @@ -3774,11 +3765,6 @@ cache_acl: btrfs_sync_inode_flags_to_i_flags(inode); return 0; - -make_bad: - btrfs_free_path(path); - make_bad_inode(inode); - return ret; } /* @@ -3984,7 +3970,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; } skip_backref: - ret = btrfs_delete_delayed_dir_index(trans, fs_info, dir, index); + ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto err; @@ -4087,11 +4073,10 @@ out: } static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, u64 objectid, - const char *name, int name_len) + struct inode *dir, u64 objectid, + const char *name, int name_len) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dir_item *di; @@ -4124,9 +4109,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = btrfs_del_root_ref(trans, fs_info, objectid, - root->root_key.objectid, dir_ino, - &index, name, name_len); + ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, + dir_ino, 
&index, name, name_len); if (ret < 0) { if (ret != -ENOENT) { btrfs_abort_transaction(trans, ret); @@ -4145,12 +4129,11 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - btrfs_release_path(path); index = key.offset; } btrfs_release_path(path); - ret = btrfs_delete_delayed_dir_index(trans, fs_info, BTRFS_I(dir), index); + ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -4243,9 +4226,9 @@ again: prev = node; entry = rb_entry(node, struct btrfs_inode, rb_node); - if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode))) + if (objectid < btrfs_ino(entry)) node = node->rb_left; - else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode))) + else if (objectid > btrfs_ino(entry)) node = node->rb_right; else break; @@ -4253,7 +4236,7 @@ again: if (!node) { while (prev) { entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) { + if (objectid <= btrfs_ino(entry)) { node = prev; break; } @@ -4262,7 +4245,7 @@ again: } while (node) { entry = rb_entry(node, struct btrfs_inode, rb_node); - objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1; + objectid = btrfs_ino(entry) + 1; inode = igrab(&entry->vfs_inode); if (inode) { spin_unlock(&root->inode_lock); @@ -4343,10 +4326,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); - ret = btrfs_unlink_subvol(trans, root, dir, - dest->root_key.objectid, - dentry->d_name.name, - dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, dir, dest->root_key.objectid, + dentry->d_name.name, dentry->d_name.len); if (ret) { err = ret; btrfs_abort_transaction(trans, ret); @@ -4441,7 +4422,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return PTR_ERR(trans); if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) 
{ - err = btrfs_unlink_subvol(trans, root, dir, + err = btrfs_unlink_subvol(trans, dir, BTRFS_I(inode)->location.objectid, dentry->d_name.name, dentry->d_name.len); @@ -4643,8 +4624,8 @@ search_again: BTRFS_I(inode), leaf, fi, found_key.offset); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - item_end += btrfs_file_extent_inline_len(leaf, - path->slots[0], fi); + item_end += btrfs_file_extent_ram_bytes(leaf, + fi); trace_btrfs_truncate_show_fi_inline( BTRFS_I(inode), leaf, fi, path->slots[0], @@ -5615,9 +5596,9 @@ static void inode_tree_add(struct inode *inode) parent = *p; entry = rb_entry(parent, struct btrfs_inode, rb_node); - if (ino < btrfs_ino(BTRFS_I(&entry->vfs_inode))) + if (ino < btrfs_ino(entry)) p = &parent->rb_left; - else if (ino > btrfs_ino(BTRFS_I(&entry->vfs_inode))) + else if (ino > btrfs_ino(entry)) p = &parent->rb_right; else { WARN_ON(!(entry->vfs_inode.i_state & @@ -5708,16 +5689,21 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, int ret; ret = btrfs_read_locked_inode(inode); - if (!is_bad_inode(inode)) { + if (!ret) { inode_tree_add(inode); unlock_new_inode(inode); if (new) *new = 1; } else { - unlock_new_inode(inode); - iput(inode); - ASSERT(ret < 0); - inode = ERR_PTR(ret < 0 ? ret : -ESTALE); + iget_failed(inode); + /* + * ret > 0 can come from btrfs_search_slot called by + * btrfs_read_locked_inode, this means the inode item + * was not found. 
+ */ + if (ret > 0) + ret = -ENOENT; + inode = ERR_PTR(ret); } } @@ -5745,7 +5731,7 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; - BTRFS_I(inode)->i_otime = timespec64_to_timespec(inode->i_mtime); + BTRFS_I(inode)->i_otime = inode->i_mtime; return inode; } @@ -6027,32 +6013,6 @@ err: return ret; } -int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - int ret = 0; - bool nolock = false; - - if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) - return 0; - - if (btrfs_fs_closing(root->fs_info) && - btrfs_is_free_space_inode(BTRFS_I(inode))) - nolock = true; - - if (wbc->sync_mode == WB_SYNC_ALL) { - if (nolock) - trans = btrfs_join_transaction_nolock(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans); - } - return ret; -} - /* * This is somewhat expensive, updating the tree every time the * inode changes. But, it is most likely to find the inode in cache. 
@@ -6335,8 +6295,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, location->type = BTRFS_INODE_ITEM_KEY; ret = btrfs_insert_inode_locked(inode); - if (ret < 0) + if (ret < 0) { + iput(inode); goto fail; + } path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); @@ -6349,7 +6311,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; - BTRFS_I(inode)->i_otime = timespec64_to_timespec(inode->i_mtime); + BTRFS_I(inode)->i_otime = inode->i_mtime; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); @@ -6395,12 +6357,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, return inode; fail_unlock: - unlock_new_inode(inode); + discard_new_inode(inode); fail: if (dir && name) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path); - iput(inode); return ERR_PTR(ret); } @@ -6419,7 +6380,6 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const char *name, int name_len, int add_backref, u64 index) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); int ret = 0; struct btrfs_key key; struct btrfs_root *root = parent_inode->root; @@ -6435,7 +6395,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, } if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_add_root_ref(trans, fs_info, key.objectid, + ret = btrfs_add_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, index, name, name_len); } else if (add_backref) { @@ -6471,7 +6431,7 @@ fail_dir_item: if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { u64 local_index; int err; - err = btrfs_del_root_ref(trans, fs_info, key.objectid, + err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, &local_index, name, name_len); @@ -6505,7 +6465,6 @@ static int 
btrfs_mknod(struct inode *dir, struct dentry *dentry, struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; int err; - int drop_inode = 0; u64 objectid; u64 index = 0; @@ -6527,6 +6486,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); + inode = NULL; goto out_unlock; } @@ -6541,31 +6501,24 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - goto out_unlock_inode; + goto out_unlock; err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 0, index); - if (err) { - goto out_unlock_inode; - } else { - btrfs_update_inode(trans, root, inode); - d_instantiate_new(dentry, inode); - } + if (err) + goto out_unlock; + + btrfs_update_inode(trans, root, inode); + d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - if (drop_inode) { + if (err && inode) { inode_dec_link_count(inode); - iput(inode); + discard_new_inode(inode); } return err; - -out_unlock_inode: - drop_inode = 1; - unlock_new_inode(inode); - goto out_unlock; - } static int btrfs_create(struct inode *dir, struct dentry *dentry, @@ -6575,7 +6528,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; - int drop_inode_on_err = 0; int err; u64 objectid; u64 index = 0; @@ -6598,9 +6550,9 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); + inode = NULL; goto out_unlock; } - drop_inode_on_err = 1; /* * If the active LSM wants to access the inode during * d_instantiate it needs these. 
Smack checks to see @@ -6613,33 +6565,28 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - goto out_unlock_inode; + goto out_unlock; err = btrfs_update_inode(trans, root, inode); if (err) - goto out_unlock_inode; + goto out_unlock; err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 0, index); if (err) - goto out_unlock_inode; + goto out_unlock; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans); - if (err && drop_inode_on_err) { + if (err && inode) { inode_dec_link_count(inode); - iput(inode); + discard_new_inode(inode); } btrfs_btree_balance_dirty(fs_info); return err; - -out_unlock_inode: - unlock_new_inode(inode); - goto out_unlock; - } static int btrfs_link(struct dentry *old_dentry, struct inode *dir, @@ -6748,6 +6695,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) S_IFDIR | mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); + inode = NULL; goto out_fail; } @@ -6758,34 +6706,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - goto out_fail_inode; + goto out_fail; btrfs_i_size_write(BTRFS_I(inode), 0); err = btrfs_update_inode(trans, root, inode); if (err) - goto out_fail_inode; + goto out_fail; err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), dentry->d_name.name, dentry->d_name.len, 0, index); if (err) - goto out_fail_inode; + goto out_fail; d_instantiate_new(dentry, inode); drop_on_err = 0; out_fail: btrfs_end_transaction(trans); - if (drop_on_err) { + if (err && inode) { inode_dec_link_count(inode); - iput(inode); + discard_new_inode(inode); } btrfs_btree_balance_dirty(fs_info); return err; - -out_fail_inode: - unlock_new_inode(inode); - goto out_fail; } static noinline int uncompress_inline(struct 
btrfs_path *path, @@ -6847,7 +6791,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, size_t pg_offset, u64 start, u64 len, int create) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; int err = 0; u64 extent_start = 0; @@ -6943,7 +6887,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, extent_start); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; - size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); + + size = btrfs_file_extent_ram_bytes(leaf, item); extent_end = ALIGN(extent_start + size, fs_info->sectorsize); @@ -6994,7 +6939,7 @@ next: if (new_inline) goto out; - size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); + size = btrfs_file_extent_ram_bytes(leaf, item); extent_offset = page_offset(page) + pg_offset - extent_start; copy_size = min_t(u64, PAGE_SIZE - pg_offset, size - extent_offset); @@ -7865,7 +7810,7 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, isector >>= inode->i_sb->s_blocksize_bits; bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, pgoff, isector, repair_endio, repair_arg); - bio_set_op_attrs(bio, REQ_OP_READ, read_mode); + bio->bi_opf = REQ_OP_READ | read_mode; btrfs_debug(BTRFS_I(inode)->root->fs_info, "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d", @@ -8299,8 +8244,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, if (write && async_submit) { ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0, file_offset, inode, - btrfs_submit_bio_start_direct_io, - btrfs_submit_bio_done); + btrfs_submit_bio_start_direct_io); goto err; } else if (write) { /* @@ -9005,13 +8949,14 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); -out_unlock: if (!ret2) { btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true); sb_end_pagefault(inode->i_sb); 
extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } + +out_unlock: unlock_page(page); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0)); @@ -9443,6 +9388,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, u64 new_idx = 0; u64 root_objectid; int ret; + int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; @@ -9538,8 +9484,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; - ret = btrfs_unlink_subvol(trans, root, old_dir, - root_objectid, + ret = btrfs_unlink_subvol(trans, old_dir, root_objectid, old_dentry->d_name.name, old_dentry->d_name.len); } else { /* src is an inode */ @@ -9558,8 +9503,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { root_objectid = BTRFS_I(new_inode)->root->root_key.objectid; - ret = btrfs_unlink_subvol(trans, dest, new_dir, - root_objectid, + ret = btrfs_unlink_subvol(trans, new_dir, root_objectid, new_dentry->d_name.name, new_dentry->d_name.len); } else { /* dest is an inode */ @@ -9639,7 +9583,8 @@ out_fail: dest_log_pinned = false; } } - ret = btrfs_end_transaction(trans); + ret2 = btrfs_end_transaction(trans); + ret = ret ? 
ret : ret2; out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); @@ -9818,7 +9763,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; - ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, + ret = btrfs_unlink_subvol(trans, old_dir, root_objectid, old_dentry->d_name.name, old_dentry->d_name.len); } else { @@ -9840,8 +9785,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { root_objectid = BTRFS_I(new_inode)->location.objectid; - ret = btrfs_unlink_subvol(trans, dest, new_dir, - root_objectid, + ret = btrfs_unlink_subvol(trans, new_dir, root_objectid, new_dentry->d_name.name, new_dentry->d_name.len); BUG_ON(new_inode->i_nlink == 0); @@ -10112,7 +10056,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, struct btrfs_key key; struct inode *inode = NULL; int err; - int drop_inode = 0; u64 objectid; u64 index = 0; int name_len; @@ -10145,6 +10088,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, objectid, S_IFLNK|S_IRWXUGO, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); + inode = NULL; goto out_unlock; } @@ -10161,12 +10105,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - goto out_unlock_inode; + goto out_unlock; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; - goto out_unlock_inode; + goto out_unlock; } key.objectid = btrfs_ino(BTRFS_I(inode)); key.offset = 0; @@ -10176,7 +10120,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, datasize); if (err) { btrfs_free_path(path); - goto out_unlock_inode; + goto out_unlock; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -10208,26 +10152,19 
@@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (!err) err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 0, index); - if (err) { - drop_inode = 1; - goto out_unlock_inode; - } + if (err) + goto out_unlock; d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans); - if (drop_inode) { + if (err && inode) { inode_dec_link_count(inode); - iput(inode); + discard_new_inode(inode); } btrfs_btree_balance_dirty(fs_info); return err; - -out_unlock_inode: - drop_inode = 1; - unlock_new_inode(inode); - goto out_unlock; } static int __btrfs_prealloc_file_range(struct inode *inode, int mode, @@ -10436,14 +10373,14 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) ret = btrfs_init_inode_security(trans, inode, dir, NULL); if (ret) - goto out_inode; + goto out; ret = btrfs_update_inode(trans, root, inode); if (ret) - goto out_inode; + goto out; ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) - goto out_inode; + goto out; /* * We set number of links to 0 in btrfs_new_inode(), and here we set @@ -10453,21 +10390,15 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() */ set_nlink(inode, 1); - unlock_new_inode(inode); d_tmpfile(dentry, inode); + unlock_new_inode(inode); mark_inode_dirty(inode); - out: btrfs_end_transaction(trans); - if (ret) - iput(inode); + if (ret && inode) + discard_new_inode(inode); btrfs_btree_balance_dirty(fs_info); return ret; - -out_inode: - unlock_new_inode(inode); - goto out; - } __attribute__((const)) @@ -10476,12 +10407,6 @@ static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror) return -EAGAIN; } -static struct btrfs_fs_info *iotree_fs_info(void *private_data) -{ - struct inode *inode = private_data; - return btrfs_sb(inode->i_sb); -} - static void btrfs_check_extent_io_range(void *private_data, const char *caller, u64 start, u64 end) { @@ 
-10496,9 +10421,9 @@ static void btrfs_check_extent_io_range(void *private_data, const char *caller, } } -void btrfs_set_range_writeback(void *private_data, u64 start, u64 end) +void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { - struct inode *inode = private_data; + struct inode *inode = tree->private_data; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; struct page *page; @@ -10554,10 +10479,7 @@ static const struct extent_io_ops btrfs_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btrfs_submit_bio_hook, .readpage_end_io_hook = btrfs_readpage_end_io_hook, - .merge_bio_hook = btrfs_merge_bio_hook, .readpage_io_failed_hook = btrfs_readpage_io_failed_hook, - .tree_fs_info = iotree_fs_info, - .set_range_writeback = btrfs_set_range_writeback, /* optional callbacks */ .fill_delalloc = run_delalloc_range, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c2837a32d689..d3a5d2a41e5f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5,23 +5,18 @@ #include <linux/kernel.h> #include <linux/bio.h> -#include <linux/buffer_head.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> -#include <linux/init.h> #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mount.h> -#include <linux/mpage.h> #include <linux/namei.h> -#include <linux/swap.h> #include <linux/writeback.h> #include <linux/compat.h> -#include <linux/bit_spinlock.h> #include <linux/security.h> #include <linux/xattr.h> #include <linux/mm.h> @@ -606,7 +601,7 @@ static noinline int create_subvol(struct inode *dir, trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; - ret = btrfs_qgroup_inherit(trans, fs_info, 0, objectid, inherit); + ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); if (ret) goto fail; @@ -616,14 +611,6 @@ static noinline int create_subvol(struct inode *dir, 
goto fail; } - memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header)); - btrfs_set_header_bytenr(leaf, leaf->start); - btrfs_set_header_generation(leaf, trans->transid); - btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(leaf, objectid); - - write_extent_buffer_fsid(leaf, fs_info->fsid); - write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid); btrfs_mark_buffer_dirty(leaf); inode_item = &root_item->inode; @@ -711,8 +698,7 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_update_inode(trans, root, dir); BUG_ON(ret); - ret = btrfs_add_root_ref(trans, fs_info, - objectid, root->root_key.objectid, + ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, btrfs_ino(BTRFS_I(dir)), index, name, namelen); BUG_ON(ret); @@ -2507,8 +2493,8 @@ out: static noinline int btrfs_ioctl_ino_lookup(struct file *file, void __user *argp) { - struct btrfs_ioctl_ino_lookup_args *args; - struct inode *inode; + struct btrfs_ioctl_ino_lookup_args *args; + struct inode *inode; int ret = 0; args = memdup_user(argp, sizeof(*args)); @@ -2941,8 +2927,14 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = btrfs_defrag_root(root); break; case S_IFREG: - if (!(file->f_mode & FMODE_WRITE)) { - ret = -EINVAL; + /* + * Note that this does not check the file descriptor for write + * access. This prevents defragmenting executables that are + * running and allows defrag on files open in read-only mode. 
+ */ + if (!capable(CAP_SYS_ADMIN) && + inode_permission(inode, MAY_WRITE)) { + ret = -EPERM; goto out; } @@ -3165,10 +3157,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); if (dev->name) { - struct rcu_string *name; - - name = rcu_dereference(dev->name); - strncpy(di_args->path, name->str, sizeof(di_args->path) - 1); + strncpy(di_args->path, rcu_str_deref(dev->name), + sizeof(di_args->path) - 1); di_args->path[sizeof(di_args->path) - 1] = 0; } else { di_args->path[0] = '\0'; @@ -3327,11 +3317,13 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp) if (pg) { unlock_page(pg); put_page(pg); + cmp->src_pages[i] = NULL; } pg = cmp->dst_pages[i]; if (pg) { unlock_page(pg); put_page(pg); + cmp->dst_pages[i] = NULL; } } } @@ -3577,7 +3569,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, dst, dst_loff, &cmp); if (ret) - goto out_unlock; + goto out_free; loff += BTRFS_MAX_DEDUPE_LEN; dst_loff += BTRFS_MAX_DEDUPE_LEN; @@ -3587,16 +3579,16 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff, &cmp); +out_free: + kvfree(cmp.src_pages); + kvfree(cmp.dst_pages); + out_unlock: if (same_inode) inode_unlock(src); else btrfs_double_inode_unlock(src, dst); -out_free: - kvfree(cmp.src_pages); - kvfree(cmp.dst_pages); - return ret; } @@ -5116,9 +5108,7 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_quota_ctl_args *sa; - struct btrfs_trans_handle *trans = NULL; int ret; - int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -5134,28 +5124,19 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) } 
down_write(&fs_info->subvol_sem); - trans = btrfs_start_transaction(fs_info->tree_root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } switch (sa->cmd) { case BTRFS_QUOTA_CTL_ENABLE: - ret = btrfs_quota_enable(trans, fs_info); + ret = btrfs_quota_enable(fs_info); break; case BTRFS_QUOTA_CTL_DISABLE: - ret = btrfs_quota_disable(trans, fs_info); + ret = btrfs_quota_disable(fs_info); break; default: ret = -EINVAL; break; } - err = btrfs_commit_transaction(trans); - if (err && !ret) - ret = err; -out: kfree(sa); up_write(&fs_info->subvol_sem); drop_write: @@ -5193,15 +5174,13 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) } if (sa->assign) { - ret = btrfs_add_qgroup_relation(trans, fs_info, - sa->src, sa->dst); + ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst); } else { - ret = btrfs_del_qgroup_relation(trans, fs_info, - sa->src, sa->dst); + ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst); } /* update qgroup status and info */ - err = btrfs_run_qgroups(trans, fs_info); + err = btrfs_run_qgroups(trans); if (err < 0) btrfs_handle_fs_error(fs_info, err, "failed to update qgroup status and info"); @@ -5219,7 +5198,6 @@ drop_write: static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_create_args *sa; struct btrfs_trans_handle *trans; @@ -5251,9 +5229,9 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) } if (sa->create) { - ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid); + ret = btrfs_create_qgroup(trans, sa->qgroupid); } else { - ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid); + ret = btrfs_remove_qgroup(trans, sa->qgroupid); } err = btrfs_end_transaction(trans); @@ -5270,7 +5248,6 @@ drop_write: static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) 
{ struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_limit_args *sa; struct btrfs_trans_handle *trans; @@ -5303,7 +5280,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) qgroupid = root->root_key.objectid; } - ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim); + ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim); err = btrfs_end_transaction(trans); if (err && !ret) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2e1a1694a33d..0c4ef208b8b9 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -6,7 +6,6 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/writeback.h> -#include <linux/pagevec.h> #include "ctree.h" #include "transaction.h" #include "btrfs_inode.h" @@ -421,129 +420,6 @@ out: return ret == 0; } -/* Needs to either be called under a log transaction or the log_mutex */ -void btrfs_get_logged_extents(struct btrfs_inode *inode, - struct list_head *logged_list, - const loff_t start, - const loff_t end) -{ - struct btrfs_ordered_inode_tree *tree; - struct btrfs_ordered_extent *ordered; - struct rb_node *n; - struct rb_node *prev; - - tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); - n = __tree_search(&tree->tree, end, &prev); - if (!n) - n = prev; - for (; n; n = rb_prev(n)) { - ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); - if (ordered->file_offset > end) - continue; - if (entry_end(ordered) <= start) - break; - if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) - continue; - list_add(&ordered->log_list, logged_list); - refcount_inc(&ordered->refs); - } - spin_unlock_irq(&tree->lock); -} - -void btrfs_put_logged_extents(struct list_head *logged_list) -{ - struct btrfs_ordered_extent *ordered; - - while (!list_empty(logged_list)) { - ordered = list_first_entry(logged_list, - struct btrfs_ordered_extent, - 
log_list); - list_del_init(&ordered->log_list); - btrfs_put_ordered_extent(ordered); - } -} - -void btrfs_submit_logged_extents(struct list_head *logged_list, - struct btrfs_root *log) -{ - int index = log->log_transid % 2; - - spin_lock_irq(&log->log_extents_lock[index]); - list_splice_tail(logged_list, &log->logged_list[index]); - spin_unlock_irq(&log->log_extents_lock[index]); -} - -void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *log, u64 transid) -{ - struct btrfs_ordered_extent *ordered; - int index = transid % 2; - - spin_lock_irq(&log->log_extents_lock[index]); - while (!list_empty(&log->logged_list[index])) { - struct inode *inode; - ordered = list_first_entry(&log->logged_list[index], - struct btrfs_ordered_extent, - log_list); - list_del_init(&ordered->log_list); - inode = ordered->inode; - spin_unlock_irq(&log->log_extents_lock[index]); - - if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && - !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { - u64 start = ordered->file_offset; - u64 end = ordered->file_offset + ordered->len - 1; - - WARN_ON(!inode); - filemap_fdatawrite_range(inode->i_mapping, start, end); - } - wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, - &ordered->flags)); - - /* - * In order to keep us from losing our ordered extent - * information when committing the transaction we have to make - * sure that any logged extents are completed when we go to - * commit the transaction. To do this we simply increase the - * current transactions pending_ordered counter and decrement it - * when the ordered extent completes. 
- */ - if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { - struct btrfs_ordered_inode_tree *tree; - - tree = &BTRFS_I(inode)->ordered_tree; - spin_lock_irq(&tree->lock); - if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { - set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); - atomic_inc(&trans->transaction->pending_ordered); - } - spin_unlock_irq(&tree->lock); - } - btrfs_put_ordered_extent(ordered); - spin_lock_irq(&log->log_extents_lock[index]); - } - spin_unlock_irq(&log->log_extents_lock[index]); -} - -void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid) -{ - struct btrfs_ordered_extent *ordered; - int index = transid % 2; - - spin_lock_irq(&log->log_extents_lock[index]); - while (!list_empty(&log->logged_list[index])) { - ordered = list_first_entry(&log->logged_list[index], - struct btrfs_ordered_extent, - log_list); - list_del_init(&ordered->log_list); - spin_unlock_irq(&log->log_extents_lock[index]); - btrfs_put_ordered_extent(ordered); - spin_lock_irq(&log->log_extents_lock[index]); - } - spin_unlock_irq(&log->log_extents_lock[index]); -} - /* * used to drop a reference on an ordered extent. This will free * the extent if the last reference is dropped @@ -913,20 +789,6 @@ out: return entry; } -bool btrfs_have_ordered_extents_in_range(struct inode *inode, - u64 file_offset, - u64 len) -{ - struct btrfs_ordered_extent *oe; - - oe = btrfs_lookup_ordered_range(BTRFS_I(inode), file_offset, len); - if (oe) { - btrfs_put_ordered_extent(oe); - return true; - } - return false; -} - /* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 3be443fb3001..02d813aaa261 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -54,15 +54,11 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent * has done its due diligence in updating * the isize. 
*/ -#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered - ordered extent */ -#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ +#define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent */ -#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent - * in the logging code. */ -#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to +#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to * complete in the current transaction. */ -#define BTRFS_ORDERED_REGULAR 12 /* Regular IO for COW */ +#define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */ struct btrfs_ordered_extent { /* logical offset in the file */ @@ -182,9 +178,6 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len); -bool btrfs_have_ordered_extents_in_range(struct inode *inode, - u64 file_offset, - u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, @@ -193,16 +186,6 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); -void btrfs_get_logged_extents(struct btrfs_inode *inode, - struct list_head *logged_list, - const loff_t start, - const loff_t end); -void btrfs_put_logged_extents(struct list_head *logged_list); -void btrfs_submit_logged_extents(struct list_head *logged_list, - struct btrfs_root *log); -void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *log, u64 transid); -void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); void __cold ordered_data_exit(void); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c 
index a4e11cf04671..df49931ffe92 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -52,17 +52,9 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) u64 offset; int ref_index = 0; - if (item_size < sizeof(*ei)) { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - struct btrfs_extent_item_v0 *ei0; - BUG_ON(item_size != sizeof(*ei0)); - ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0); - pr_info("\t\textent refs %u\n", - btrfs_extent_refs_v0(eb, ei0)); - return; -#else - BUG(); -#endif + if (unlikely(item_size < sizeof(*ei))) { + btrfs_print_v0_err(eb->fs_info); + btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); } ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); @@ -133,20 +125,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) WARN_ON(ptr > end); } -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static void print_extent_ref_v0(struct extent_buffer *eb, int slot) -{ - struct btrfs_extent_ref_v0 *ref0; - - ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); - printk("\t\textent back ref root %llu gen %llu owner %llu num_refs %lu\n", - btrfs_ref_root_v0(eb, ref0), - btrfs_ref_generation_v0(eb, ref0), - btrfs_ref_objectid_v0(eb, ref0), - (unsigned long)btrfs_ref_count_v0(eb, ref0)); -} -#endif - static void print_uuid_item(struct extent_buffer *l, unsigned long offset, u32 item_size) { @@ -267,8 +245,8 @@ void btrfs_print_leaf(struct extent_buffer *l) struct btrfs_file_extent_item); if (btrfs_file_extent_type(l, fi) == BTRFS_FILE_EXTENT_INLINE) { - pr_info("\t\tinline extent data size %u\n", - btrfs_file_extent_inline_len(l, i, fi)); + pr_info("\t\tinline extent data size %llu\n", + btrfs_file_extent_ram_bytes(l, fi)); break; } pr_info("\t\textent data disk bytenr %llu nr %llu\n", @@ -280,11 +258,8 @@ void btrfs_print_leaf(struct extent_buffer *l) btrfs_file_extent_ram_bytes(l, fi)); break; case BTRFS_EXTENT_REF_V0_KEY: -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - print_extent_ref_v0(l, i); -#else - 
BUG(); -#endif + btrfs_print_v0_err(fs_info); + btrfs_handle_fs_error(fs_info, -EINVAL, NULL); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1874a6d2e6f5..4353bb69bb86 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -530,11 +530,11 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) fs_info->qgroup_ulist = NULL; } -static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, - struct btrfs_root *quota_root, - u64 src, u64 dst) +static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, + u64 dst) { int ret; + struct btrfs_root *quota_root = trans->fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; @@ -554,11 +554,11 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, return ret; } -static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, - struct btrfs_root *quota_root, - u64 src, u64 dst) +static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, + u64 dst) { int ret; + struct btrfs_root *quota_root = trans->fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; @@ -653,10 +653,10 @@ out: return ret; } -static int del_qgroup_item(struct btrfs_trans_handle *trans, - struct btrfs_root *quota_root, u64 qgroupid) +static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) { int ret; + struct btrfs_root *quota_root = trans->fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; @@ -700,9 +700,9 @@ out: } static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_qgroup *qgroup) { + struct btrfs_root *quota_root = trans->fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *l; @@ -718,7 +718,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_search_slot(trans, root, &key, path, 0, 
1); + ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; @@ -742,9 +742,10 @@ out: } static int update_qgroup_info_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_qgroup *qgroup) { + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *l; @@ -752,7 +753,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; - if (btrfs_is_testing(root->fs_info)) + if (btrfs_is_testing(fs_info)) return 0; key.objectid = 0; @@ -763,7 +764,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; @@ -786,10 +787,10 @@ out: return ret; } -static int update_qgroup_status_item(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_root *root) +static int update_qgroup_status_item(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *l; @@ -805,7 +806,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; @@ -875,8 +876,7 @@ out: return ret; } -int btrfs_quota_enable(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_quota_enable(struct btrfs_fs_info *fs_info) { struct btrfs_root *quota_root; struct btrfs_root *tree_root = fs_info->tree_root; @@ -886,6 +886,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_key found_key; struct 
btrfs_qgroup *qgroup = NULL; + struct btrfs_trans_handle *trans = NULL; int ret = 0; int slot; @@ -893,9 +894,25 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, if (fs_info->quota_root) goto out; + /* + * 1 for quota root item + * 1 for BTRFS_QGROUP_STATUS item + * + * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items + * per subvolume. However those are not currently reserved since it + * would be a lot of overkill. + */ + trans = btrfs_start_transaction(tree_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); if (!fs_info->qgroup_ulist) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -906,12 +923,14 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, BTRFS_QUOTA_TREE_OBJECTID); if (IS_ERR(quota_root)) { ret = PTR_ERR(quota_root); + btrfs_abort_transaction(trans, ret); goto out; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out_free_root; } @@ -921,8 +940,10 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*ptr)); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out_free_path; + } leaf = path->nodes[0]; ptr = btrfs_item_ptr(leaf, path->slots[0], @@ -944,9 +965,10 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); if (ret > 0) goto out_add_root; - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out_free_path; - + } while (1) { slot = path->slots[0]; @@ -956,18 +978,23 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, if (found_key.type == BTRFS_ROOT_REF_KEY) { ret = add_qgroup_item(trans, quota_root, found_key.offset); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out_free_path; + } qgroup = add_qgroup_rb(fs_info, found_key.offset); if 
(IS_ERR(qgroup)) { ret = PTR_ERR(qgroup); + btrfs_abort_transaction(trans, ret); goto out_free_path; } } ret = btrfs_next_item(tree_root, path); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto out_free_path; + } if (ret) break; } @@ -975,18 +1002,28 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, out_add_root: btrfs_release_path(path); ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out_free_path; + } qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); if (IS_ERR(qgroup)) { ret = PTR_ERR(qgroup); + btrfs_abort_transaction(trans, ret); goto out_free_path; } spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); spin_unlock(&fs_info->qgroup_lock); + + ret = btrfs_commit_transaction(trans); + if (ret) { + trans = NULL; + goto out_free_path; + } + ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); @@ -1006,20 +1043,35 @@ out: if (ret) { ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; + if (trans) + btrfs_end_transaction(trans); } mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } -int btrfs_quota_disable(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_quota_disable(struct btrfs_fs_info *fs_info) { struct btrfs_root *quota_root; + struct btrfs_trans_handle *trans = NULL; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; + + /* + * 1 For the root item + * + * We should also reserve enough items for the quota tree deletion in + * btrfs_clean_quota_tree but this is not done. 
+ */ + trans = btrfs_start_transaction(fs_info->tree_root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); @@ -1031,12 +1083,16 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, btrfs_free_qgroup_config(fs_info); ret = btrfs_clean_quota_tree(trans, quota_root); - if (ret) - goto out; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto end_trans; + } - ret = btrfs_del_root(trans, fs_info, &quota_root->root_key); - if (ret) - goto out; + ret = btrfs_del_root(trans, &quota_root->root_key); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto end_trans; + } list_del(&quota_root->dirty_list); @@ -1048,6 +1104,9 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, free_extent_buffer(quota_root->node); free_extent_buffer(quota_root->commit_root); kfree(quota_root); + +end_trans: + ret = btrfs_end_transaction(trans); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; @@ -1177,9 +1236,10 @@ out: return ret; } -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 src, u64 dst) +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, + u64 dst) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; @@ -1216,13 +1276,13 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, } } - ret = add_qgroup_relation_item(trans, quota_root, src, dst); + ret = add_qgroup_relation_item(trans, src, dst); if (ret) goto out; - ret = add_qgroup_relation_item(trans, quota_root, dst, src); + ret = add_qgroup_relation_item(trans, dst, src); if (ret) { - del_qgroup_relation_item(trans, quota_root, src, dst); + del_qgroup_relation_item(trans, src, dst); goto out; } @@ -1240,9 +1300,10 @@ out: return ret; } -static int __del_qgroup_relation(struct 
btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 src, u64 dst) +static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, + u64 dst) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; @@ -1276,8 +1337,8 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, ret = -ENOENT; goto out; exist: - ret = del_qgroup_relation_item(trans, quota_root, src, dst); - err = del_qgroup_relation_item(trans, quota_root, dst, src); + ret = del_qgroup_relation_item(trans, src, dst); + err = del_qgroup_relation_item(trans, dst, src); if (err && !ret) ret = err; @@ -1290,21 +1351,22 @@ out: return ret; } -int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 src, u64 dst) +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, + u64 dst) { + struct btrfs_fs_info *fs_info = trans->fs_info; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); - ret = __del_qgroup_relation(trans, fs_info, src, dst); + ret = __del_qgroup_relation(trans, src, dst); mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } -int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid) +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; int ret = 0; @@ -1336,9 +1398,9 @@ out: return ret; } -int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid) +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_qgroup_list *list; @@ -1362,16 +1424,15 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, goto out; } } - ret = del_qgroup_item(trans, 
quota_root, qgroupid); + ret = del_qgroup_item(trans, qgroupid); if (ret && ret != -ENOENT) goto out; while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, struct btrfs_qgroup_list, next_group); - ret = __del_qgroup_relation(trans, fs_info, - qgroupid, - list->group->qgroupid); + ret = __del_qgroup_relation(trans, qgroupid, + list->group->qgroupid); if (ret) goto out; } @@ -1384,10 +1445,10 @@ out: return ret; } -int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; int ret = 0; @@ -1451,7 +1512,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, spin_unlock(&fs_info->qgroup_lock); - ret = update_qgroup_limit_item(trans, quota_root, qgroup); + ret = update_qgroup_limit_item(trans, qgroup); if (ret) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; btrfs_info(fs_info, "unable to update quota limit for %llu", @@ -1519,10 +1580,10 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, - gfp_t gfp_flag) +int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes, gfp_t gfp_flag) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; int ret; @@ -1530,8 +1591,6 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || bytenr == 0 || num_bytes == 0) return 0; - if (WARN_ON(trans == NULL)) - return -EINVAL; record = kmalloc(sizeof(*record), gfp_flag); if (!record) return -ENOMEM; @@ -1552,9 +1611,9 @@ int 
btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, } int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = trans->fs_info; int nr = btrfs_header_nritems(eb); int i, extent_type, ret; struct btrfs_key key; @@ -1584,8 +1643,8 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - ret = btrfs_qgroup_trace_extent(trans, fs_info, bytenr, - num_bytes, GFP_NOFS); + ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes, + GFP_NOFS); if (ret) return ret; } @@ -1655,11 +1714,10 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level) } int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *root_eb, u64 root_gen, int root_level) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; int ret = 0; int level; struct extent_buffer *eb = root_eb; @@ -1678,7 +1736,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, } if (root_level == 0) { - ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, root_eb); + ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); goto out; } @@ -1736,8 +1794,7 @@ walk_down: btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; - ret = btrfs_qgroup_trace_extent(trans, fs_info, - child_bytenr, + ret = btrfs_qgroup_trace_extent(trans, child_bytenr, fs_info->nodesize, GFP_NOFS); if (ret) @@ -1745,8 +1802,8 @@ walk_down: } if (level == 0) { - ret = btrfs_qgroup_trace_leaf_items(trans,fs_info, - path->nodes[level]); + ret = btrfs_qgroup_trace_leaf_items(trans, + path->nodes[level]); if (ret) goto out; @@ -1981,12 +2038,11 @@ static int maybe_fs_roots(struct ulist *roots) return is_fstree(unode->val); } -int -btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info 
*fs_info, - u64 bytenr, u64 num_bytes, - struct ulist *old_roots, struct ulist *new_roots) +int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes, struct ulist *old_roots, + struct ulist *new_roots) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct ulist *qgroups = NULL; struct ulist *tmp = NULL; u64 seq; @@ -2116,9 +2172,10 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) ulist_del(record->old_roots, qgroup_to_skip, 0); } - ret = btrfs_qgroup_account_extent(trans, fs_info, - record->bytenr, record->num_bytes, - record->old_roots, new_roots); + ret = btrfs_qgroup_account_extent(trans, record->bytenr, + record->num_bytes, + record->old_roots, + new_roots); record->old_roots = NULL; new_roots = NULL; } @@ -2136,9 +2193,9 @@ cleanup: /* * called from commit_transaction. Writes all changed qgroups to disk. */ -int btrfs_run_qgroups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_run_qgroups(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root = fs_info->quota_root; int ret = 0; @@ -2152,11 +2209,11 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, struct btrfs_qgroup, dirty); list_del_init(&qgroup->dirty); spin_unlock(&fs_info->qgroup_lock); - ret = update_qgroup_info_item(trans, quota_root, qgroup); + ret = update_qgroup_info_item(trans, qgroup); if (ret) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - ret = update_qgroup_limit_item(trans, quota_root, qgroup); + ret = update_qgroup_limit_item(trans, qgroup); if (ret) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -2168,7 +2225,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; spin_unlock(&fs_info->qgroup_lock); - ret = update_qgroup_status_item(trans, fs_info, quota_root); + ret = update_qgroup_status_item(trans); if (ret) fs_info->qgroup_flags |= 
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -2181,13 +2238,13 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, * cause a transaction abort so we take extra care here to only error * when a readonly fs is a reasonable outcome. */ -int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, - struct btrfs_qgroup_inherit *inherit) +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, + u64 objectid, struct btrfs_qgroup_inherit *inherit) { int ret = 0; int i; u64 *i_qgroups; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; @@ -2229,22 +2286,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, if (ret) goto out; - if (srcid) { - struct btrfs_root *srcroot; - struct btrfs_key srckey; - - srckey.objectid = srcid; - srckey.type = BTRFS_ROOT_ITEM_KEY; - srckey.offset = (u64)-1; - srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey); - if (IS_ERR(srcroot)) { - ret = PTR_ERR(srcroot); - goto out; - } - - level_size = fs_info->nodesize; - } - /* * add qgroup to all inherited groups */ @@ -2253,12 +2294,12 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { if (*i_qgroups == 0) continue; - ret = add_qgroup_relation_item(trans, quota_root, - objectid, *i_qgroups); + ret = add_qgroup_relation_item(trans, objectid, + *i_qgroups); if (ret && ret != -EEXIST) goto out; - ret = add_qgroup_relation_item(trans, quota_root, - *i_qgroups, objectid); + ret = add_qgroup_relation_item(trans, *i_qgroups, + objectid); if (ret && ret != -EEXIST) goto out; } @@ -2281,7 +2322,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, dstgroup->rsv_rfer = inherit->lim.rsv_rfer; dstgroup->rsv_excl = inherit->lim.rsv_excl; - ret = update_qgroup_limit_item(trans, quota_root, dstgroup); + ret = 
update_qgroup_limit_item(trans, dstgroup); if (ret) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; btrfs_info(fs_info, @@ -2301,6 +2342,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, * our counts don't go crazy, so at this point the only * difference between the two roots should be the root node. */ + level_size = fs_info->nodesize; dstgroup->rfer = srcgroup->rfer; dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; dstgroup->excl = level_size; @@ -2598,10 +2640,10 @@ static bool is_last_leaf(struct btrfs_path *path) * returns < 0 on error, 0 when more leafs are to be scanned. * returns 1 when done. */ -static int -qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans) +static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, + struct btrfs_path *path) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL; struct ulist *roots = NULL; @@ -2669,8 +2711,8 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ - ret = btrfs_qgroup_account_extent(trans, fs_info, - found.objectid, num_bytes, NULL, roots); + ret = btrfs_qgroup_account_extent(trans, found.objectid, + num_bytes, NULL, roots); if (ret < 0) goto out; } @@ -2680,8 +2722,10 @@ out: free_extent_buffer(scratch_leaf); } - if (done && !ret) + if (done && !ret) { ret = 1; + fs_info->qgroup_rescan_progress.objectid = (u64)-1; + } return ret; } @@ -2714,7 +2758,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { err = -EINTR; } else { - err = qgroup_rescan_leaf(fs_info, path, trans); + err = qgroup_rescan_leaf(trans, path); } if (err > 0) btrfs_commit_transaction(trans); @@ -2749,7 +2793,7 @@ out: err); goto done; } - ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root); + ret = 
update_qgroup_status_item(trans); if (ret < 0) { err = ret; btrfs_err(fs_info, "fail to update qgroup status: %d", err); @@ -2784,13 +2828,20 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, if (!init_flags) { /* we're resuming qgroup rescan at mount time */ - if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)) + if (!(fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { btrfs_warn(fs_info, "qgroup rescan init failed, qgroup is not enabled"); - else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) + ret = -EINVAL; + } else if (!(fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAG_ON)) { btrfs_warn(fs_info, "qgroup rescan init failed, qgroup rescan is not queued"); - return -EINVAL; + ret = -EINVAL; + } + + if (ret) + return ret; } mutex_lock(&fs_info->qgroup_rescan_lock); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index d60dd06445ce..54b8bb282c0e 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -141,24 +141,19 @@ struct btrfs_qgroup { #define QGROUP_RELEASE (1<<1) #define QGROUP_FREE (1<<2) -int btrfs_quota_enable(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_quota_disable(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_quota_enable(struct btrfs_fs_info *fs_info); +int btrfs_quota_disable(struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, bool interruptible); -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 src, u64 dst); -int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 src, u64 dst); -int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid); -int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, - struct 
btrfs_fs_info *fs_info, u64 qgroupid); -int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, + u64 dst); +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, + u64 dst); +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit); int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); @@ -217,9 +212,8 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info, * Return <0 for error, like memory allocation failure or invalid parameter * (NULL trans) */ -int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, - gfp_t gfp_flag); +int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes, gfp_t gfp_flag); /* * Inform qgroup to trace all leaf items of data @@ -228,7 +222,6 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, * Return <0 for error(ENOMEM) */ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct extent_buffer *eb); /* * Inform qgroup to trace a whole subtree, including all its child tree @@ -241,20 +234,15 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, * Return <0 for error(ENOMEM or tree search error) */ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *root_eb, u64 root_gen, int root_level); -int -btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - u64 bytenr, u64 num_bytes, - struct ulist *old_roots, struct ulist *new_roots); +int 
btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes, struct ulist *old_roots, + struct ulist *new_roots); int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans); -int btrfs_run_qgroups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, - struct btrfs_qgroup_inherit *inherit); +int btrfs_run_qgroups(struct btrfs_trans_handle *trans); +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, + u64 objectid, struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5e4ad134b9ad..df41d7049936 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -5,32 +5,19 @@ */ #include <linux/sched.h> -#include <linux/wait.h> #include <linux/bio.h> #include <linux/slab.h> -#include <linux/buffer_head.h> #include <linux/blkdev.h> -#include <linux/random.h> -#include <linux/iocontext.h> -#include <linux/capability.h> -#include <linux/ratelimit.h> -#include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/hash.h> #include <linux/list_sort.h> #include <linux/raid/xor.h> #include <linux/mm.h> -#include <asm/div64.h> #include "ctree.h" -#include "extent_map.h" #include "disk-io.h" -#include "transaction.h" -#include "print-tree.h" #include "volumes.h" #include "raid56.h" #include "async-thread.h" -#include "check-integrity.h" -#include "rcu-string.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 @@ -175,8 +162,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); static void rmw_work(struct btrfs_work *work); static void read_rebuild_work(struct btrfs_work *work); -static void 
async_rmw_stripe(struct btrfs_raid_bio *rbio); -static void async_read_rebuild(struct btrfs_raid_bio *rbio); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void __free_raid_bio(struct btrfs_raid_bio *rbio); @@ -185,7 +170,13 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); -static void async_scrub_parity(struct btrfs_raid_bio *rbio); +static void scrub_parity_work(struct btrfs_work *work); + +static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) +{ + btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL); + btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); +} /* * the stripe hash table is used for locking, and to collect @@ -260,7 +251,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) s = kmap(rbio->bio_pages[i]); d = kmap(rbio->stripe_pages[i]); - memcpy(d, s, PAGE_SIZE); + copy_page(d, s); kunmap(rbio->bio_pages[i]); kunmap(rbio->stripe_pages[i]); @@ -516,32 +507,21 @@ static void run_xor(void **pages, int src_cnt, ssize_t len) } /* - * returns true if the bio list inside this rbio - * covers an entire stripe (no rmw required). - * Must be called with the bio list lock held, or - * at a time when you know it is impossible to add - * new bios into the list + * Returns true if the bio list inside this rbio covers an entire stripe (no + * rmw required). 
*/ -static int __rbio_is_full(struct btrfs_raid_bio *rbio) +static int rbio_is_full(struct btrfs_raid_bio *rbio) { + unsigned long flags; unsigned long size = rbio->bio_list_bytes; int ret = 1; + spin_lock_irqsave(&rbio->bio_list_lock, flags); if (size != rbio->nr_data * rbio->stripe_len) ret = 0; - BUG_ON(size > rbio->nr_data * rbio->stripe_len); - return ret; -} - -static int rbio_is_full(struct btrfs_raid_bio *rbio) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&rbio->bio_list_lock, flags); - ret = __rbio_is_full(rbio); spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + return ret; } @@ -812,16 +792,16 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) spin_unlock_irqrestore(&h->lock, flags); if (next->operation == BTRFS_RBIO_READ_REBUILD) - async_read_rebuild(next); + start_async_work(next, read_rebuild_work); else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { steal_rbio(rbio, next); - async_read_rebuild(next); + start_async_work(next, read_rebuild_work); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); - async_rmw_stripe(next); + start_async_work(next, rmw_work); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { steal_rbio(rbio, next); - async_scrub_parity(next); + start_async_work(next, scrub_parity_work); } goto done_nolock; @@ -1275,7 +1255,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) pointers); } else { /* raid5 */ - memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); + copy_page(pointers[nr_data], pointers[0]); run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); } @@ -1343,7 +1323,7 @@ write_data: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE; submit_bio(bio); } @@ -1508,20 +1488,6 @@ cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); } -static void async_rmw_stripe(struct btrfs_raid_bio *rbio) -{ - btrfs_init_work(&rbio->work, btrfs_rmw_helper, rmw_work, NULL, NULL); - 
btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); -} - -static void async_read_rebuild(struct btrfs_raid_bio *rbio) -{ - btrfs_init_work(&rbio->work, btrfs_rmw_helper, - read_rebuild_work, NULL, NULL); - - btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); -} - /* * the stripe must be locked by the caller. It will * unlock after all the writes are done @@ -1599,7 +1565,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid_rmw_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_opf = REQ_OP_READ; btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); @@ -1652,7 +1618,7 @@ static int partial_stripe_write(struct btrfs_raid_bio *rbio) ret = lock_stripe_add(rbio); if (ret == 0) - async_rmw_stripe(rbio); + start_async_work(rbio, rmw_work); return 0; } @@ -1720,8 +1686,11 @@ static void run_plug(struct btrfs_plug_cb *plug) list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { + int ret; + /* we have a full stripe, send it down */ - full_stripe_write(cur); + ret = full_stripe_write(cur); + BUG_ON(ret); continue; } if (last) { @@ -1941,9 +1910,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) BUG_ON(failb != -1); pstripe: /* Copy parity block into failed block to start with */ - memcpy(pointers[faila], - pointers[rbio->nr_data], - PAGE_SIZE); + copy_page(pointers[faila], pointers[rbio->nr_data]); /* rearrange the pointer array */ p = pointers[faila]; @@ -2145,7 +2112,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid_recover_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_opf = REQ_OP_READ; btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); @@ -2448,7 +2415,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, pointers); } else { /* raid5 */ - memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); + copy_page(pointers[nr_data], pointers[0]); run_xor(pointers + 1, 
nr_data - 1, PAGE_SIZE); } @@ -2456,7 +2423,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); parity = kmap(p); if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) - memcpy(parity, pointers[rbio->scrubp], PAGE_SIZE); + copy_page(parity, pointers[rbio->scrubp]); else /* Parity is right, needn't writeback */ bitmap_clear(rbio->dbitmap, pagenr, 1); @@ -2517,7 +2484,7 @@ submit_write: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE; submit_bio(bio); } @@ -2699,7 +2666,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid56_parity_scrub_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_opf = REQ_OP_READ; btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); @@ -2728,18 +2695,10 @@ static void scrub_parity_work(struct btrfs_work *work) raid56_parity_scrub_stripe(rbio); } -static void async_scrub_parity(struct btrfs_raid_bio *rbio) -{ - btrfs_init_work(&rbio->work, btrfs_rmw_helper, - scrub_parity_work, NULL, NULL); - - btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); -} - void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) { if (!lock_stripe_add(rbio)) - async_scrub_parity(rbio); + start_async_work(rbio, scrub_parity_work); } /* The following code is used for dev replace of a missing RAID 5/6 device. 
*/ @@ -2781,5 +2740,5 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) { if (!lock_stripe_add(rbio)) - async_read_rebuild(rbio); + start_async_work(rbio, read_rebuild_work); } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 40f1bcef394d..dec14b739b10 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -7,7 +7,6 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/blkdev.h> -#include <linux/rbtree.h> #include <linux/slab.h> #include <linux/workqueue.h> #include "ctree.h" @@ -355,7 +354,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, dev = bbio->stripes[nzones].dev; /* cannot read ahead on missing device. */ - if (!dev->bdev) + if (!dev->bdev) continue; zone = reada_find_zone(dev, logical, bbio); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 879b76fa881a..8783a1776540 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -586,29 +586,6 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, return btrfs_get_fs_root(fs_info, &key, false); } -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static noinline_for_stack -struct btrfs_root *find_tree_root(struct reloc_control *rc, - struct extent_buffer *leaf, - struct btrfs_extent_ref_v0 *ref0) -{ - struct btrfs_root *root; - u64 root_objectid = btrfs_ref_root_v0(leaf, ref0); - u64 generation = btrfs_ref_generation_v0(leaf, ref0); - - BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID); - - root = read_fs_root(rc->extent_root->fs_info, root_objectid); - BUG_ON(IS_ERR(root)); - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && - generation != btrfs_root_generation(&root->root_item)) - return NULL; - - return root; -} -#endif - static noinline_for_stack int find_inline_backref(struct extent_buffer *leaf, int slot, unsigned long *ptr, unsigned long *end) @@ -621,12 +598,11 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, 
btrfs_item_key_to_cpu(leaf, &key, slot); item_size = btrfs_item_size_nr(leaf, slot); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 if (item_size < sizeof(*ei)) { - WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + btrfs_print_v0_err(leaf->fs_info); + btrfs_handle_fs_error(leaf->fs_info, -EINVAL, NULL); return 1; } -#endif ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); WARN_ON(!(btrfs_extent_flags(leaf, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK)); @@ -792,7 +768,7 @@ again: type = btrfs_get_extent_inline_ref_type(eb, iref, BTRFS_REF_TYPE_BLOCK); if (type == BTRFS_REF_TYPE_INVALID) { - err = -EINVAL; + err = -EUCLEAN; goto out; } key.type = type; @@ -811,29 +787,7 @@ again: goto next; } -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || - key.type == BTRFS_EXTENT_REF_V0_KEY) { - if (key.type == BTRFS_EXTENT_REF_V0_KEY) { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(eb, path1->slots[0], - struct btrfs_extent_ref_v0); - if (key.objectid == key.offset) { - root = find_tree_root(rc, eb, ref0); - if (root && !should_ignore_root(root)) - cur->root = root; - else - list_add(&cur->list, &useless); - break; - } - if (is_cowonly_root(btrfs_ref_root_v0(eb, - ref0))) - cur->cowonly = 1; - } -#else - ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY); if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { -#endif if (key.objectid == key.offset) { /* * only root blocks of reloc trees use @@ -876,6 +830,12 @@ again: edge->node[UPPER] = upper; goto next; + } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { + err = -EINVAL; + btrfs_print_v0_err(rc->extent_root->fs_info); + btrfs_handle_fs_error(rc->extent_root->fs_info, err, + NULL); + goto out; } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { goto next; } @@ -1321,18 +1281,19 @@ static void __del_reloc_root(struct btrfs_root *root) struct mapping_node *node = NULL; struct reloc_control *rc = fs_info->reloc_ctl; - spin_lock(&rc->reloc_root_tree.lock); - rb_node = 
tree_search(&rc->reloc_root_tree.rb_root, - root->node->start); - if (rb_node) { - node = rb_entry(rb_node, struct mapping_node, rb_node); - rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + if (rc) { + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->node->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + } + spin_unlock(&rc->reloc_root_tree.lock); + if (!node) + return; + BUG_ON((struct btrfs_root *)node->data != root); } - spin_unlock(&rc->reloc_root_tree.lock); - - if (!node) - return; - BUG_ON((struct btrfs_root *)node->data != root); spin_lock(&fs_info->trans_lock); list_del_init(&root->root_list); @@ -1918,13 +1879,12 @@ again: * and tree block numbers, if current trans doesn't free * data reloc tree inode. */ - ret = btrfs_qgroup_trace_subtree(trans, src, parent, + ret = btrfs_qgroup_trace_subtree(trans, parent, btrfs_header_generation(parent), btrfs_header_level(parent)); if (ret < 0) break; - ret = btrfs_qgroup_trace_subtree(trans, dest, - path->nodes[level], + ret = btrfs_qgroup_trace_subtree(trans, path->nodes[level], btrfs_header_generation(path->nodes[level]), btrfs_header_level(path->nodes[level])); if (ret < 0) @@ -3333,48 +3293,6 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, return 0; } -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static int get_ref_objectid_v0(struct reloc_control *rc, - struct btrfs_path *path, - struct btrfs_key *extent_key, - u64 *ref_objectid, int *path_change) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - struct btrfs_extent_ref_v0 *ref0; - int ret; - int slot; - - leaf = path->nodes[0]; - slot = path->slots[0]; - while (1) { - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(rc->extent_root, path); - if (ret < 0) - return ret; - BUG_ON(ret > 0); - leaf = path->nodes[0]; - slot = path->slots[0]; - if (path_change) - 
*path_change = 1; - } - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != extent_key->objectid) - return -ENOENT; - - if (key.type != BTRFS_EXTENT_REF_V0_KEY) { - slot++; - continue; - } - ref0 = btrfs_item_ptr(leaf, slot, - struct btrfs_extent_ref_v0); - *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0); - break; - } - return 0; -} -#endif - /* * helper to add a tree block to the list. * the major work is getting the generation and level of the block @@ -3407,23 +3325,12 @@ static int add_tree_block(struct reloc_control *rc, level = (int)extent_key->offset; } generation = btrfs_extent_generation(eb, ei); + } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { + btrfs_print_v0_err(eb->fs_info); + btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); + return -EINVAL; } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - u64 ref_owner; - int ret; - - BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); - ret = get_ref_objectid_v0(rc, path, extent_key, - &ref_owner, NULL); - if (ret < 0) - return ret; - BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); - level = (int)ref_owner; - /* FIXME: get real generation */ - generation = 0; -#else BUG(); -#endif } btrfs_release_path(path); @@ -3563,11 +3470,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, root, NULL); - if (IS_ERR(inode) || is_bad_inode(inode)) { - if (!IS_ERR(inode)) - iput(inode); + if (IS_ERR(inode)) return -ENOENT; - } truncate: ret = btrfs_check_trunc_cache_free_space(fs_info, @@ -3781,12 +3685,7 @@ int add_data_references(struct reloc_control *rc, eb = path->nodes[0]; ptr = btrfs_item_ptr_offset(eb, path->slots[0]); end = ptr + btrfs_item_size_nr(eb, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (ptr + sizeof(struct btrfs_extent_item_v0) == end) - ptr = end; - else -#endif - ptr += sizeof(struct btrfs_extent_item); + ptr += sizeof(struct btrfs_extent_item); while (ptr < end) { iref = (struct 
btrfs_extent_inline_ref *)ptr; @@ -3801,7 +3700,7 @@ int add_data_references(struct reloc_control *rc, ret = find_data_references(rc, extent_key, eb, dref, blocks); } else { - ret = -EINVAL; + ret = -EUCLEAN; btrfs_err(rc->extent_root->fs_info, "extent %llu slot %d has an invalid inline ref type", eb->start, path->slots[0]); @@ -3832,13 +3731,7 @@ int add_data_references(struct reloc_control *rc, if (key.objectid != extent_key->objectid) break; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (key.type == BTRFS_SHARED_DATA_REF_KEY || - key.type == BTRFS_EXTENT_REF_V0_KEY) { -#else - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); if (key.type == BTRFS_SHARED_DATA_REF_KEY) { -#endif ret = __add_tree_block(rc, key.offset, blocksize, blocks); } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { @@ -3846,6 +3739,10 @@ int add_data_references(struct reloc_control *rc, struct btrfs_extent_data_ref); ret = find_data_references(rc, extent_key, eb, dref, blocks); + } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { + btrfs_print_v0_err(eb->fs_info); + btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); + ret = -EINVAL; } else { ret = 0; } @@ -4084,41 +3981,13 @@ restart: flags = btrfs_extent_flags(path->nodes[0], ei); ret = check_extent_flags(flags); BUG_ON(ret); - + } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { + err = -EINVAL; + btrfs_print_v0_err(trans->fs_info); + btrfs_abort_transaction(trans, err); + break; } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - u64 ref_owner; - int path_change = 0; - - BUG_ON(item_size != - sizeof(struct btrfs_extent_item_v0)); - ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, - &path_change); - if (ret < 0) { - err = ret; - break; - } - if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) - flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; - else - flags = BTRFS_EXTENT_FLAG_DATA; - - if (path_change) { - btrfs_release_path(path); - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, 
rc->extent_root, - &key, path, 0, 0); - if (ret < 0) { - err = ret; - break; - } - BUG_ON(ret > 0); - } -#else BUG(); -#endif } if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { @@ -4169,8 +4038,7 @@ restart: } } if (trans && progress && err == -ENOSPC) { - ret = btrfs_force_chunk_alloc(trans, fs_info, - rc->block_group->flags); + ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); if (ret == 1) { err = 0; progress = 0; @@ -4284,7 +4152,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, root, NULL); - BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); + BUG_ON(IS_ERR(inode)); BTRFS_I(inode)->index_cnt = group->key.objectid; err = btrfs_orphan_add(trans, BTRFS_I(inode)); @@ -4375,7 +4243,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) rc->block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!rc->block_group); - ret = btrfs_inc_block_group_ro(fs_info, rc->block_group); + ret = btrfs_inc_block_group_ro(rc->block_group); if (ret) { err = ret; goto out; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index c451285976ac..65bda0682928 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -320,9 +320,9 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) /* drop the root item for 'key' from the tree root */ int btrfs_del_root(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, const struct btrfs_key *key) + const struct btrfs_key *key) { - struct btrfs_root *root = fs_info->tree_root; + struct btrfs_root *root = trans->fs_info->tree_root; struct btrfs_path *path; int ret; @@ -341,13 +341,12 @@ out: return ret; } -int btrfs_del_root_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, - const char *name, int name_len) +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 
dirid, u64 *sequence, const char *name, + int name_len) { - struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_path *path; struct btrfs_root_ref *ref; struct extent_buffer *leaf; @@ -413,12 +412,11 @@ out: * * Will return 0, -ENOMEM, or anything from the CoW path */ -int btrfs_add_root_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - u64 root_id, u64 ref_id, u64 dirid, u64 sequence, - const char *name, int name_len) +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 sequence, const char *name, + int name_len) { - struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_key key; int ret; struct btrfs_path *path; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 572306036477..3be1456b5116 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -188,32 +188,6 @@ struct scrub_ctx { refcount_t refs; }; -struct scrub_fixup_nodatasum { - struct scrub_ctx *sctx; - struct btrfs_device *dev; - u64 logical; - struct btrfs_root *root; - struct btrfs_work work; - int mirror_num; -}; - -struct scrub_nocow_inode { - u64 inum; - u64 offset; - u64 root; - struct list_head list; -}; - -struct scrub_copy_nocow_ctx { - struct scrub_ctx *sctx; - u64 logical; - u64 len; - int mirror_num; - u64 physical_for_dev_replace; - struct list_head inodes; - struct btrfs_work work; -}; - struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; @@ -232,8 +206,6 @@ struct full_stripe_lock { static void scrub_pending_bio_inc(struct scrub_ctx *sctx); static void scrub_pending_bio_dec(struct scrub_ctx *sctx); -static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); -static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_block 
*original_sblock, struct scrub_block *sblocks_for_recheck); @@ -277,13 +249,6 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, static void scrub_wr_submit(struct scrub_ctx *sctx); static void scrub_wr_bio_end_io(struct bio *bio); static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); -static int write_page_nocow(struct scrub_ctx *sctx, - u64 physical_for_dev_replace, struct page *page); -static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, - struct scrub_copy_nocow_ctx *ctx); -static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, - int mirror_num, u64 physical_for_dev_replace); -static void copy_nocow_pages_worker(struct btrfs_work *work); static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_put_ctx(struct scrub_ctx *sctx); @@ -555,60 +520,6 @@ out: return ret; } -/* - * used for workers that require transaction commits (i.e., for the - * NOCOW case) - */ -static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) -{ - struct btrfs_fs_info *fs_info = sctx->fs_info; - - refcount_inc(&sctx->refs); - /* - * increment scrubs_running to prevent cancel requests from - * completing as long as a worker is running. we must also - * increment scrubs_paused to prevent deadlocking on pause - * requests used for transactions commits (as the worker uses a - * transaction context). it is safe to regard the worker - * as paused for all matters practical. effectively, we only - * avoid cancellation requests from completing. - */ - mutex_lock(&fs_info->scrub_lock); - atomic_inc(&fs_info->scrubs_running); - atomic_inc(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - - /* - * check if @scrubs_running=@scrubs_paused condition - * inside wait_event() is not an atomic operation. - * which means we may inc/dec @scrub_running/paused - * at any time. 
Let's wake up @scrub_pause_wait as - * much as we can to let commit transaction blocked less. - */ - wake_up(&fs_info->scrub_pause_wait); - - atomic_inc(&sctx->workers_pending); -} - -/* used for workers that require transaction commits */ -static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) -{ - struct btrfs_fs_info *fs_info = sctx->fs_info; - - /* - * see scrub_pending_trans_workers_inc() why we're pretending - * to be paused in the scrub counters - */ - mutex_lock(&fs_info->scrub_lock); - atomic_dec(&fs_info->scrubs_running); - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - atomic_dec(&sctx->workers_pending); - wake_up(&fs_info->scrub_pause_wait); - wake_up(&sctx->list_wait); - scrub_put_ctx(sctx); -} - static void scrub_free_csums(struct scrub_ctx *sctx) { while (!list_empty(&sctx->csum_list)) { @@ -882,194 +793,6 @@ out: btrfs_free_path(path); } -static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) -{ - struct page *page = NULL; - unsigned long index; - struct scrub_fixup_nodatasum *fixup = fixup_ctx; - int ret; - int corrected = 0; - struct btrfs_key key; - struct inode *inode = NULL; - struct btrfs_fs_info *fs_info; - u64 end = offset + PAGE_SIZE - 1; - struct btrfs_root *local_root; - int srcu_index; - - key.objectid = root; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - fs_info = fixup->root->fs_info; - srcu_index = srcu_read_lock(&fs_info->subvol_srcu); - - local_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(local_root)) { - srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); - return PTR_ERR(local_root); - } - - key.type = BTRFS_INODE_ITEM_KEY; - key.objectid = inum; - key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); - srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - index = offset >> PAGE_SHIFT; - - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); - if 
(!page) { - ret = -ENOMEM; - goto out; - } - - if (PageUptodate(page)) { - if (PageDirty(page)) { - /* - * we need to write the data to the defect sector. the - * data that was in that sector is not in memory, - * because the page was modified. we must not write the - * modified page to that sector. - * - * TODO: what could be done here: wait for the delalloc - * runner to write out that page (might involve - * COW) and see whether the sector is still - * referenced afterwards. - * - * For the meantime, we'll treat this error - * incorrectable, although there is a chance that a - * later scrub will find the bad sector again and that - * there's no dirty page in memory, then. - */ - ret = -EIO; - goto out; - } - ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE, - fixup->logical, page, - offset - page_offset(page), - fixup->mirror_num); - unlock_page(page); - corrected = !ret; - } else { - /* - * we need to get good data first. the general readpage path - * will call repair_io_failure for us, we just have to make - * sure we read the bad mirror. - */ - ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED); - if (ret) { - /* set_extent_bits should give proper error */ - WARN_ON(ret > 0); - if (ret > 0) - ret = -EFAULT; - goto out; - } - - ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, - btrfs_get_extent, - fixup->mirror_num); - wait_on_page_locked(page); - - corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, - end, EXTENT_DAMAGED, 0, NULL); - if (!corrected) - clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED); - } - -out: - if (page) - put_page(page); - - iput(inode); - - if (ret < 0) - return ret; - - if (ret == 0 && corrected) { - /* - * we only need to call readpage for one of the inodes belonging - * to this extent. 
so make iterate_extent_inodes stop - */ - return 1; - } - - return -EIO; -} - -static void scrub_fixup_nodatasum(struct btrfs_work *work) -{ - struct btrfs_fs_info *fs_info; - int ret; - struct scrub_fixup_nodatasum *fixup; - struct scrub_ctx *sctx; - struct btrfs_trans_handle *trans = NULL; - struct btrfs_path *path; - int uncorrectable = 0; - - fixup = container_of(work, struct scrub_fixup_nodatasum, work); - sctx = fixup->sctx; - fs_info = fixup->root->fs_info; - - path = btrfs_alloc_path(); - if (!path) { - spin_lock(&sctx->stat_lock); - ++sctx->stat.malloc_errors; - spin_unlock(&sctx->stat_lock); - uncorrectable = 1; - goto out; - } - - trans = btrfs_join_transaction(fixup->root); - if (IS_ERR(trans)) { - uncorrectable = 1; - goto out; - } - - /* - * the idea is to trigger a regular read through the standard path. we - * read a page from the (failed) logical address by specifying the - * corresponding copynum of the failed sector. thus, that readpage is - * expected to fail. - * that is the point where on-the-fly error correction will kick in - * (once it's finished) and rewrite the failed sector if a good copy - * can be found. 
- */ - ret = iterate_inodes_from_logical(fixup->logical, fs_info, path, - scrub_fixup_readpage, fixup, false); - if (ret < 0) { - uncorrectable = 1; - goto out; - } - WARN_ON(ret != 1); - - spin_lock(&sctx->stat_lock); - ++sctx->stat.corrected_errors; - spin_unlock(&sctx->stat_lock); - -out: - if (trans && !IS_ERR(trans)) - btrfs_end_transaction(trans); - if (uncorrectable) { - spin_lock(&sctx->stat_lock); - ++sctx->stat.uncorrectable_errors; - spin_unlock(&sctx->stat_lock); - btrfs_dev_replace_stats_inc( - &fs_info->dev_replace.num_uncorrectable_read_errors); - btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (nodatasum) error at logical %llu on dev %s", - fixup->logical, rcu_str_deref(fixup->dev->name)); - } - - btrfs_free_path(path); - kfree(fixup); - - scrub_pending_trans_workers_dec(sctx); -} - static inline void scrub_get_recover(struct scrub_recover *recover) { refcount_inc(&recover->refs); @@ -1151,11 +874,6 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) return ret; } - if (sctx->is_dev_replace && !is_metadata && !have_csum) { - sblocks_for_recheck = NULL; - goto nodatasum_case; - } - /* * read all mirrors one after the other. This includes to * re-read the extent or metadata block that failed (that was @@ -1268,36 +986,6 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) goto out; } - if (!is_metadata && !have_csum) { - struct scrub_fixup_nodatasum *fixup_nodatasum; - - WARN_ON(sctx->is_dev_replace); - -nodatasum_case: - - /* - * !is_metadata and !have_csum, this means that the data - * might not be COWed, that it might be modified - * concurrently. The general strategy to work on the - * commit root does not help in the case when COW is not - * used. 
- */ - fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); - if (!fixup_nodatasum) - goto did_not_correct_error; - fixup_nodatasum->sctx = sctx; - fixup_nodatasum->dev = dev; - fixup_nodatasum->logical = logical; - fixup_nodatasum->root = fs_info->extent_root; - fixup_nodatasum->mirror_num = failed_mirror_index + 1; - scrub_pending_trans_workers_inc(sctx); - btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper, - scrub_fixup_nodatasum, NULL, NULL); - btrfs_queue_work(fs_info->scrub_workers, - &fixup_nodatasum->work); - goto out; - } - /* * now build and submit the bios for the other mirrors, check * checksums. @@ -1865,7 +1553,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, bio = btrfs_io_bio_alloc(1); bio_set_dev(bio, page_bad->dev->bdev); bio->bi_iter.bi_sector = page_bad->physical >> 9; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE; ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { @@ -1960,7 +1648,7 @@ again: bio->bi_end_io = scrub_wr_bio_end_io; bio_set_dev(bio, sbio->dev->bdev); bio->bi_iter.bi_sector = sbio->physical >> 9; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE; sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || @@ -2360,7 +2048,7 @@ again: bio->bi_end_io = scrub_bio_end_io; bio_set_dev(bio, sbio->dev->bdev); bio->bi_iter.bi_sector = sbio->physical >> 9; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_opf = REQ_OP_READ; sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || @@ -2799,17 +2487,10 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, have_csum = scrub_find_csum(sctx, logical, csum); if (have_csum == 0) ++sctx->stat.no_csum; - if (0 && sctx->is_dev_replace && !have_csum) { - ret = copy_nocow_pages(sctx, logical, l, - mirror_num, - physical_for_dev_replace); - goto behind_scrub_pages; - 
} } ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, mirror_num, have_csum ? csum : NULL, 0, physical_for_dev_replace); -behind_scrub_pages: if (ret) return ret; len -= l; @@ -3862,7 +3543,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * -> btrfs_scrub_pause() */ scrub_pause_on(fs_info); - ret = btrfs_inc_block_group_ro(fs_info, cache); + ret = btrfs_inc_block_group_ro(cache); if (!ret && is_dev_replace) { /* * If we are doing a device replace wait for any tasks @@ -3981,14 +3662,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache->removed && !cache->ro && cache->reserved == 0 && btrfs_block_group_used(&cache->item) == 0) { spin_unlock(&cache->lock); - spin_lock(&fs_info->unused_bgs_lock); - if (list_empty(&cache->bg_list)) { - btrfs_get_block_group(cache); - trace_btrfs_add_unused_block_group(cache); - list_add_tail(&cache->bg_list, - &fs_info->unused_bgs); - } - spin_unlock(&fs_info->unused_bgs_lock); + btrfs_mark_bg_unused(cache); } else { spin_unlock(&cache->lock); } @@ -4071,10 +3745,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (!fs_info->scrub_wr_completion_workers) goto fail_scrub_wr_completion_workers; - fs_info->scrub_nocow_workers = - btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0); - if (!fs_info->scrub_nocow_workers) - goto fail_scrub_nocow_workers; fs_info->scrub_parity_workers = btrfs_alloc_workqueue(fs_info, "scrubparity", flags, max_active, 2); @@ -4085,8 +3755,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, return 0; fail_scrub_parity_workers: - btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); -fail_scrub_nocow_workers: btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); fail_scrub_wr_completion_workers: btrfs_destroy_workqueue(fs_info->scrub_workers); @@ -4099,7 +3767,6 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) if (--fs_info->scrub_workers_refcnt == 0) { 
btrfs_destroy_workqueue(fs_info->scrub_workers); btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); - btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); btrfs_destroy_workqueue(fs_info->scrub_parity_workers); } WARN_ON(fs_info->scrub_workers_refcnt < 0); @@ -4112,7 +3779,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, struct scrub_ctx *sctx; int ret; struct btrfs_device *dev; - struct rcu_string *name; if (btrfs_fs_closing(fs_info)) return -EINVAL; @@ -4166,11 +3832,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (!is_dev_replace && !readonly && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - rcu_read_lock(); - name = rcu_dereference(dev->name); - btrfs_err(fs_info, "scrub: device %s is not writable", - name->str); - rcu_read_unlock(); + btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", + rcu_str_deref(dev->name)); return -EROFS; } @@ -4358,330 +4021,3 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, *extent_dev = bbio->stripes[0].dev; btrfs_put_bbio(bbio); } - -static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, - int mirror_num, u64 physical_for_dev_replace) -{ - struct scrub_copy_nocow_ctx *nocow_ctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; - - nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS); - if (!nocow_ctx) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - return -ENOMEM; - } - - scrub_pending_trans_workers_inc(sctx); - - nocow_ctx->sctx = sctx; - nocow_ctx->logical = logical; - nocow_ctx->len = len; - nocow_ctx->mirror_num = mirror_num; - nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; - btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper, - copy_nocow_pages_worker, NULL, NULL); - INIT_LIST_HEAD(&nocow_ctx->inodes); - btrfs_queue_work(fs_info->scrub_nocow_workers, - &nocow_ctx->work); - - 
return 0; -} - -static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx) -{ - struct scrub_copy_nocow_ctx *nocow_ctx = ctx; - struct scrub_nocow_inode *nocow_inode; - - nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS); - if (!nocow_inode) - return -ENOMEM; - nocow_inode->inum = inum; - nocow_inode->offset = offset; - nocow_inode->root = root; - list_add_tail(&nocow_inode->list, &nocow_ctx->inodes); - return 0; -} - -#define COPY_COMPLETE 1 - -static void copy_nocow_pages_worker(struct btrfs_work *work) -{ - struct scrub_copy_nocow_ctx *nocow_ctx = - container_of(work, struct scrub_copy_nocow_ctx, work); - struct scrub_ctx *sctx = nocow_ctx->sctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *root = fs_info->extent_root; - u64 logical = nocow_ctx->logical; - u64 len = nocow_ctx->len; - int mirror_num = nocow_ctx->mirror_num; - u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; - int ret; - struct btrfs_trans_handle *trans = NULL; - struct btrfs_path *path; - int not_written = 0; - - path = btrfs_alloc_path(); - if (!path) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - not_written = 1; - goto out; - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - not_written = 1; - goto out; - } - - ret = iterate_inodes_from_logical(logical, fs_info, path, - record_inode_for_nocow, nocow_ctx, false); - if (ret != 0 && ret != -ENOENT) { - btrfs_warn(fs_info, - "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d", - logical, physical_for_dev_replace, len, mirror_num, - ret); - not_written = 1; - goto out; - } - - btrfs_end_transaction(trans); - trans = NULL; - while (!list_empty(&nocow_ctx->inodes)) { - struct scrub_nocow_inode *entry; - entry = list_first_entry(&nocow_ctx->inodes, - struct scrub_nocow_inode, - list); - list_del_init(&entry->list); - ret = copy_nocow_pages_for_inode(entry->inum, entry->offset, - 
entry->root, nocow_ctx); - kfree(entry); - if (ret == COPY_COMPLETE) { - ret = 0; - break; - } else if (ret) { - break; - } - } -out: - while (!list_empty(&nocow_ctx->inodes)) { - struct scrub_nocow_inode *entry; - entry = list_first_entry(&nocow_ctx->inodes, - struct scrub_nocow_inode, - list); - list_del_init(&entry->list); - kfree(entry); - } - if (trans && !IS_ERR(trans)) - btrfs_end_transaction(trans); - if (not_written) - btrfs_dev_replace_stats_inc(&fs_info->dev_replace. - num_uncorrectable_read_errors); - - btrfs_free_path(path); - kfree(nocow_ctx); - - scrub_pending_trans_workers_dec(sctx); -} - -static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len, - u64 logical) -{ - struct extent_state *cached_state = NULL; - struct btrfs_ordered_extent *ordered; - struct extent_io_tree *io_tree; - struct extent_map *em; - u64 lockstart = start, lockend = start + len - 1; - int ret = 0; - - io_tree = &inode->io_tree; - - lock_extent_bits(io_tree, lockstart, lockend, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, lockstart, len); - if (ordered) { - btrfs_put_ordered_extent(ordered); - ret = 1; - goto out_unlock; - } - - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out_unlock; - } - - /* - * This extent does not actually cover the logical extent anymore, - * move on to the next inode. 
- */ - if (em->block_start > logical || - em->block_start + em->block_len < logical + len || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { - free_extent_map(em); - ret = 1; - goto out_unlock; - } - free_extent_map(em); - -out_unlock: - unlock_extent_cached(io_tree, lockstart, lockend, &cached_state); - return ret; -} - -static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, - struct scrub_copy_nocow_ctx *nocow_ctx) -{ - struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info; - struct btrfs_key key; - struct inode *inode; - struct page *page; - struct btrfs_root *local_root; - struct extent_io_tree *io_tree; - u64 physical_for_dev_replace; - u64 nocow_ctx_logical; - u64 len = nocow_ctx->len; - unsigned long index; - int srcu_index; - int ret = 0; - int err = 0; - - key.objectid = root; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - srcu_index = srcu_read_lock(&fs_info->subvol_srcu); - - local_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(local_root)) { - srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); - return PTR_ERR(local_root); - } - - key.type = BTRFS_INODE_ITEM_KEY; - key.objectid = inum; - key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); - srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - /* Avoid truncate/dio/punch hole.. */ - inode_lock(inode); - inode_dio_wait(inode); - - physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; - io_tree = &BTRFS_I(inode)->io_tree; - nocow_ctx_logical = nocow_ctx->logical; - - ret = check_extent_to_block(BTRFS_I(inode), offset, len, - nocow_ctx_logical); - if (ret) { - ret = ret > 0 ? 
0 : ret; - goto out; - } - - while (len >= PAGE_SIZE) { - index = offset >> PAGE_SHIFT; -again: - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); - if (!page) { - btrfs_err(fs_info, "find_or_create_page() failed"); - ret = -ENOMEM; - goto out; - } - - if (PageUptodate(page)) { - if (PageDirty(page)) - goto next_page; - } else { - ClearPageError(page); - err = extent_read_full_page(io_tree, page, - btrfs_get_extent, - nocow_ctx->mirror_num); - if (err) { - ret = err; - goto next_page; - } - - lock_page(page); - /* - * If the page has been remove from the page cache, - * the data on it is meaningless, because it may be - * old one, the new data may be written into the new - * page in the page cache. - */ - if (page->mapping != inode->i_mapping) { - unlock_page(page); - put_page(page); - goto again; - } - if (!PageUptodate(page)) { - ret = -EIO; - goto next_page; - } - } - - ret = check_extent_to_block(BTRFS_I(inode), offset, len, - nocow_ctx_logical); - if (ret) { - ret = ret > 0 ? 
0 : ret; - goto next_page; - } - - err = write_page_nocow(nocow_ctx->sctx, - physical_for_dev_replace, page); - if (err) - ret = err; -next_page: - unlock_page(page); - put_page(page); - - if (ret) - break; - - offset += PAGE_SIZE; - physical_for_dev_replace += PAGE_SIZE; - nocow_ctx_logical += PAGE_SIZE; - len -= PAGE_SIZE; - } - ret = COPY_COMPLETE; -out: - inode_unlock(inode); - iput(inode); - return ret; -} - -static int write_page_nocow(struct scrub_ctx *sctx, - u64 physical_for_dev_replace, struct page *page) -{ - struct bio *bio; - struct btrfs_device *dev; - - dev = sctx->wr_tgtdev; - if (!dev) - return -EIO; - if (!dev->bdev) { - btrfs_warn_rl(dev->fs_info, - "scrub write_page_nocow(bdev == NULL) is unexpected"); - return -EIO; - } - bio = btrfs_io_bio_alloc(1); - bio->bi_iter.bi_size = 0; - bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; - bio_set_dev(bio, dev->bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; - /* bio_add_page won't fail on a freshly allocated bio */ - bio_add_page(bio, page, PAGE_SIZE, 0); - - if (btrfsic_submit_bio_wait(bio)) { - bio_put(bio); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - return -EIO; - } - - bio_put(bio); - return 0; -} diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c47f62b19226..ba8950bfd9c7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -100,6 +100,7 @@ struct send_ctx { u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; + bool ignore_cur_inode; u64 send_progress; @@ -1500,7 +1501,7 @@ static int read_symlink(struct btrfs_root *root, BUG_ON(compression); off = btrfs_file_extent_inline_start(ei); - len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei); + len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); @@ -5006,6 +5007,15 @@ static int send_hole(struct send_ctx *sctx, u64 end) u64 len; int ret = 0; + /* + * A hole that starts at EOF or beyond it. 
Since we do not yet support + * fallocate (for extent preallocation and hole punching), sending a + * write of zeroes starting at EOF or beyond would later require issuing + * a truncate operation which would undo the write and achieve nothing. + */ + if (offset >= sctx->cur_inode_size) + return 0; + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, end - offset); @@ -5160,7 +5170,7 @@ static int clone_range(struct send_ctx *sctx, ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); type = btrfs_file_extent_type(leaf, ei); if (type == BTRFS_FILE_EXTENT_INLINE) { - ext_len = btrfs_file_extent_inline_len(leaf, slot, ei); + ext_len = btrfs_file_extent_ram_bytes(leaf, ei); ext_len = PAGE_ALIGN(ext_len); } else { ext_len = btrfs_file_extent_num_bytes(leaf, ei); @@ -5236,8 +5246,7 @@ static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], ei); if (type == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(path->nodes[0], - path->slots[0], ei); + len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); /* * it is possible the inline item won't cover the whole page, * but there may be items after this page. 
Make @@ -5375,7 +5384,7 @@ static int is_extent_unchanged(struct send_ctx *sctx, } if (right_type == BTRFS_FILE_EXTENT_INLINE) { - right_len = btrfs_file_extent_inline_len(eb, slot, ei); + right_len = btrfs_file_extent_ram_bytes(eb, ei); right_len = PAGE_ALIGN(right_len); } else { right_len = btrfs_file_extent_num_bytes(eb, ei); @@ -5496,8 +5505,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset) struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], fi); if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_inline_len(path->nodes[0], - path->slots[0], fi); + u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi); extent_end = ALIGN(key.offset + size, sctx->send_root->fs_info->sectorsize); } else { @@ -5560,7 +5568,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_inline_len(leaf, slot, fi); + u64 size = btrfs_file_extent_ram_bytes(leaf, fi); extent_end = ALIGN(key.offset + size, root->fs_info->sectorsize); @@ -5606,8 +5614,7 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], fi); if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_inline_len(path->nodes[0], - path->slots[0], fi); + u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi); extent_end = ALIGN(key->offset + size, sctx->send_root->fs_info->sectorsize); } else { @@ -5799,6 +5806,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) int pending_move = 0; int refs_processed = 0; + if (sctx->ignore_cur_inode) + return 0; + ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move, &refs_processed); if (ret < 0) @@ -5917,6 +5927,93 @@ out: return ret; } +struct parent_paths_ctx { + struct list_head *refs; + struct send_ctx 
*sctx; +}; + +static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name, + void *ctx) +{ + struct parent_paths_ctx *ppctx = ctx; + + return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx, + ppctx->refs); +} + +/* + * Issue unlink operations for all paths of the current inode found in the + * parent snapshot. + */ +static int btrfs_unlink_all_paths(struct send_ctx *sctx) +{ + LIST_HEAD(deleted_refs); + struct btrfs_path *path; + struct btrfs_key key; + struct parent_paths_ctx ctx; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + ctx.refs = &deleted_refs; + ctx.sctx = sctx; + + while (true) { + struct extent_buffer *eb = path->nodes[0]; + int slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(sctx->parent_root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.objectid != sctx->cur_ino) + break; + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + break; + + ret = iterate_inode_ref(sctx->parent_root, path, &key, 1, + record_parent_ref, &ctx); + if (ret < 0) + goto out; + + path->slots[0]++; + } + + while (!list_empty(&deleted_refs)) { + struct recorded_ref *ref; + + ref = list_first_entry(&deleted_refs, struct recorded_ref, list); + ret = send_unlink(sctx, ref->full_path); + if (ret < 0) + goto out; + fs_path_free(ref->full_path); + list_del(&ref->list); + kfree(ref); + } + ret = 0; +out: + btrfs_free_path(path); + if (ret) + __free_recorded_refs(&deleted_refs); + return ret; +} + static int changed_inode(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { @@ -5931,6 +6028,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_new_gen = 0; 
sctx->cur_inode_last_extent = (u64)-1; sctx->cur_inode_next_write_offset = 0; + sctx->ignore_cur_inode = false; /* * Set send_progress to current inode. This will tell all get_cur_xxx @@ -5971,6 +6069,33 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_new_gen = 1; } + /* + * Normally we do not find inodes with a link count of zero (orphans) + * because the most common case is to create a snapshot and use it + * for a send operation. However other less common use cases involve + * using a subvolume and send it after turning it to RO mode just + * after deleting all hard links of a file while holding an open + * file descriptor against it or turning a RO snapshot into RW mode, + * keep an open file descriptor against a file, delete it and then + * turn the snapshot back to RO mode before using it for a send + * operation. So if we find such cases, ignore the inode and all its + * items completely if it's a new inode, or if it's a changed inode + * make sure all its previous paths (from the parent snapshot) are all + * unlinked and all the other inode items are ignored.
+ */ + if (result == BTRFS_COMPARE_TREE_NEW || + result == BTRFS_COMPARE_TREE_CHANGED) { + u32 nlinks; + + nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii); + if (nlinks == 0) { + sctx->ignore_cur_inode = true; + if (result == BTRFS_COMPARE_TREE_CHANGED) + ret = btrfs_unlink_all_paths(sctx); + goto out; + } + } + if (result == BTRFS_COMPARE_TREE_NEW) { sctx->cur_inode_gen = left_gen; sctx->cur_inode_new = 1; @@ -6309,15 +6434,17 @@ static int changed_cb(struct btrfs_path *left_path, key->objectid == BTRFS_FREE_SPACE_OBJECTID) goto out; - if (key->type == BTRFS_INODE_ITEM_KEY) + if (key->type == BTRFS_INODE_ITEM_KEY) { ret = changed_inode(sctx, result); - else if (key->type == BTRFS_INODE_REF_KEY || - key->type == BTRFS_INODE_EXTREF_KEY) - ret = changed_ref(sctx, result); - else if (key->type == BTRFS_XATTR_ITEM_KEY) - ret = changed_xattr(sctx, result); - else if (key->type == BTRFS_EXTENT_DATA_KEY) - ret = changed_extent(sctx, result); + } else if (!sctx->ignore_cur_inode) { + if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) + ret = changed_ref(sctx, result); + else if (key->type == BTRFS_XATTR_ITEM_KEY) + ret = changed_xattr(sctx, result); + else if (key->type == BTRFS_EXTENT_DATA_KEY) + ret = changed_extent(sctx, result); + } out: return ret; @@ -6328,7 +6455,6 @@ static int full_send_tree(struct send_ctx *sctx) int ret; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; - struct btrfs_key found_key; struct btrfs_path *path; struct extent_buffer *eb; int slot; @@ -6350,17 +6476,13 @@ static int full_send_tree(struct send_ctx *sctx) while (1) { eb = path->nodes[0]; slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); + btrfs_item_key_to_cpu(eb, &key, slot); - ret = changed_cb(path, NULL, &found_key, + ret = changed_cb(path, NULL, &key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; - key.objectid = found_key.objectid; - key.type = found_key.type; - key.offset = found_key.offset 
+ 1; - ret = btrfs_next_item(send_root, path); if (ret < 0) goto out; diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index b7b4acb12833..4c13b737f568 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -3,7 +3,6 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ -#include <linux/highmem.h> #include <asm/unaligned.h> #include "ctree.h" diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 81107ad49f3a..6601c9aa5e35 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -5,7 +5,6 @@ #include <linux/blkdev.h> #include <linux/module.h> -#include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> @@ -15,8 +14,6 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mount.h> -#include <linux/mpage.h> -#include <linux/swap.h> #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> @@ -468,9 +465,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_subvolrootid: case Opt_device: /* - * These are parsed by btrfs_parse_subvol_options - * and btrfs_parse_early_options - * and can be happily ignored here. + * These are parsed by btrfs_parse_subvol_options or + * btrfs_parse_device_options and can be ignored here. */ break; case Opt_nodatasum: @@ -760,6 +756,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_recovery: btrfs_warn(info, "'recovery' is deprecated, use 'usebackuproot' instead"); + /* fall through */ case Opt_usebackuproot: btrfs_info(info, "trying to use backup root at mount time"); @@ -885,13 +882,16 @@ out: * All other options will be parsed on much later in the mount process and * only when we need to allocate a new super block. 
*/ -static int btrfs_parse_early_options(const char *options, fmode_t flags, - void *holder, struct btrfs_fs_devices **fs_devices) +static int btrfs_parse_device_options(const char *options, fmode_t flags, + void *holder) { substring_t args[MAX_OPT_ARGS]; char *device_name, *opts, *orig, *p; + struct btrfs_device *device = NULL; int error = 0; + lockdep_assert_held(&uuid_mutex); + if (!options) return 0; @@ -917,11 +917,13 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, error = -ENOMEM; goto out; } - error = btrfs_scan_one_device(device_name, - flags, holder, fs_devices); + device = btrfs_scan_one_device(device_name, flags, + holder); kfree(device_name); - if (error) + if (IS_ERR(device)) { + error = PTR_ERR(device); goto out; + } } } @@ -935,8 +937,8 @@ out: * * The value is later passed to mount_subvol() */ -static int btrfs_parse_subvol_options(const char *options, fmode_t flags, - char **subvol_name, u64 *subvol_objectid) +static int btrfs_parse_subvol_options(const char *options, char **subvol_name, + u64 *subvol_objectid) { substring_t args[MAX_OPT_ARGS]; char *opts, *orig, *p; @@ -948,7 +950,7 @@ static int btrfs_parse_subvol_options(const char *options, fmode_t flags, /* * strsep changes the string, duplicate it because - * btrfs_parse_early_options gets called later + * btrfs_parse_device_options gets called later */ opts = kstrdup(options, GFP_KERNEL); if (!opts) @@ -1517,6 +1519,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, { struct block_device *bdev = NULL; struct super_block *s; + struct btrfs_device *device = NULL; struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_fs_info *fs_info = NULL; struct security_mnt_opts new_sec_opts; @@ -1526,12 +1529,6 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; - error = btrfs_parse_early_options(data, mode, fs_type, - &fs_devices); - if (error) { - return ERR_PTR(error); - } - 
security_init_mnt_opts(&new_sec_opts); if (data) { error = parse_security_options(data, &new_sec_opts); @@ -1539,10 +1536,6 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, return ERR_PTR(error); } - error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); - if (error) - goto error_sec_opts; - /* * Setup a dummy root and fs_info for test/set super. This is because * we don't actually fill this stuff out until open_ctree, but we need @@ -1555,8 +1548,6 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, goto error_sec_opts; } - fs_info->fs_devices = fs_devices; - fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); security_init_mnt_opts(&fs_info->security_opts); @@ -1565,7 +1556,25 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, goto error_fs_info; } + mutex_lock(&uuid_mutex); + error = btrfs_parse_device_options(data, mode, fs_type); + if (error) { + mutex_unlock(&uuid_mutex); + goto error_fs_info; + } + + device = btrfs_scan_one_device(device_name, mode, fs_type); + if (IS_ERR(device)) { + mutex_unlock(&uuid_mutex); + error = PTR_ERR(device); + goto error_fs_info; + } + + fs_devices = device->fs_devices; + fs_info->fs_devices = fs_devices; + error = btrfs_open_devices(fs_devices, mode, fs_type); + mutex_unlock(&uuid_mutex); if (error) goto error_fs_info; @@ -1650,8 +1659,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; - error = btrfs_parse_subvol_options(data, mode, - &subvol_name, &subvol_objectid); + error = btrfs_parse_subvol_options(data, &subvol_name, + &subvol_objectid); if (error) { kfree(subvol_name); return ERR_PTR(error); @@ -2098,14 +2107,9 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) btrfs_account_ro_block_groups_free_space(found); for (i = 0; i < BTRFS_NR_RAID_TYPES; 
i++) { - if (!list_empty(&found->block_groups[i])) { - switch (i) { - case BTRFS_RAID_DUP: - case BTRFS_RAID_RAID1: - case BTRFS_RAID_RAID10: - factor = 2; - } - } + if (!list_empty(&found->block_groups[i])) + factor = btrfs_bg_type_to_factor( + btrfs_raid_array[i].bg_flag); } } @@ -2222,7 +2226,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct btrfs_ioctl_vol_args *vol; - struct btrfs_fs_devices *fs_devices; + struct btrfs_device *device = NULL; int ret = -ENOTTY; if (!capable(CAP_SYS_ADMIN)) @@ -2234,15 +2238,24 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case BTRFS_IOC_SCAN_DEV: - ret = btrfs_scan_one_device(vol->name, FMODE_READ, - &btrfs_root_fs_type, &fs_devices); + mutex_lock(&uuid_mutex); + device = btrfs_scan_one_device(vol->name, FMODE_READ, + &btrfs_root_fs_type); + ret = PTR_ERR_OR_ZERO(device); + mutex_unlock(&uuid_mutex); break; case BTRFS_IOC_DEVICES_READY: - ret = btrfs_scan_one_device(vol->name, FMODE_READ, - &btrfs_root_fs_type, &fs_devices); - if (ret) + mutex_lock(&uuid_mutex); + device = btrfs_scan_one_device(vol->name, FMODE_READ, + &btrfs_root_fs_type); + if (IS_ERR(device)) { + mutex_unlock(&uuid_mutex); + ret = PTR_ERR(device); break; - ret = !(fs_devices->num_devices == fs_devices->total_devices); + } + ret = !(device->fs_devices->num_devices == + device->fs_devices->total_devices); + mutex_unlock(&uuid_mutex); break; case BTRFS_IOC_GET_SUPPORTED_FEATURES: ret = btrfs_ioctl_get_supported_features((void __user*)arg); @@ -2290,7 +2303,6 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) struct btrfs_fs_devices *cur_devices; struct btrfs_device *dev, *first_dev = NULL; struct list_head *head; - struct rcu_string *name; /* * Lightweight locking of the devices. 
We should not need @@ -2314,12 +2326,10 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) cur_devices = cur_devices->seed; } - if (first_dev) { - name = rcu_dereference(first_dev->name); - seq_escape(m, name->str, " \t\n\\"); - } else { + if (first_dev) + seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\"); + else WARN_ON(1); - } rcu_read_unlock(); return 0; } @@ -2331,7 +2341,6 @@ static const struct super_operations btrfs_super_ops = { .sync_fs = btrfs_sync_fs, .show_options = btrfs_show_options, .show_devname = btrfs_show_devname, - .write_inode = btrfs_write_inode, .alloc_inode = btrfs_alloc_inode, .destroy_inode = btrfs_destroy_inode, .statfs = btrfs_statfs, @@ -2369,7 +2378,7 @@ static __cold void btrfs_interface_exit(void) static void __init btrfs_print_mod_info(void) { - pr_info("Btrfs loaded, crc32c=%s" + static const char options[] = "" #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif @@ -2382,8 +2391,8 @@ static void __init btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_FS_REF_VERIFY ", ref-verify=on" #endif - "\n", - crc32c_impl()); + ; + pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); } static int __init init_btrfs_fs(void) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 4a4e960c7c66..3717c864ba23 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -7,10 +7,8 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> -#include <linux/buffer_head.h> #include <linux/kobject.h> #include <linux/bug.h> -#include <linux/genhd.h> #include <linux/debugfs.h> #include "ctree.h" diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index ace94db09d29..412b910b04cc 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -216,7 +216,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, btrfs_init_dummy_trans(&trans, fs_info); test_msg("qgroup basic add"); - ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FS_TREE_OBJECTID); + ret = 
btrfs_create_qgroup(&trans, BTRFS_FS_TREE_OBJECTID); if (ret) { test_err("couldn't create a qgroup %d", ret); return ret; @@ -249,8 +249,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, - nodesize, old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, + new_roots); if (ret) { test_err("couldn't account space for a qgroup %d", ret); return ret; @@ -285,8 +285,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, - nodesize, old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, + new_roots); if (ret) { test_err("couldn't account space for a qgroup %d", ret); return -EINVAL; @@ -322,7 +322,7 @@ static int test_multiple_refs(struct btrfs_root *root, * We have BTRFS_FS_TREE_OBJECTID created already from the * previous test. */ - ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FIRST_FREE_OBJECTID); + ret = btrfs_create_qgroup(&trans, BTRFS_FIRST_FREE_OBJECTID); if (ret) { test_err("couldn't create a qgroup %d", ret); return ret; @@ -350,8 +350,8 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, - nodesize, old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, + new_roots); if (ret) { test_err("couldn't account space for a qgroup %d", ret); return ret; @@ -385,8 +385,8 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, - nodesize, old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, + new_roots); if (ret) { test_err("couldn't account space for a qgroup %d", ret); return ret; @@ -426,8 +426,8 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; 
} - ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, - nodesize, old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, + new_roots); if (ret) { test_err("couldn't account space for a qgroup %d", ret); return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index ff5f6c719976..3b84f5015029 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -241,7 +241,7 @@ loop: refcount_set(&cur_trans->use_count, 2); atomic_set(&cur_trans->pending_ordered, 0); cur_trans->flags = 0; - cur_trans->start_time = get_seconds(); + cur_trans->start_time = ktime_get_seconds(); memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); @@ -680,7 +680,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) trans = start_transaction(root, 0, TRANS_ATTACH, BTRFS_RESERVE_NO_FLUSH, true); - if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) + if (trans == ERR_PTR(-ENOENT)) btrfs_wait_for_commit(root->fs_info, 0); return trans; @@ -1152,7 +1152,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) ret = btrfs_run_dev_replace(trans, fs_info); if (ret) return ret; - ret = btrfs_run_qgroups(trans, fs_info); + ret = btrfs_run_qgroups(trans); if (ret) return ret; @@ -1355,8 +1355,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, goto out; /* Now qgroup are all updated, we can inherit it to new qgroups */ - ret = btrfs_qgroup_inherit(trans, fs_info, - src->root_key.objectid, dst_objectid, + ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid, inherit); if (ret < 0) goto out; @@ -1574,7 +1573,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* * insert root back/forward references */ - ret = btrfs_add_root_ref(trans, fs_info, objectid, + ret = btrfs_add_root_ref(trans, objectid, parent_root->root_key.objectid, btrfs_ino(BTRFS_I(parent_inode)), index, dentry->d_name.name, dentry->d_name.len); diff 
--git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 94439482a0ec..4cbb1b55387d 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -48,7 +48,7 @@ struct btrfs_transaction { int aborted; struct list_head list; struct extent_io_tree dirty_pages; - unsigned long start_time; + time64_t start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; wait_queue_head_t pending_wait; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 8d40e7dd8c30..db835635372f 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -19,6 +19,7 @@ #include "tree-checker.h" #include "disk-io.h" #include "compression.h" +#include "volumes.h" /* * Error message should follow the following format: @@ -353,6 +354,102 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, return 0; } +__printf(4, 5) +__cold +static void block_group_err(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(fs_info, + "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, key.offset, &vaf); + va_end(args); +} + +static int check_block_group_item(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_block_group_item bgi; + u32 item_size = btrfs_item_size_nr(leaf, slot); + u64 flags; + u64 type; + + /* + * Here we don't really care about alignment since extent allocator can + * handle it. We care more about the size, as if one block group is + * larger than maximum size, it must be some obvious corruption.
+ */ + if (key->offset > BTRFS_MAX_DATA_CHUNK_SIZE || key->offset == 0) { + block_group_err(fs_info, leaf, slot, + "invalid block group size, have %llu expect (0, %llu]", + key->offset, BTRFS_MAX_DATA_CHUNK_SIZE); + return -EUCLEAN; + } + + if (item_size != sizeof(bgi)) { + block_group_err(fs_info, leaf, slot, + "invalid item size, have %u expect %zu", + item_size, sizeof(bgi)); + return -EUCLEAN; + } + + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), + sizeof(bgi)); + if (btrfs_block_group_chunk_objectid(&bgi) != + BTRFS_FIRST_CHUNK_TREE_OBJECTID) { + block_group_err(fs_info, leaf, slot, + "invalid block group chunk objectid, have %llu expect %llu", + btrfs_block_group_chunk_objectid(&bgi), + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + return -EUCLEAN; + } + + if (btrfs_block_group_used(&bgi) > key->offset) { + block_group_err(fs_info, leaf, slot, + "invalid block group used, have %llu expect [0, %llu]", + btrfs_block_group_used(&bgi), key->offset); + return -EUCLEAN; + } + + flags = btrfs_block_group_flags(&bgi); + if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) { + block_group_err(fs_info, leaf, slot, +"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set", + flags & BTRFS_BLOCK_GROUP_PROFILE_MASK, + hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)); + return -EUCLEAN; + } + + type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + if (type != BTRFS_BLOCK_GROUP_DATA && + type != BTRFS_BLOCK_GROUP_METADATA && + type != BTRFS_BLOCK_GROUP_SYSTEM && + type != (BTRFS_BLOCK_GROUP_METADATA | + BTRFS_BLOCK_GROUP_DATA)) { + block_group_err(fs_info, leaf, slot, +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", + type, hweight64(type), + BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA); + return -EUCLEAN; + } + return 0; +} + /* * Common point to switch the item-specific validation.
*/ @@ -374,6 +471,9 @@ static int check_leaf_item(struct btrfs_fs_info *fs_info, case BTRFS_XATTR_ITEM_KEY: ret = check_dir_item(fs_info, leaf, key, slot); break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + ret = check_block_group_item(fs_info, leaf, key, slot); + break; } return ret; } @@ -396,9 +496,22 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, * skip this check for relocation trees. */ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { + u64 owner = btrfs_header_owner(leaf); struct btrfs_root *check_root; - key.objectid = btrfs_header_owner(leaf); + /* These trees must never be empty */ + if (owner == BTRFS_ROOT_TREE_OBJECTID || + owner == BTRFS_CHUNK_TREE_OBJECTID || + owner == BTRFS_EXTENT_TREE_OBJECTID || + owner == BTRFS_DEV_TREE_OBJECTID || + owner == BTRFS_FS_TREE_OBJECTID || + owner == BTRFS_DATA_RELOC_TREE_OBJECTID) { + generic_err(fs_info, leaf, 0, + "invalid root, root %llu must never be empty", + owner); + return -EUCLEAN; + } + key.objectid = owner; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f8220ec02036..1650dc44a5e3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -545,12 +545,8 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); - if (IS_ERR(inode)) { + if (IS_ERR(inode)) inode = NULL; - } else if (is_bad_inode(inode)) { - iput(inode); - inode = NULL; - } return inode; } @@ -597,7 +593,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (btrfs_file_extent_disk_bytenr(eb, item) == 0) nbytes = 0; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_inline_len(eb, slot, item); + size = btrfs_file_extent_ram_bytes(eb, item); nbytes = btrfs_file_extent_ram_bytes(eb, item); extent_end = ALIGN(start + size, fs_info->sectorsize); @@ 
-685,7 +681,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * as the owner of the file extent changed from log tree * (doesn't affect qgroup) to fs/file tree(affects qgroup) */ - ret = btrfs_qgroup_trace_extent(trans, fs_info, + ret = btrfs_qgroup_trace_extent(trans, btrfs_file_extent_disk_bytenr(eb, item), btrfs_file_extent_disk_num_bytes(eb, item), GFP_NOFS); @@ -715,7 +711,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * allocation tree */ ret = btrfs_alloc_logged_file_extent(trans, - fs_info, root->root_key.objectid, key->objectid, offset, &ins); if (ret) @@ -1291,6 +1286,46 @@ again: return ret; } +static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir, + const u8 ref_type, const char *name, + const int namelen) +{ + struct btrfs_key key; + struct btrfs_path *path; + const u64 parent_id = btrfs_ino(BTRFS_I(dir)); + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = ref_type; + if (key.type == BTRFS_INODE_REF_KEY) + key.offset = parent_id; + else + key.offset = btrfs_extref_hash(parent_id, name, namelen); + + ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + if (key.type == BTRFS_INODE_EXTREF_KEY) + ret = btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], parent_id, + name, namelen, NULL); + else + ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, namelen, NULL); + +out: + btrfs_free_path(path); + return ret; +} + /* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. 
@@ -1400,6 +1435,32 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } } + /* + * If a reference item already exists for this inode + * with the same parent and name, but different index, + * drop it and the corresponding directory index entries + * from the parent before adding the new reference item + * and dir index entries, otherwise we would fail with + * -EEXIST returned from btrfs_add_link() below. + */ + ret = btrfs_inode_ref_exists(inode, dir, key->type, + name, namelen); + if (ret > 0) { + ret = btrfs_unlink_inode(trans, root, + BTRFS_I(dir), + BTRFS_I(inode), + name, namelen); + /* + * If we dropped the link count to 0, bump it so + * that later the iput() on the inode will not + * free it. We will fixup the link count later. + */ + if (!ret && inode->i_nlink == 0) + inc_nlink(inode); + } + if (ret < 0) + goto out; + /* insert our name */ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), @@ -2120,7 +2181,7 @@ again: dir_key->offset, name, name_len, 0); } - if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { + if (!log_di || log_di == ERR_PTR(-ENOENT)) { btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); @@ -2933,7 +2994,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* bail out if we need to do a full commit */ if (btrfs_need_log_full_commit(fs_info, trans)) { ret = -EAGAIN; - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); goto out; } @@ -2951,7 +3011,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret); - btrfs_free_logged_extents(log, log_transid); btrfs_set_log_full_commit(fs_info, trans); mutex_unlock(&root->log_mutex); goto out; @@ -3002,7 +3061,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } btrfs_wait_tree_log_extents(log, mark); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); 
ret = -EAGAIN; goto out; @@ -3020,7 +3078,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); ret = btrfs_wait_tree_log_extents(log, mark); - btrfs_wait_logged_extents(trans, log, log_transid); wait_log_commit(log_root_tree, root_log_ctx.log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -3045,7 +3102,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (btrfs_need_log_full_commit(fs_info, trans)) { blk_finish_plug(&plug); btrfs_wait_tree_log_extents(log, mark); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; @@ -3058,7 +3114,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (ret) { btrfs_set_log_full_commit(fs_info, trans); btrfs_abort_transaction(trans, ret); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } @@ -3068,11 +3123,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_NEW | EXTENT_DIRTY); if (ret) { btrfs_set_log_full_commit(fs_info, trans); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } - btrfs_wait_logged_extents(trans, log, log_transid); btrfs_set_super_log_root(fs_info->super_for_commit, log_root_tree->node->start); @@ -3159,14 +3212,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); } - /* - * We may have short-circuited the log tree with the full commit logic - * and left ordered extents on our list, so clear these out to keep us - * from leaking inodes and memory. 
- */ - btrfs_free_logged_extents(log, 0); - btrfs_free_logged_extents(log, 1); - free_extent_buffer(log->node); kfree(log); } @@ -3756,7 +3801,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int start_slot, int nr, int inode_only, u64 logged_isize) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = trans->fs_info; unsigned long src_offset; unsigned long dst_offset; struct btrfs_root *log = inode->root->log_root; @@ -3937,9 +3982,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); if (btrfs_file_extent_type(src, extent) == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(src, - src_path->slots[0], - extent); + len = btrfs_file_extent_ram_bytes(src, extent); *last_extent = ALIGN(key.offset + len, fs_info->sectorsize); } else { @@ -4004,7 +4047,7 @@ fill_holes: extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); if (btrfs_file_extent_type(src, extent) == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(src, i, extent); + len = btrfs_file_extent_ram_bytes(src, extent); extent_end = ALIGN(key.offset + len, fs_info->sectorsize); } else { @@ -4078,131 +4121,32 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -static int wait_ordered_extents(struct btrfs_trans_handle *trans, - struct inode *inode, - struct btrfs_root *root, - const struct extent_map *em, - const struct list_head *logged_list, - bool *ordered_io_error) +static int log_extent_csums(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_root *log_root, + const struct extent_map *em) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_ordered_extent *ordered; - struct btrfs_root *log = root->log_root; - u64 mod_start = em->mod_start; - u64 mod_len = em->mod_len; - const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; u64 csum_offset; u64 csum_len; 
LIST_HEAD(ordered_sums); int ret = 0; - *ordered_io_error = false; - - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + if (inode->flags & BTRFS_INODE_NODATASUM || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || em->block_start == EXTENT_MAP_HOLE) return 0; - /* - * Wait far any ordered extent that covers our extent map. If it - * finishes without an error, first check and see if our csums are on - * our outstanding ordered extents. - */ - list_for_each_entry(ordered, logged_list, log_list) { - struct btrfs_ordered_sum *sum; - - if (!mod_len) - break; - - if (ordered->file_offset + ordered->len <= mod_start || - mod_start + mod_len <= ordered->file_offset) - continue; - - if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && - !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && - !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { - const u64 start = ordered->file_offset; - const u64 end = ordered->file_offset + ordered->len - 1; - - WARN_ON(ordered->inode != inode); - filemap_fdatawrite_range(inode->i_mapping, start, end); - } - - wait_event(ordered->wait, - (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || - test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); - - if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { - /* - * Clear the AS_EIO/AS_ENOSPC flags from the inode's - * i_mapping flags, so that the next fsync won't get - * an outdated io error too. - */ - filemap_check_errors(inode->i_mapping); - *ordered_io_error = true; - break; - } - /* - * We are going to copy all the csums on this ordered extent, so - * go ahead and adjust mod_start and mod_len in case this - * ordered extent has already been logged. 
- */ - if (ordered->file_offset > mod_start) { - if (ordered->file_offset + ordered->len >= - mod_start + mod_len) - mod_len = ordered->file_offset - mod_start; - /* - * If we have this case - * - * |--------- logged extent ---------| - * |----- ordered extent ----| - * - * Just don't mess with mod_start and mod_len, we'll - * just end up logging more csums than we need and it - * will be ok. - */ - } else { - if (ordered->file_offset + ordered->len < - mod_start + mod_len) { - mod_len = (mod_start + mod_len) - - (ordered->file_offset + ordered->len); - mod_start = ordered->file_offset + - ordered->len; - } else { - mod_len = 0; - } - } - - if (skip_csum) - continue; - - /* - * To keep us from looping for the above case of an ordered - * extent that falls inside of the logged extent. - */ - if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, - &ordered->flags)) - continue; - - list_for_each_entry(sum, &ordered->list, list) { - ret = btrfs_csum_file_blocks(trans, log, sum); - if (ret) - break; - } - } - - if (*ordered_io_error || !mod_len || ret || skip_csum) - return ret; - + /* If we're compressed we have to save the entire range of csums. */ if (em->compress_type) { csum_offset = 0; csum_len = max(em->block_len, em->orig_block_len); } else { - csum_offset = mod_start - em->start; - csum_len = mod_len; + csum_offset = em->mod_start - em->start; + csum_len = em->mod_len; } /* block start is already adjusted for the file extent offset. 
*/ - ret = btrfs_lookup_csums_range(fs_info->csum_root, + ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, em->block_start + csum_offset, em->block_start + csum_offset + csum_len - 1, &ordered_sums, 0); @@ -4214,7 +4158,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_csum_file_blocks(trans, log, sums); + ret = btrfs_csum_file_blocks(trans, log_root, sums); list_del(&sums->list); kfree(sums); } @@ -4226,7 +4170,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_root *root, const struct extent_map *em, struct btrfs_path *path, - const struct list_head *logged_list, struct btrfs_log_ctx *ctx) { struct btrfs_root *log = root->log_root; @@ -4238,18 +4181,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans, u64 block_len; int ret; int extent_inserted = 0; - bool ordered_io_err = false; - ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, - logged_list, &ordered_io_err); + ret = log_extent_csums(trans, inode, log, em); if (ret) return ret; - if (ordered_io_err) { - ctx->io_err = -EIO; - return ctx->io_err; - } - btrfs_init_map_token(&token); ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, @@ -4424,7 +4360,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, - struct list_head *logged_list, struct btrfs_log_ctx *ctx, const u64 start, const u64 end) @@ -4480,20 +4415,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); - btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); - /* - * Some ordered extents started by fsync might have completed - * before we could collect them into the list logged_list, which - * means they're gone, not in our logged_list nor in the inode's - * ordered tree. 
We want the application/user space to know an - * error happened while attempting to persist file data so that - * it can take proper action. If such error happened, we leave - * without writing to the log tree and the fsync must report the - * file data write error and not commit the current transaction. - */ - ret = filemap_check_errors(inode->vfs_inode.i_mapping); - if (ret) - ctx->io_err = ret; process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4512,8 +4433,7 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path, logged_list, - ctx); + ret = log_one_extent(trans, inode, root, em, path, ctx); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -4712,9 +4632,7 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, if (btrfs_file_extent_type(leaf, extent) == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(leaf, - path->slots[0], - extent); + len = btrfs_file_extent_ram_bytes(leaf, extent); ASSERT(len == i_size || (len == fs_info->sectorsize && btrfs_file_extent_compression(leaf, extent) != @@ -4898,7 +4816,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = root->log_root; - LIST_HEAD(logged_list); u64 last_extent = 0; int err = 0; int ret; @@ -5094,8 +5011,7 @@ again: * we don't need to do more work nor fallback to * a transaction commit. 
*/ - if (IS_ERR(other_inode) && - PTR_ERR(other_inode) == -ENOENT) { + if (other_inode == ERR_PTR(-ENOENT)) { goto next_key; } else if (IS_ERR(other_inode)) { err = PTR_ERR(other_inode); @@ -5235,7 +5151,7 @@ log_extents: } if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - &logged_list, ctx, start, end); + ctx, start, end); if (ret) { err = ret; goto out_unlock; @@ -5286,10 +5202,6 @@ log_extents: inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); out_unlock: - if (unlikely(err)) - btrfs_put_logged_extents(&logged_list); - else - btrfs_submit_logged_extents(&logged_list, log); mutex_unlock(&inode->log_mutex); btrfs_free_path(path); @@ -5585,7 +5497,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_path *path; struct btrfs_key key; @@ -6120,7 +6032,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, struct dentry *parent) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = trans->fs_info; /* * this will force the logging code to walk the dentry chain diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e034ad9e23b4..da86706123ff 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -8,15 +8,12 @@ #include <linux/slab.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> -#include <linux/iocontext.h> -#include <linux/capability.h> #include <linux/ratelimit.h> #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> -#include <asm/div64.h> #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -634,44 +631,48 @@ static void pending_bios_fn(struct btrfs_work *work) * devices. 
*/ static void btrfs_free_stale_devices(const char *path, - struct btrfs_device *skip_dev) + struct btrfs_device *skip_device) { - struct btrfs_fs_devices *fs_devs, *tmp_fs_devs; - struct btrfs_device *dev, *tmp_dev; + struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; + struct btrfs_device *device, *tmp_device; - list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) { - - if (fs_devs->opened) + list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { + mutex_lock(&fs_devices->device_list_mutex); + if (fs_devices->opened) { + mutex_unlock(&fs_devices->device_list_mutex); continue; + } - list_for_each_entry_safe(dev, tmp_dev, - &fs_devs->devices, dev_list) { + list_for_each_entry_safe(device, tmp_device, + &fs_devices->devices, dev_list) { int not_found = 0; - if (skip_dev && skip_dev == dev) + if (skip_device && skip_device == device) continue; - if (path && !dev->name) + if (path && !device->name) continue; rcu_read_lock(); if (path) - not_found = strcmp(rcu_str_deref(dev->name), + not_found = strcmp(rcu_str_deref(device->name), path); rcu_read_unlock(); if (not_found) continue; /* delete the stale device */ - if (fs_devs->num_devices == 1) { - btrfs_sysfs_remove_fsid(fs_devs); - list_del(&fs_devs->fs_list); - free_fs_devices(fs_devs); + fs_devices->num_devices--; + list_del(&device->dev_list); + btrfs_free_device(device); + + if (fs_devices->num_devices == 0) break; - } else { - fs_devs->num_devices--; - list_del(&dev->dev_list); - btrfs_free_device(dev); - } + } + mutex_unlock(&fs_devices->device_list_mutex); + if (fs_devices->num_devices == 0) { + btrfs_sysfs_remove_fsid(fs_devices); + list_del(&fs_devices->fs_list); + free_fs_devices(fs_devices); } } } @@ -750,7 +751,8 @@ error_brelse: * error pointer when failed */ static noinline struct btrfs_device *device_list_add(const char *path, - struct btrfs_super_block *disk_super) + struct btrfs_super_block *disk_super, + bool *new_device_added) { struct btrfs_device *device; struct 
btrfs_fs_devices *fs_devices; @@ -764,21 +766,26 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); + mutex_lock(&fs_devices->device_list_mutex); list_add(&fs_devices->fs_list, &fs_uuids); device = NULL; } else { + mutex_lock(&fs_devices->device_list_mutex); device = find_device(fs_devices, devid, disk_super->dev_item.uuid); } if (!device) { - if (fs_devices->opened) + if (fs_devices->opened) { + mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); + } device = btrfs_alloc_device(NULL, &devid, disk_super->dev_item.uuid); if (IS_ERR(device)) { + mutex_unlock(&fs_devices->device_list_mutex); /* we can safely leave the fs_devices entry around */ return device; } @@ -786,17 +793,16 @@ static noinline struct btrfs_device *device_list_add(const char *path, name = rcu_string_strdup(path, GFP_NOFS); if (!name) { btrfs_free_device(device); + mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-ENOMEM); } rcu_assign_pointer(device->name, name); - mutex_lock(&fs_devices->device_list_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); fs_devices->num_devices++; - mutex_unlock(&fs_devices->device_list_mutex); device->fs_devices = fs_devices; - btrfs_free_stale_devices(path, device); + *new_device_added = true; if (disk_super->label[0]) pr_info("BTRFS: device label %s devid %llu transid %llu %s\n", @@ -840,12 +846,15 @@ static noinline struct btrfs_device *device_list_add(const char *path, * with larger generation number or the last-in if * generation are equal. 
*/ + mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EEXIST); } name = rcu_string_strdup(path, GFP_NOFS); - if (!name) + if (!name) { + mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-ENOMEM); + } rcu_string_free(device->name); rcu_assign_pointer(device->name, name); if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { @@ -865,6 +874,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, fs_devices->total_devices = btrfs_super_num_devices(disk_super); + mutex_unlock(&fs_devices->device_list_mutex); return device; } @@ -1004,7 +1014,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) blkdev_put(device->bdev, device->mode); } -static void btrfs_prepare_close_one_device(struct btrfs_device *device) +static void btrfs_close_one_device(struct btrfs_device *device) { struct btrfs_fs_devices *fs_devices = device->fs_devices; struct btrfs_device *new_device; @@ -1022,6 +1032,8 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device) if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) fs_devices->missing_devices--; + btrfs_close_bdev(device); + new_device = btrfs_alloc_device(NULL, &device->devid, device->uuid); BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ @@ -1035,39 +1047,23 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device) list_replace_rcu(&device->dev_list, &new_device->dev_list); new_device->fs_devices = device->fs_devices; + + call_rcu(&device->rcu, free_device_rcu); } static int close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; - struct list_head pending_put; - - INIT_LIST_HEAD(&pending_put); if (--fs_devices->opened > 0) return 0; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { - btrfs_prepare_close_one_device(device); - list_add(&device->dev_list, &pending_put); + btrfs_close_one_device(device); } mutex_unlock(&fs_devices->device_list_mutex); - 
/* - * btrfs_show_devname() is using the device_list_mutex, - * sometimes call to blkdev_put() leads vfs calling - * into this func. So do put outside of device_list_mutex, - * as of now. - */ - while (!list_empty(&pending_put)) { - device = list_first_entry(&pending_put, - struct btrfs_device, dev_list); - list_del(&device->dev_list); - btrfs_close_bdev(device); - call_rcu(&device->rcu, free_device_rcu); - } - WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; @@ -1146,6 +1142,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, { int ret; + lockdep_assert_held(&uuid_mutex); + mutex_lock(&fs_devices->device_list_mutex); if (fs_devices->opened) { fs_devices->opened++; @@ -1215,16 +1213,18 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, * and we are not allowed to call set_blocksize during the scan. The superblock * is read via pagecache */ -int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, - struct btrfs_fs_devices **fs_devices_ret) +struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, + void *holder) { struct btrfs_super_block *disk_super; - struct btrfs_device *device; + bool new_device_added = false; + struct btrfs_device *device = NULL; struct block_device *bdev; struct page *page; - int ret = 0; u64 bytenr; + lockdep_assert_held(&uuid_mutex); + /* * we would like to check all the supers, but that would make * a btrfs mount succeed after a mkfs from a different FS. 
@@ -1236,112 +1236,25 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, bdev = blkdev_get_by_path(path, flags, holder); if (IS_ERR(bdev)) - return PTR_ERR(bdev); + return ERR_CAST(bdev); if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { - ret = -EINVAL; + device = ERR_PTR(-EINVAL); goto error_bdev_put; } - mutex_lock(&uuid_mutex); - device = device_list_add(path, disk_super); - if (IS_ERR(device)) - ret = PTR_ERR(device); - else - *fs_devices_ret = device->fs_devices; - mutex_unlock(&uuid_mutex); + device = device_list_add(path, disk_super, &new_device_added); + if (!IS_ERR(device)) { + if (new_device_added) + btrfs_free_stale_devices(path, device); + } btrfs_release_disk_super(page); error_bdev_put: blkdev_put(bdev, flags); - return ret; -} - -/* helper to account the used device space in the range */ -int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, - u64 end, u64 *length) -{ - struct btrfs_key key; - struct btrfs_root *root = device->fs_info->dev_root; - struct btrfs_dev_extent *dev_extent; - struct btrfs_path *path; - u64 extent_end; - int ret; - int slot; - struct extent_buffer *l; - - *length = 0; - - if (start >= device->total_bytes || - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = READA_FORWARD; - - key.objectid = device->devid; - key.offset = start; - key.type = BTRFS_DEV_EXTENT_KEY; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - if (ret > 0) { - ret = btrfs_previous_item(root, path, key.objectid, key.type); - if (ret < 0) - goto out; - } - - while (1) { - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - - break; - } - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.objectid < device->devid) - goto next; - - if 
(key.objectid > device->devid) - break; - - if (key.type != BTRFS_DEV_EXTENT_KEY) - goto next; - - dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - extent_end = key.offset + btrfs_dev_extent_length(l, - dev_extent); - if (key.offset <= start && extent_end > end) { - *length = end - start + 1; - break; - } else if (key.offset <= start && extent_end > start) - *length += extent_end - start; - else if (key.offset > start && extent_end <= end) - *length += extent_end - key.offset; - else if (key.offset > start && key.offset <= end) { - *length += end - key.offset + 1; - break; - } else if (key.offset > end) - break; - -next: - path->slots[0]++; - } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return device; } static int contains_pending_extent(struct btrfs_transaction *transaction, @@ -1753,10 +1666,8 @@ error: * the btrfs_device struct should be fully filled in */ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_device *device) { - struct btrfs_root *root = fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_dev_item *dev_item; @@ -1772,8 +1683,8 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(*dev_item)); + ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, + &key, sizeof(*dev_item)); if (ret) goto out; @@ -1798,7 +1709,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); ptr = btrfs_device_fsid(dev_item); - write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE); + write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(leaf); ret = 0; @@ -1922,9 +1833,10 @@ static struct btrfs_device * btrfs_find_next_active_device( * where this function 
called, there should be always be another device (or * this_dev) which is active. */ -void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, - struct btrfs_device *device, struct btrfs_device *this_dev) +void btrfs_assign_next_active_device(struct btrfs_device *device, + struct btrfs_device *this_dev) { + struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_device *next_device; if (this_dev) @@ -2027,11 +1939,14 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, cur_devices->num_devices--; cur_devices->total_devices--; + /* Update total_devices of the parent fs_devices if it's seed */ + if (cur_devices != fs_devices) + fs_devices->total_devices--; if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) cur_devices->missing_devices--; - btrfs_assign_next_active_device(fs_info, device, NULL); + btrfs_assign_next_active_device(device, NULL); if (device->bdev) { cur_devices->open_devices--; @@ -2082,12 +1997,11 @@ error_undo: goto out; } -void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev) +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) { struct btrfs_fs_devices *fs_devices; - lockdep_assert_held(&fs_info->fs_devices->device_list_mutex); + lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); /* * in case of fs with no seed, srcdev->fs_devices will point @@ -2149,10 +2063,9 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, } } -void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *tgtdev) +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; WARN_ON(!tgtdev); mutex_lock(&fs_devices->device_list_mutex); @@ -2164,7 +2077,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, fs_devices->num_devices--; - 
btrfs_assign_next_active_device(fs_info, tgtdev, NULL); + btrfs_assign_next_active_device(tgtdev, NULL); list_del_rcu(&tgtdev->dev_list); @@ -2295,7 +2208,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&seed_devices->alloc_list); mutex_init(&seed_devices->device_list_mutex); - mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, synchronize_rcu); list_for_each_entry(device, &seed_devices->devices, dev_list) @@ -2315,7 +2228,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) generate_random_uuid(fs_devices->fsid); memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); super_flags = btrfs_super_flags(disk_super) & ~BTRFS_SUPER_FLAG_SEEDING; @@ -2405,15 +2318,16 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_trans_handle *trans; struct btrfs_device *device; struct block_device *bdev; - struct list_head *devices; struct super_block *sb = fs_info->sb; struct rcu_string *name; - u64 tmp; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 orig_super_total_bytes; + u64 orig_super_num_devices; int seeding_dev = 0; int ret = 0; bool unlocked = false; - if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) + if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, @@ -2421,7 +2335,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (IS_ERR(bdev)) return PTR_ERR(bdev); - if (fs_info->fs_devices->seeding) { + if (fs_devices->seeding) { seeding_dev = 1; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); @@ -2429,18 +2343,16 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, 
const char *device_path filemap_write_and_wait(bdev->bd_inode->i_mapping); - devices = &fs_info->fs_devices->devices; - - mutex_lock(&fs_info->fs_devices->device_list_mutex); - list_for_each_entry(device, devices, dev_list) { + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; mutex_unlock( - &fs_info->fs_devices->device_list_mutex); + &fs_devices->device_list_mutex); goto error; } } - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); device = btrfs_alloc_device(fs_info, NULL, NULL); if (IS_ERR(device)) { @@ -2489,33 +2401,34 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } } - device->fs_devices = fs_info->fs_devices; + device->fs_devices = fs_devices; - mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); - list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices); - list_add(&device->dev_alloc_list, - &fs_info->fs_devices->alloc_list); - fs_info->fs_devices->num_devices++; - fs_info->fs_devices->open_devices++; - fs_info->fs_devices->rw_devices++; - fs_info->fs_devices->total_devices++; - fs_info->fs_devices->total_rw_bytes += device->total_bytes; + list_add_rcu(&device->dev_list, &fs_devices->devices); + list_add(&device->dev_alloc_list, &fs_devices->alloc_list); + fs_devices->num_devices++; + fs_devices->open_devices++; + fs_devices->rw_devices++; + fs_devices->total_devices++; + fs_devices->total_rw_bytes += device->total_bytes; atomic64_add(device->total_bytes, &fs_info->free_chunk_space); if (!blk_queue_nonrot(q)) - fs_info->fs_devices->rotating = 1; + fs_devices->rotating = 1; - tmp = btrfs_super_total_bytes(fs_info->super_copy); + orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); btrfs_set_super_total_bytes(fs_info->super_copy, - round_down(tmp + 
device->total_bytes, fs_info->sectorsize)); + round_down(orig_super_total_bytes + device->total_bytes, + fs_info->sectorsize)); - tmp = btrfs_super_num_devices(fs_info->super_copy); - btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1); + orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); + btrfs_set_super_num_devices(fs_info->super_copy, + orig_super_num_devices + 1); /* add sysfs device entry */ - btrfs_sysfs_add_device_link(fs_info->fs_devices, device); + btrfs_sysfs_add_device_link(fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2524,7 +2437,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); if (seeding_dev) { mutex_lock(&fs_info->chunk_mutex); @@ -2536,7 +2449,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } } - ret = btrfs_add_dev_item(trans, fs_info, device); + ret = btrfs_add_dev_item(trans, device); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; @@ -2556,7 +2469,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_info->fsid); - if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf)) + if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_info, "sysfs: failed to create fsid for sprout"); } @@ -2591,7 +2504,23 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_sysfs: - btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(fs_devices, device); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_info->chunk_mutex); + list_del_rcu(&device->dev_list); + list_del(&device->dev_alloc_list); + 
fs_info->fs_devices->num_devices--; + fs_info->fs_devices->open_devices--; + fs_info->fs_devices->rw_devices--; + fs_info->fs_devices->total_devices--; + fs_info->fs_devices->total_rw_bytes -= device->total_bytes; + atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); + btrfs_set_super_total_bytes(fs_info->super_copy, + orig_super_total_bytes); + btrfs_set_super_num_devices(fs_info->super_copy, + orig_super_num_devices); + mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); error_trans: if (seeding_dev) sb->s_flags |= SB_RDONLY; @@ -2695,9 +2624,9 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, return btrfs_update_device(trans, device); } -static int btrfs_free_chunk(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 chunk_offset) +static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; int ret; struct btrfs_path *path; @@ -2806,9 +2735,9 @@ static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, return em; } -int btrfs_remove_chunk(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 chunk_offset) +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_map *em; struct map_lookup *map; u64 dev_extent_len = 0; @@ -2827,7 +2756,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, } map = em->map_lookup; mutex_lock(&fs_info->chunk_mutex); - check_system_chunk(trans, fs_info, map->type); + check_system_chunk(trans, map->type); mutex_unlock(&fs_info->chunk_mutex); /* @@ -2867,7 +2796,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, } mutex_unlock(&fs_devices->device_list_mutex); - ret = btrfs_free_chunk(trans, fs_info, chunk_offset); + ret = btrfs_free_chunk(trans, chunk_offset); if (ret) { btrfs_abort_transaction(trans, ret); 
goto out; @@ -2883,7 +2812,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, } } - ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em); + ret = btrfs_remove_block_group(trans, chunk_offset, em); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -2948,7 +2877,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) * step two, delete the device extents and the * chunk tree entries */ - ret = btrfs_remove_chunk(trans, fs_info, chunk_offset); + ret = btrfs_remove_chunk(trans, chunk_offset); btrfs_end_transaction(trans); return ret; } @@ -3057,7 +2986,7 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_force_chunk_alloc(trans, fs_info, + ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); btrfs_end_transaction(trans); if (ret < 0) @@ -4690,7 +4619,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = SZ_1G; - max_chunk_size = 10 * max_stripe_size; + max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; if (!devs_max) devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { @@ -4898,7 +4827,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, refcount_inc(&em->refs); write_unlock(&em_tree->lock); - ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes); + ret = btrfs_make_block_group(trans, 0, type, start, num_bytes); if (ret) goto error_del_extent; @@ -4932,9 +4861,9 @@ error: } int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - u64 chunk_offset, u64 chunk_size) + u64 chunk_offset, u64 chunk_size) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_root *chunk_root = fs_info->chunk_root; struct btrfs_key key; @@ -5036,13 +4965,12 @@ out: * require modifying the chunk tree. 
This division is important for the * bootstrap process of adding storage to a seed btrfs. */ -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 type) +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) { u64 chunk_offset; - lockdep_assert_held(&fs_info->chunk_mutex); - chunk_offset = find_next_chunk(fs_info); + lockdep_assert_held(&trans->fs_info->chunk_mutex); + chunk_offset = find_next_chunk(trans->fs_info); return __btrfs_alloc_chunk(trans, chunk_offset, type); } @@ -5173,7 +5101,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) /* * There could be two corrupted data stripes, we need * to loop retry in order to rebuild the correct data. - * + * * Fail a stripe at a time on every retry except the * stripe under reconstruction. */ @@ -6185,21 +6113,11 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, btrfs_io_bio(bio)->stripe_index = dev_nr; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; -#ifdef DEBUG - { - struct rcu_string *name; - - rcu_read_lock(); - name = rcu_dereference(dev->name); - btrfs_debug(fs_info, - "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - bio_op(bio), bio->bi_opf, - (u64)bio->bi_iter.bi_sector, - (u_long)dev->bdev->bd_dev, name->str, dev->devid, - bio->bi_iter.bi_size); - rcu_read_unlock(); - } -#endif + btrfs_debug_in_rcu(fs_info, + "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", + bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, + (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, + bio->bi_iter.bi_size); bio_set_dev(bio, dev->bdev); btrfs_bio_counter_inc_noblocked(fs_info); @@ -6401,6 +6319,8 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, u16 num_stripes; u16 sub_stripes; u64 type; + u64 features; + bool mixed = false; length = btrfs_chunk_length(leaf, chunk); stripe_len = btrfs_chunk_stripe_len(leaf, chunk); @@ 
-6439,6 +6359,32 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, btrfs_chunk_type(leaf, chunk)); return -EIO; } + + if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { + btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type); + return -EIO; + } + + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && + (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { + btrfs_err(fs_info, + "system chunk with data or metadata type: 0x%llx", type); + return -EIO; + } + + features = btrfs_super_incompat_flags(fs_info->super_copy); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = true; + + if (!mixed) { + if ((type & BTRFS_BLOCK_GROUP_METADATA) && + (type & BTRFS_BLOCK_GROUP_DATA)) { + btrfs_err(fs_info, + "mixed chunk type in non-mixed mode: 0x%llx", type); + return -EIO; + } + } + if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || @@ -6525,6 +6471,7 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); map->type = btrfs_chunk_type(leaf, chunk); map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + map->verified_stripes = 0; for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -6561,10 +6508,14 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, write_lock(&map_tree->map_tree.lock); ret = add_extent_mapping(&map_tree->map_tree, em, 0); write_unlock(&map_tree->map_tree.lock); - BUG_ON(ret); /* Tree corruption */ + if (ret < 0) { + btrfs_err(fs_info, + "failed to add chunk map, start=%llu len=%llu: %d", + em->start, em->len, ret); + } free_extent_map(em); - return 0; + return ret; } static void fill_device_from_item(struct extent_buffer *leaf, @@ -7106,9 +7057,9 @@ out: } static int update_dev_stat_item(struct btrfs_trans_handle *trans, - struct 
btrfs_fs_info *fs_info, struct btrfs_device *device) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_path *path; struct btrfs_key key; @@ -7201,7 +7152,7 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, */ smp_rmb(); - ret = update_dev_stat_item(trans, fs_info, device); + ret = update_dev_stat_item(trans, device); if (!ret) atomic_sub(stats_cnt, &device->dev_stats_ccnt); } @@ -7380,3 +7331,197 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) fs_devices = fs_devices->seed; } } + +/* + * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. + */ +int btrfs_bg_type_to_factor(u64 flags) +{ + if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + return 2; + return 1; +} + + +static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) +{ + int index = btrfs_bg_flags_to_raid_index(type); + int ncopies = btrfs_raid_array[index].ncopies; + int data_stripes; + + switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + case BTRFS_BLOCK_GROUP_RAID5: + data_stripes = num_stripes - 1; + break; + case BTRFS_BLOCK_GROUP_RAID6: + data_stripes = num_stripes - 2; + break; + default: + data_stripes = num_stripes / ncopies; + break; + } + return div_u64(chunk_len, data_stripes); +} + +static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, + u64 chunk_offset, u64 devid, + u64 physical_offset, u64 physical_len) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 stripe_len; + bool found = false; + int ret = 0; + int i; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_err(fs_info, +"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", + physical_offset, devid); + ret = -EUCLEAN; + goto out; + } + + map = em->map_lookup; + stripe_len = 
calc_stripe_length(map->type, em->len, map->num_stripes); + if (physical_len != stripe_len) { + btrfs_err(fs_info, +"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", + physical_offset, devid, em->start, physical_len, + stripe_len); + ret = -EUCLEAN; + goto out; + } + + for (i = 0; i < map->num_stripes; i++) { + if (map->stripes[i].dev->devid == devid && + map->stripes[i].physical == physical_offset) { + found = true; + if (map->verified_stripes >= map->num_stripes) { + btrfs_err(fs_info, + "too many dev extents for chunk %llu found", + em->start); + ret = -EUCLEAN; + goto out; + } + map->verified_stripes++; + break; + } + } + if (!found) { + btrfs_err(fs_info, + "dev extent physical offset %llu devid %llu has no corresponding chunk", + physical_offset, devid); + ret = -EUCLEAN; + } +out: + free_extent_map(em); + return ret; +} + +static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map *em; + struct rb_node *node; + int ret = 0; + + read_lock(&em_tree->lock); + for (node = rb_first(&em_tree->map); node; node = rb_next(node)) { + em = rb_entry(node, struct extent_map, rb_node); + if (em->map_lookup->num_stripes != + em->map_lookup->verified_stripes) { + btrfs_err(fs_info, + "chunk %llu has missing dev extent, have %d expect %d", + em->start, em->map_lookup->verified_stripes, + em->map_lookup->num_stripes); + ret = -EUCLEAN; + goto out; + } + } +out: + read_unlock(&em_tree->lock); + return ret; +} + +/* + * Ensure that all dev extents are mapped to correct chunk, otherwise + * later chunk allocation/free would cause unexpected behavior. + * + * NOTE: This will iterate through the whole device tree, which should be of + * the same size level as the chunk tree. This slightly increases mount time. 
+ */ +int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) +{ + struct btrfs_path *path; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_key key; + int ret = 0; + + key.objectid = 1; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = READA_FORWARD; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + /* No dev extents at all? Not good */ + if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + while (1) { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_dev_extent *dext; + int slot = path->slots[0]; + u64 chunk_offset; + u64 physical_offset; + u64 physical_len; + u64 devid; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.type != BTRFS_DEV_EXTENT_KEY) + break; + devid = key.objectid; + physical_offset = key.offset; + + dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); + physical_len = btrfs_dev_extent_length(leaf, dext); + + ret = verify_one_dev_extent(fs_info, chunk_offset, devid, + physical_offset, physical_len); + if (ret < 0) + goto out; + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + break; + } + } + + /* Ensure all chunks have corresponding dev extents */ + ret = verify_chunk_dev_extent_mapping(fs_info); +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5139ec8daf4c..23e9285d88de 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -11,6 +11,8 @@ #include <linux/btrfs.h> #include "async-thread.h" +#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) + extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN SZ_64K @@ -343,6 +345,7 @@ struct map_lookup { u64 stripe_len; int num_stripes; int sub_stripes; + 
int verified_stripes; /* For mount time dev extent verification */ struct btrfs_bio_stripe stripes[]; }; @@ -382,8 +385,6 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) } } -int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, - u64 end, u64 *length); void btrfs_get_bbio(struct btrfs_bio *bbio); void btrfs_put_bbio(struct btrfs_bio *bbio); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, @@ -396,20 +397,19 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 **logical, int *naddrs, int *stripe_len); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 type); +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num, int async_submit); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); -int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, - struct btrfs_fs_devices **fs_devices_ret); +struct btrfs_device *btrfs_scan_one_device(const char *path, + fmode_t flags, void *holder); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); -void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, - struct btrfs_device *device, struct btrfs_device *this_dev); +void btrfs_assign_next_active_device(struct btrfs_device *device, + struct btrfs_device *this_dev); int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, const char *device_path, struct btrfs_device **device); @@ -453,22 +453,18 @@ void btrfs_init_devices_late(struct btrfs_fs_info 
*fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev); +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev); -void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *tgtdev); +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - u64 chunk_offset, u64 chunk_size); -int btrfs_remove_chunk(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 chunk_offset); + u64 chunk_offset, u64 chunk_size); +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) @@ -560,4 +556,7 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); +int btrfs_bg_type_to_factor(u64 flags); +int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); + #endif diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index d9f001078e08..4a717d400807 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -218,7 +218,8 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) "%s", fsdef->dentry->d_sb->s_id); - fscache_object_init(&fsdef->fscache, NULL, &cache->cache); + fscache_object_init(&fsdef->fscache, &fscache_fsdef_index, + &cache->cache); ret = 
fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag); if (ret < 0) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index ab0bbe93b398..af2b17b21b94 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -186,12 +186,12 @@ try_again: * need to wait for it to be destroyed */ wait_for_old_object: trace_cachefiles_wait_active(object, dentry, xobject); + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); if (fscache_object_is_live(&xobject->fscache)) { pr_err("\n"); pr_err("Error: Unexpected object collision\n"); cachefiles_printk_object(object, xobject); - BUG(); } atomic_inc(&xobject->usage); write_unlock(&cache->active_lock); @@ -248,7 +248,6 @@ wait_for_old_object: goto try_again; requeue: - clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_timeo); _leave(" = -ETIMEDOUT"); return -ETIMEDOUT; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 5082c8a49686..40f7595aad10 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -27,6 +27,7 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, struct cachefiles_one_read *monitor = container_of(wait, struct cachefiles_one_read, monitor); struct cachefiles_object *object; + struct fscache_retrieval *op = monitor->op; struct wait_bit_key *key = _key; struct page *page = wait->private; @@ -51,16 +52,22 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, list_del(&wait->entry); /* move onto the action list and queue for FS-Cache thread pool */ - ASSERT(monitor->op); + ASSERT(op); - object = container_of(monitor->op->op.object, - struct cachefiles_object, fscache); + /* We need to temporarily bump the usage count as we don't own a ref + * here otherwise cachefiles_read_copier() may free the op between the + * monitor being enqueued on the op->to_do list and the op getting + * enqueued on the work queue. 
+ */ + fscache_get_retrieval(op); + object = container_of(op->op.object, struct cachefiles_object, fscache); spin_lock(&object->work_lock); - list_add_tail(&monitor->op_link, &monitor->op->to_do); + list_add_tail(&monitor->op_link, &op->to_do); spin_unlock(&object->work_lock); - fscache_enqueue_retrieval(monitor->op); + fscache_enqueue_retrieval(op); + fscache_put_retrieval(op); return 0; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ad0bed99b1d5..e2679e8a2535 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -429,8 +429,7 @@ out: * file or symlink, return 1 so the VFS can retry. */ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned flags, umode_t mode, - int *opened) + struct file *file, unsigned flags, umode_t mode) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -507,9 +506,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, dout("atomic_open finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { ceph_init_inode_acls(d_inode(dentry), &acls); - *opened |= FILE_CREATED; + file->f_mode |= FMODE_CREATED; } - err = finish_open(file, dentry, ceph_open, opened); + err = finish_open(file, dentry, ceph_open); } out_req: if (!req->r_err && req->r_target_inode) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ee764ac352ab..a866be999216 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1135,6 +1135,7 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in) if (IS_ERR(realdn)) { pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", PTR_ERR(realdn), dn, in, ceph_vinop(in)); + dput(dn); dn = realdn; /* note realdn contains the error */ goto out; } else if (realdn) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a7077a0c989f..971328b99ede 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1025,8 +1025,7 @@ extern const struct file_operations ceph_file_fops; 
extern int ceph_renew_caps(struct inode *inode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned flags, umode_t mode, - int *opened); + struct file *file, unsigned flags, umode_t mode); extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 116146022aa1..bfe999505815 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -126,6 +126,25 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_putc(m, '\n'); } +static void +cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) +{ + struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; + struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr; + + seq_printf(m, "\t\tSpeed: %zu bps\n", iface->speed); + seq_puts(m, "\t\tCapabilities: "); + if (iface->rdma_capable) + seq_puts(m, "rdma "); + if (iface->rss_capable) + seq_puts(m, "rss "); + seq_putc(m, '\n'); + if (iface->sockaddr.ss_family == AF_INET) + seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr); + else if (iface->sockaddr.ss_family == AF_INET6) + seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr); +} + static int cifs_debug_data_proc_show(struct seq_file *m, void *v) { struct list_head *tmp1, *tmp2, *tmp3; @@ -312,6 +331,16 @@ skip_rdma: mid_entry->mid); } spin_unlock(&GlobalMid_Lock); + + spin_lock(&ses->iface_lock); + if (ses->iface_count) + seq_printf(m, "\n\tServer interfaces: %zu\n", + ses->iface_count); + for (j = 0; j < ses->iface_count; j++) { + seq_printf(m, "\t%d)\n", j); + cifs_dump_iface(m, &ses->iface_list[j]); + } + spin_unlock(&ses->iface_lock); } } spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 937251cc61c0..ee2a8ec70056 100644 --- 
a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -37,7 +37,6 @@ #include <crypto/aead.h> int __cifs_calc_signature(struct smb_rqst *rqst, - int start, struct TCP_Server_Info *server, char *signature, struct shash_desc *shash) { @@ -45,16 +44,27 @@ int __cifs_calc_signature(struct smb_rqst *rqst, int rc; struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; + int is_smb2 = server->vals->header_preamble_size == 0; - for (i = start; i < n_vec; i++) { + /* iov[0] is actual data and not the rfc1002 length for SMB2+ */ + if (is_smb2) { + if (iov[0].iov_len <= 4) + return -EIO; + i = 0; + } else { + if (n_vec < 2 || iov[0].iov_len != 4) + return -EIO; + i = 1; /* skip rfc1002 length */ + } + + for (; i < n_vec; i++) { if (iov[i].iov_len == 0) continue; if (iov[i].iov_base == NULL) { cifs_dbg(VFS, "null iovec entry\n"); return -EIO; } - if (i == 1 && iov[1].iov_len <= 4) - break; /* nothing to sign or corrupt header */ + rc = crypto_shash_update(shash, iov[i].iov_base, iov[i].iov_len); if (rc) { @@ -118,7 +128,7 @@ static int cifs_calc_signature(struct smb_rqst *rqst, return rc; } - return __cifs_calc_signature(rqst, 1, server, signature, + return __cifs_calc_signature(rqst, server, signature, &server->secmech.sdescmd5->shash); } diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 5f0231803431..f3a78efc3109 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -65,8 +65,7 @@ extern struct inode *cifs_root_iget(struct super_block *); extern int cifs_create(struct inode *, struct dentry *, umode_t, bool excl); extern int cifs_atomic_open(struct inode *, struct dentry *, - struct file *, unsigned, umode_t, - int *); + struct file *, unsigned, umode_t); extern struct dentry *cifs_lookup(struct inode *, struct dentry *, unsigned int); extern int cifs_unlink(struct inode *dir, struct dentry *dentry); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1efa2e65bc1a..c923c7854027 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -33,6 +33,9 @@ 
#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ +#define CIFS_PORT 445 +#define RFC1001_PORT 139 + /* * The sizes of various internal tables and strings */ @@ -312,6 +315,10 @@ struct smb_version_operations { /* send echo request */ int (*echo)(struct TCP_Server_Info *); /* create directory */ + int (*posix_mkdir)(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, + const char *full_path, + struct cifs_sb_info *cifs_sb); int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *, struct cifs_sb_info *); /* set info on created directory */ @@ -416,7 +423,7 @@ struct smb_version_operations { void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int, bool *); /* create lease context buffer for CREATE request */ - char * (*create_lease_buf)(u8 *, u8); + char * (*create_lease_buf)(u8 *lease_key, u8 oplock); /* parse lease context buffer and return oplock/epoch info */ __u8 (*parse_lease_buf)(void *buf, unsigned int *epoch, char *lkey); ssize_t (*copychunk_range)(const unsigned int, @@ -838,6 +845,13 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) #endif +struct cifs_server_iface { + size_t speed; + unsigned int rdma_capable : 1; + unsigned int rss_capable : 1; + struct sockaddr_storage sockaddr; +}; + /* * Session structure. One of these for each uid session with a particular host */ @@ -875,6 +889,20 @@ struct cifs_ses { #ifdef CONFIG_CIFS_SMB311 __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; #endif /* 3.1.1 */ + + /* + * Network interfaces available on the server this session is + * connected to. + * + * Other channels can be opened by connecting and binding this + * session to interfaces from this list. 
+ * + * iface_lock should be taken when accessing any of these fields + */ + spinlock_t iface_lock; + struct cifs_server_iface *iface_list; + size_t iface_count; + unsigned long iface_last_update; /* jiffies */ }; static inline bool @@ -883,6 +911,14 @@ cap_unix(struct cifs_ses *ses) return ses->server->vals->cap_unix & ses->capabilities; } +struct cached_fid { + bool is_valid:1; /* Do we have a useable root fid */ + struct cifs_fid *fid; + struct mutex fid_mutex; + struct cifs_tcon *tcon; + struct work_struct lease_break; +}; + /* * there is one of these for each connection to a resource on a particular * session @@ -987,9 +1023,7 @@ struct cifs_tcon { struct fscache_cookie *fscache; /* cookie for share */ #endif struct list_head pending_opens; /* list of incomplete opens */ - bool valid_root_fid:1; /* Do we have a useable root fid */ - struct mutex prfid_mutex; /* prevents reopen race after dead ses*/ - struct cifs_fid *prfid; /* handle to the directory at top of share */ + struct cached_fid crfid; /* Cached root fid */ /* BB add field for back pointer to sb struct(s)? 
*/ }; @@ -1382,6 +1416,7 @@ typedef int (mid_handle_t)(struct TCP_Server_Info *server, /* one of these for every pending CIFS request to the server */ struct mid_q_entry { struct list_head qhead; /* mids waiting on reply from this server */ + struct kref refcount; struct TCP_Server_Info *server; /* server corresponding to this mid */ __u64 mid; /* multiplex id */ __u32 pid; /* process id */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 4e0d183c3d10..1890f534c88b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -82,6 +82,7 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server); extern void DeleteMidQEntry(struct mid_q_entry *midEntry); extern void cifs_delete_mid(struct mid_q_entry *mid); +extern void cifs_mid_q_entry_release(struct mid_q_entry *midEntry); extern void cifs_wake_up_task(struct mid_q_entry *mid); extern int cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid); @@ -112,10 +113,6 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, struct kvec *, int /* nvec to send */, int * /* type of buf returned */, const int flags, struct kvec * /* resp vec */); -extern int smb2_send_recv(const unsigned int xid, struct cifs_ses *pses, - struct kvec *pkvec, int nvec_to_send, - int *pbuftype, const int flags, - struct kvec *presp); extern int SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *ptcon, struct smb_hdr *in_buf , @@ -544,7 +541,7 @@ int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_written); -int __cifs_calc_signature(struct smb_rqst *rqst, int start, +int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, char *signature, struct shash_desc *shash); enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, @@ -552,6 +549,7 @@ enum securityEnum cifs_select_sectype(struct 
TCP_Server_Info *, struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); void cifs_aio_ctx_release(struct kref *refcount); int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); +void smb2_cached_lease_break(struct work_struct *work); int cifs_alloc_hash(const char *name, struct crypto_shash **shash, struct sdesc **sdesc); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 42329b25877d..93408eab92e7 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -107,10 +107,10 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) } spin_unlock(&tcon->open_file_lock); - mutex_lock(&tcon->prfid_mutex); - tcon->valid_root_fid = false; - memset(tcon->prfid, 0, sizeof(struct cifs_fid)); - mutex_unlock(&tcon->prfid_mutex); + mutex_lock(&tcon->crfid.fid_mutex); + tcon->crfid.is_valid = false; + memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); + mutex_unlock(&tcon->crfid.fid_mutex); /* * BB Add call to invalidate_inodes(sb) for all superblocks mounted @@ -157,8 +157,14 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) * greater than cifs socket timeout which is 7 seconds */ while (server->tcpStatus == CifsNeedReconnect) { - wait_event_interruptible_timeout(server->response_q, - (server->tcpStatus != CifsNeedReconnect), 10 * HZ); + rc = wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus != CifsNeedReconnect), + 10 * HZ); + if (rc < 0) { + cifs_dbg(FYI, "%s: aborting reconnect due to a received" + " signal by the process\n", __func__); + return -ERESTARTSYS; + } /* are we still trying to reconnect? 
*/ if (server->tcpStatus != CifsNeedReconnect) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 96645a7d8f27..5df2c0698cda 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -57,9 +57,6 @@ #include "smb2proto.h" #include "smbdirect.h" -#define CIFS_PORT 445 -#define RFC1001_PORT 139 - extern mempool_t *cifs_req_poolp; extern bool disable_legacy_dialects; @@ -927,6 +924,7 @@ next_pdu: server->pdu_size = next_offset; } + mid_entry = NULL; if (server->ops->is_transform_hdr && server->ops->receive_transform && server->ops->is_transform_hdr(buf)) { @@ -941,8 +939,11 @@ next_pdu: length = mid_entry->receive(server, mid_entry); } - if (length < 0) + if (length < 0) { + if (mid_entry) + cifs_mid_q_entry_release(mid_entry); continue; + } if (server->large_buf) buf = server->bigbuf; @@ -959,6 +960,8 @@ next_pdu: if (!mid_entry->multiRsp || mid_entry->multiEnd) mid_entry->callback(mid_entry); + + cifs_mid_q_entry_release(mid_entry); } else if (server->ops->is_oplock_break && server->ops->is_oplock_break(buf, server)) { cifs_dbg(FYI, "Received oplock break\n"); @@ -3029,8 +3032,11 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) #ifdef CONFIG_CIFS_SMB311 if ((volume_info->linux_ext) && (ses->server->posix_ext_supported)) { - if (ses->server->vals->protocol_id == SMB311_PROT_ID) + if (ses->server->vals->protocol_id == SMB311_PROT_ID) { tcon->posix_extensions = true; + printk_once(KERN_WARNING + "SMB3.11 POSIX Extensions are experimental\n"); + } } #endif /* 311 */ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index ddae52bd1993..3713d22b95a7 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -465,8 +465,7 @@ out_err: int cifs_atomic_open(struct inode *inode, struct dentry *direntry, - struct file *file, unsigned oflags, umode_t mode, - int *opened) + struct file *file, unsigned oflags, umode_t mode) { int rc; unsigned int xid; @@ -539,9 +538,9 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, } if ((oflags & (O_CREAT | O_EXCL)) == 
(O_CREAT | O_EXCL)) - *opened |= FILE_CREATED; + file->f_mode |= FMODE_CREATED; - rc = finish_open(file, direntry, generic_file_open, opened); + rc = finish_open(file, direntry, generic_file_open); if (rc) { if (server->ops->close) server->ops->close(xid, tcon, &fid); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f4697f548a39..a2cfb33e85c1 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1575,6 +1575,17 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) goto mkdir_out; } + server = tcon->ses->server; + +#ifdef CONFIG_CIFS_SMB311 + if ((server->ops->posix_mkdir) && (tcon->posix_extensions)) { + rc = server->ops->posix_mkdir(xid, inode, mode, tcon, full_path, + cifs_sb); + d_drop(direntry); /* for time being always refresh inode info */ + goto mkdir_out; + } +#endif /* SMB311 */ + if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = cifs_posix_mkdir(inode, direntry, mode, full_path, cifs_sb, @@ -1583,8 +1594,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) goto mkdir_out; } - server = tcon->ses->server; - if (!server->ops->mkdir) { rc = -ENOSYS; goto mkdir_out; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index af29ade195c0..53e8362cbc4a 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -82,6 +82,7 @@ sesInfoAlloc(void) INIT_LIST_HEAD(&ret_buf->smb_ses_list); INIT_LIST_HEAD(&ret_buf->tcon_list); mutex_init(&ret_buf->session_mutex); + spin_lock_init(&ret_buf->iface_lock); } return ret_buf; } @@ -102,6 +103,7 @@ sesInfoFree(struct cifs_ses *buf_to_free) kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); kzfree(buf_to_free->auth_key.response); + kfree(buf_to_free->iface_list); kzfree(buf_to_free); } @@ -117,8 +119,9 @@ tconInfoAlloc(void) INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); spin_lock_init(&ret_buf->open_file_lock); - mutex_init(&ret_buf->prfid_mutex); - ret_buf->prfid = kzalloc(sizeof(struct 
cifs_fid), GFP_KERNEL); + mutex_init(&ret_buf->crfid.fid_mutex); + ret_buf->crfid.fid = kzalloc(sizeof(struct cifs_fid), + GFP_KERNEL); #ifdef CONFIG_CIFS_STATS spin_lock_init(&ret_buf->stat_lock); #endif @@ -136,7 +139,7 @@ tconInfoFree(struct cifs_tcon *buf_to_free) atomic_dec(&tconInfoAllocCount); kfree(buf_to_free->nativeFileSystem); kzfree(buf_to_free->password); - kfree(buf_to_free->prfid); + kfree(buf_to_free->crfid.fid); kfree(buf_to_free); } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index aff8ce8ba34d..646dcd149de1 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -107,6 +107,7 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer) if (compare_mid(mid->mid, buf) && mid->mid_state == MID_REQUEST_SUBMITTED && le16_to_cpu(mid->command) == buf->Command) { + kref_get(&mid->refcount); spin_unlock(&GlobalMid_Lock); return mid; } diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 788412675723..4ed10dd086e6 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -41,7 +41,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, int rc; __le16 *smb2_path; struct smb2_file_all_info *smb2_data = NULL; - __u8 smb2_oplock[17]; + __u8 smb2_oplock; struct cifs_fid *fid = oparms->fid; struct network_resiliency_req nr_ioctl_req; @@ -59,12 +59,9 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, } oparms->desired_access |= FILE_READ_ATTRIBUTES; - *smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; + smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; - if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) - memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE); - - rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data, NULL, + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, NULL); if (rc) goto out; @@ -101,7 +98,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, move_smb2_info_to_cifs(buf, smb2_data); } - *oplock = *smb2_oplock; + *oplock = 
smb2_oplock; out: kfree(smb2_data); kfree(smb2_path); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index e2bec47c6845..3ff7cec2da81 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -454,7 +454,8 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb) #ifdef CONFIG_CIFS_SMB311 /* SMB311 POSIX extensions paths do not include leading slash */ else if (cifs_sb_master_tlink(cifs_sb) && - cifs_sb_master_tcon(cifs_sb)->posix_extensions) { + cifs_sb_master_tcon(cifs_sb)->posix_extensions && + (from[0] == '/')) { start_of_path = from + 1; } #endif /* 311 */ @@ -492,10 +493,11 @@ cifs_ses_oplock_break(struct work_struct *work) { struct smb2_lease_break_work *lw = container_of(work, struct smb2_lease_break_work, lease_break); - int rc; + int rc = 0; rc = SMB2_lease_break(0, tlink_tcon(lw->tlink), lw->lease_key, lw->lease_state); + cifs_dbg(FYI, "Lease release rc %d\n", rc); cifs_put_tlink(lw->tlink); kfree(lw); @@ -561,6 +563,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, open->oplock = lease_state; } + return found; } @@ -603,6 +606,18 @@ smb2_is_valid_lease_break(char *buffer) return true; } spin_unlock(&tcon->open_file_lock); + + if (tcon->crfid.is_valid && + !memcmp(rsp->LeaseKey, + tcon->crfid.fid->lease_key, + SMB2_LEASE_KEY_SIZE)) { + INIT_WORK(&tcon->crfid.lease_break, + smb2_cached_lease_break); + queue_work(cifsiod_wq, + &tcon->crfid.lease_break); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } } } } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index b15f5957d645..ea92a38b2f08 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -203,6 +203,7 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) if ((mid->mid == wire_mid) && (mid->mid_state == MID_REQUEST_SUBMITTED) && (mid->command == shdr->Command)) { + kref_get(&mid->refcount); spin_unlock(&GlobalMid_Lock); return mid; } @@ -294,34 +295,191 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) 
return rsize; } -#ifdef CONFIG_CIFS_STATS2 + +static int +parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, + size_t buf_len, + struct cifs_server_iface **iface_list, + size_t *iface_count) +{ + struct network_interface_info_ioctl_rsp *p; + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + struct iface_info_ipv4 *p4; + struct iface_info_ipv6 *p6; + struct cifs_server_iface *info; + ssize_t bytes_left; + size_t next = 0; + int nb_iface = 0; + int rc = 0; + + *iface_list = NULL; + *iface_count = 0; + + /* + * Fist pass: count and sanity check + */ + + bytes_left = buf_len; + p = buf; + while (bytes_left >= sizeof(*p)) { + nb_iface++; + next = le32_to_cpu(p->Next); + if (!next) { + bytes_left -= sizeof(*p); + break; + } + p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); + bytes_left -= next; + } + + if (!nb_iface) { + cifs_dbg(VFS, "%s: malformed interface info\n", __func__); + rc = -EINVAL; + goto out; + } + + if (bytes_left || p->Next) + cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); + + + /* + * Second pass: extract info to internal structure + */ + + *iface_list = kcalloc(nb_iface, sizeof(**iface_list), GFP_KERNEL); + if (!*iface_list) { + rc = -ENOMEM; + goto out; + } + + info = *iface_list; + bytes_left = buf_len; + p = buf; + while (bytes_left >= sizeof(*p)) { + info->speed = le64_to_cpu(p->LinkSpeed); + info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE); + info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE); + + cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count); + cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); + cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__, + le32_to_cpu(p->Capability)); + + switch (p->Family) { + /* + * The kernel and wire socket structures have the same + * layout and use network byte order but make the + * conversion explicit in case either one changes. 
+ */ + case INTERNETWORK: + addr4 = (struct sockaddr_in *)&info->sockaddr; + p4 = (struct iface_info_ipv4 *)p->Buffer; + addr4->sin_family = AF_INET; + memcpy(&addr4->sin_addr, &p4->IPv4Address, 4); + + /* [MS-SMB2] 2.2.32.5.1.1 Clients MUST ignore these */ + addr4->sin_port = cpu_to_be16(CIFS_PORT); + + cifs_dbg(FYI, "%s: ipv4 %pI4\n", __func__, + &addr4->sin_addr); + break; + case INTERNETWORKV6: + addr6 = (struct sockaddr_in6 *)&info->sockaddr; + p6 = (struct iface_info_ipv6 *)p->Buffer; + addr6->sin6_family = AF_INET6; + memcpy(&addr6->sin6_addr, &p6->IPv6Address, 16); + + /* [MS-SMB2] 2.2.32.5.1.2 Clients MUST ignore these */ + addr6->sin6_flowinfo = 0; + addr6->sin6_scope_id = 0; + addr6->sin6_port = cpu_to_be16(CIFS_PORT); + + cifs_dbg(FYI, "%s: ipv6 %pI6\n", __func__, + &addr6->sin6_addr); + break; + default: + cifs_dbg(VFS, + "%s: skipping unsupported socket family\n", + __func__); + goto next_iface; + } + + (*iface_count)++; + info++; +next_iface: + next = le32_to_cpu(p->Next); + if (!next) + break; + p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); + bytes_left -= next; + } + + if (!*iface_count) { + rc = -EINVAL; + goto out; + } + +out: + if (rc) { + kfree(*iface_list); + *iface_count = 0; + *iface_list = NULL; + } + return rc; +} + + static int SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) { int rc; unsigned int ret_data_len = 0; - struct network_interface_info_ioctl_rsp *out_buf; + struct network_interface_info_ioctl_rsp *out_buf = NULL; + struct cifs_server_iface *iface_list; + size_t iface_count; + struct cifs_ses *ses = tcon->ses; rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, NULL /* no data input */, 0 /* no data input */, (char **)&out_buf, &ret_data_len); - if (rc != 0) + if (rc != 0) { cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc); - else if (ret_data_len < sizeof(struct network_interface_info_ioctl_rsp)) { - cifs_dbg(VFS, 
"server returned bad net interface info buf\n"); - rc = -EINVAL; - } else { - /* Dump info on first interface */ - cifs_dbg(FYI, "Adapter Capability 0x%x\t", - le32_to_cpu(out_buf->Capability)); - cifs_dbg(FYI, "Link Speed %lld\n", - le64_to_cpu(out_buf->LinkSpeed)); + goto out; } + + rc = parse_server_interfaces(out_buf, ret_data_len, + &iface_list, &iface_count); + if (rc) + goto out; + + spin_lock(&ses->iface_lock); + kfree(ses->iface_list); + ses->iface_list = iface_list; + ses->iface_count = iface_count; + ses->iface_last_update = jiffies; + spin_unlock(&ses->iface_lock); + +out: kfree(out_buf); return rc; } -#endif /* STATS2 */ + +void +smb2_cached_lease_break(struct work_struct *work) +{ + struct cached_fid *cfid = container_of(work, + struct cached_fid, lease_break); + mutex_lock(&cfid->fid_mutex); + if (cfid->is_valid) { + cifs_dbg(FYI, "clear cached root file handle\n"); + SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid, + cfid->fid->volatile_fid); + cfid->is_valid = false; + } + mutex_unlock(&cfid->fid_mutex); +} /* * Open the directory at the root of a share @@ -331,13 +489,13 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) struct cifs_open_parms oparams; int rc; __le16 srch_path = 0; /* Null - since an open of top of share */ - u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + u8 oplock = SMB2_OPLOCK_LEVEL_II; - mutex_lock(&tcon->prfid_mutex); - if (tcon->valid_root_fid) { + mutex_lock(&tcon->crfid.fid_mutex); + if (tcon->crfid.is_valid) { cifs_dbg(FYI, "found a cached root file handle\n"); - memcpy(pfid, tcon->prfid, sizeof(struct cifs_fid)); - mutex_unlock(&tcon->prfid_mutex); + memcpy(pfid, tcon->crfid.fid, sizeof(struct cifs_fid)); + mutex_unlock(&tcon->crfid.fid_mutex); return 0; } @@ -350,10 +508,11 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL, NULL); if (rc == 0) { - memcpy(tcon->prfid, pfid, sizeof(struct cifs_fid)); 
- tcon->valid_root_fid = true; + memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); + tcon->crfid.tcon = tcon; + tcon->crfid.is_valid = true; } - mutex_unlock(&tcon->prfid_mutex); + mutex_unlock(&tcon->crfid.fid_mutex); return rc; } @@ -383,9 +542,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) if (rc) return; -#ifdef CONFIG_CIFS_STATS2 SMB3_request_interfaces(xid, tcon); -#endif /* STATS2 */ SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, FS_ATTRIBUTE_INFORMATION); @@ -436,7 +593,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; - if ((*full_path == 0) && tcon->valid_root_fid) + if ((*full_path == 0) && tcon->crfid.is_valid) return 0; utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); @@ -699,6 +856,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_set_ea(xid, tcon, fid.persistent_fid, fid.volatile_fid, ea, len); + kfree(ea); + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); return rc; @@ -2063,8 +2222,7 @@ smb2_create_lease_buf(u8 *lease_key, u8 oplock) if (!buf) return NULL; - buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); - buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); + memcpy(&buf->lcontext.LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE); buf->lcontext.LeaseState = map_oplock_to_lease(oplock); buf->ccontext.DataOffset = cpu_to_le16(offsetof @@ -2090,8 +2248,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock) if (!buf) return NULL; - buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); - buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); + memcpy(&buf->lcontext.LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE); buf->lcontext.LeaseState = map_oplock_to_lease(oplock); buf->ccontext.DataOffset = cpu_to_le16(offsetof @@ -2128,8 +2285,7 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) if (lc->lcontext.LeaseFlags & 
SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) return SMB2_OPLOCK_LEVEL_NOCHANGE; if (lease_key) - memcpy(lease_key, &lc->lcontext.LeaseKeyLow, - SMB2_LEASE_KEY_SIZE); + memcpy(lease_key, &lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); return le32_to_cpu(lc->lcontext.LeaseState); } @@ -2151,7 +2307,7 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, struct smb_rqst *old_rq) { struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)old_rq->rq_iov[1].iov_base; + (struct smb2_sync_hdr *)old_rq->rq_iov[0].iov_base; memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr)); tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; @@ -2171,14 +2327,13 @@ static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf, } /* Assumes: - * rqst->rq_iov[0] is rfc1002 length - * rqst->rq_iov[1] is tranform header - * rqst->rq_iov[2+] data to be encrypted/decrypted + * rqst->rq_iov[0] is transform header + * rqst->rq_iov[1+] data to be encrypted/decrypted */ static struct scatterlist * init_sg(struct smb_rqst *rqst, u8 *sign) { - unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages; + unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages + 1; unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; struct scatterlist *sg; unsigned int i; @@ -2189,10 +2344,10 @@ init_sg(struct smb_rqst *rqst, u8 *sign) return NULL; sg_init_table(sg, sg_len); - smb2_sg_set_buf(&sg[0], rqst->rq_iov[1].iov_base + 20, assoc_data_len); - for (i = 1; i < rqst->rq_nvec - 1; i++) - smb2_sg_set_buf(&sg[i], rqst->rq_iov[i+1].iov_base, - rqst->rq_iov[i+1].iov_len); + smb2_sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 20, assoc_data_len); + for (i = 1; i < rqst->rq_nvec; i++) + smb2_sg_set_buf(&sg[i], rqst->rq_iov[i].iov_base, + rqst->rq_iov[i].iov_len); for (j = 0; i < sg_len - 1; i++, j++) { unsigned int len, offset; @@ -2224,18 +2379,17 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) return 1; } /* - * Encrypt or decrypt @rqst message. 
@rqst has the following format: - * iov[0] - rfc1002 length - * iov[1] - transform header (associate data), - * iov[2-N] and pages - data to encrypt. - * On success return encrypted data in iov[2-N] and pages, leave iov[0-1] + * Encrypt or decrypt @rqst message. @rqst[0] has the following format: + * iov[0] - transform header (associate data), + * iov[1-N] - SMB2 header and pages - data to encrypt. + * On success return encrypted data in iov[1-N] and pages, leave iov[0] * untouched. */ static int crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) { struct smb2_transform_hdr *tr_hdr = - (struct smb2_transform_hdr *)rqst->rq_iov[1].iov_base; + (struct smb2_transform_hdr *)rqst->rq_iov[0].iov_base; unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; int rc = 0; struct scatterlist *sg; @@ -2323,10 +2477,6 @@ free_req: return rc; } -/* - * This is called from smb_send_rqst. At this point we have the rfc1002 - * header as the first element in the vector. 
- */ static int smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, struct smb_rqst *old_rq) @@ -2335,7 +2485,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, struct page **pages; struct smb2_transform_hdr *tr_hdr; unsigned int npages = old_rq->rq_npages; - unsigned int orig_len = get_rfc1002_length(old_rq->rq_iov[0].iov_base); + unsigned int orig_len; int i; int rc = -ENOMEM; @@ -2355,18 +2505,14 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, goto err_free_pages; } - /* Make space for one extra iov to hold the transform header */ iov = kmalloc_array(old_rq->rq_nvec + 1, sizeof(struct kvec), GFP_KERNEL); if (!iov) goto err_free_pages; - /* copy all iovs from the old except the 1st one (rfc1002 length) */ - memcpy(&iov[2], &old_rq->rq_iov[1], - sizeof(struct kvec) * (old_rq->rq_nvec - 1)); - /* copy the rfc1002 iov */ - iov[0].iov_base = old_rq->rq_iov[0].iov_base; - iov[0].iov_len = old_rq->rq_iov[0].iov_len; + /* copy all iovs from the old */ + memcpy(&iov[1], &old_rq->rq_iov[0], + sizeof(struct kvec) * old_rq->rq_nvec); new_rq->rq_iov = iov; new_rq->rq_nvec = old_rq->rq_nvec + 1; @@ -2375,14 +2521,12 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, if (!tr_hdr) goto err_free_iov; + orig_len = smb_rqst_len(server, old_rq); + /* fill the 2nd iov with a transform header */ fill_transform_hdr(tr_hdr, orig_len, old_rq); - new_rq->rq_iov[1].iov_base = tr_hdr; - new_rq->rq_iov[1].iov_len = sizeof(struct smb2_transform_hdr); - - /* Update rfc1002 header */ - inc_rfc1001_len(new_rq->rq_iov[0].iov_base, - sizeof(struct smb2_transform_hdr)); + new_rq->rq_iov[0].iov_base = tr_hdr; + new_rq->rq_iov[0].iov_len = sizeof(struct smb2_transform_hdr); /* copy pages form the old */ for (i = 0; i < npages; i++) { @@ -2426,7 +2570,7 @@ smb3_free_transform_rq(struct smb_rqst *rqst) put_page(rqst->rq_pages[i]); kfree(rqst->rq_pages); /* free transform 
header */ - kfree(rqst->rq_iov[1].iov_base); + kfree(rqst->rq_iov[0].iov_base); kfree(rqst->rq_iov); } @@ -2443,19 +2587,17 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, unsigned int buf_data_size, struct page **pages, unsigned int npages, unsigned int page_data_size) { - struct kvec iov[3]; + struct kvec iov[2]; struct smb_rqst rqst = {NULL}; int rc; - iov[0].iov_base = NULL; - iov[0].iov_len = 0; - iov[1].iov_base = buf; - iov[1].iov_len = sizeof(struct smb2_transform_hdr); - iov[2].iov_base = buf + sizeof(struct smb2_transform_hdr); - iov[2].iov_len = buf_data_size; + iov[0].iov_base = buf; + iov[0].iov_len = sizeof(struct smb2_transform_hdr); + iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr); + iov[1].iov_len = buf_data_size; rqst.rq_iov = iov; - rqst.rq_nvec = 3; + rqst.rq_nvec = 2; rqst.rq_pages = pages; rqst.rq_npages = npages; rqst.rq_pagesz = PAGE_SIZE; @@ -2467,7 +2609,7 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, if (rc) return rc; - memmove(buf, iov[2].iov_base, buf_data_size); + memmove(buf, iov[1].iov_base, buf_data_size); server->total_read = buf_data_size + page_data_size; @@ -3170,6 +3312,7 @@ struct smb_version_operations smb311_operations = { .set_compression = smb2_set_compression, .mkdir = smb2_mkdir, .mkdir_setinfo = smb2_mkdir_setinfo, + .posix_mkdir = smb311_posix_mkdir, .rmdir = smb2_rmdir, .unlink = smb2_unlink, .rename = smb2_rename_path, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index af032e1a3eac..3c92678cb45b 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -155,7 +155,7 @@ out: static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) { - int rc = 0; + int rc; struct nls_table *nls_codepage; struct cifs_ses *ses; struct TCP_Server_Info *server; @@ -166,10 +166,10 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) * for those three - in the calling routine. 
*/ if (tcon == NULL) - return rc; + return 0; if (smb2_command == SMB2_TREE_CONNECT) - return rc; + return 0; if (tcon->tidStatus == CifsExiting) { /* @@ -212,8 +212,14 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) return -EAGAIN; } - wait_event_interruptible_timeout(server->response_q, - (server->tcpStatus != CifsNeedReconnect), 10 * HZ); + rc = wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus != CifsNeedReconnect), + 10 * HZ); + if (rc < 0) { + cifs_dbg(FYI, "%s: aborting reconnect due to a received" + " signal by the process\n", __func__); + return -ERESTARTSYS; + } /* are we still trying to reconnect? */ if (server->tcpStatus != CifsNeedReconnect) @@ -231,7 +237,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) } if (!tcon->ses->need_reconnect && !tcon->need_reconnect) - return rc; + return 0; nls_codepage = load_nls_default(); @@ -340,7 +346,10 @@ smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, return rc; /* BB eventually switch this to SMB2 specific small buf size */ - *request_buf = cifs_small_buf_get(); + if (smb2_command == SMB2_SET_INFO) + *request_buf = cifs_buf_get(); + else + *request_buf = cifs_small_buf_get(); if (*request_buf == NULL) { /* BB should we add a retry in here if not a writepage? 
*/ return -ENOMEM; @@ -602,6 +611,7 @@ static void assemble_neg_contexts(struct smb2_negotiate_req *req, int SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) { + struct smb_rqst rqst; struct smb2_negotiate_req *req; struct smb2_negotiate_rsp *rsp; struct kvec iov[1]; @@ -673,7 +683,11 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_negotiate_rsp *)rsp_iov.iov_base; /* @@ -990,8 +1004,9 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) req->PreviousSessionId = sess_data->previous_session; req->Flags = 0; /* MBZ */ - /* to enable echos and oplocks */ - req->sync_hdr.CreditRequest = cpu_to_le16(3); + + /* enough to enable echos and oplocks and one max size write */ + req->sync_hdr.CreditRequest = cpu_to_le16(130); /* only one of SMB2 signing flags may be set in SMB2 request */ if (server->sign) @@ -1027,6 +1042,7 @@ static int SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) { int rc; + struct smb_rqst rqst; struct smb2_sess_setup_req *req = sess_data->iov[0].iov_base; struct kvec rsp_iov = { NULL, 0 }; @@ -1035,10 +1051,13 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) cpu_to_le16(sizeof(struct smb2_sess_setup_req) - 1 /* pad */); req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len); - /* BB add code to build os and lm fields */ + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = sess_data->iov; + rqst.rq_nvec = 2; - rc = smb2_send_recv(sess_data->xid, sess_data->ses, - sess_data->iov, 2, + /* BB add code to build os and lm fields */ + rc = cifs_send_recv(sess_data->xid, sess_data->ses, + &rqst, &sess_data->buf0_type, CIFS_LOG_ERROR | CIFS_NEG_OP, 
&rsp_iov); cifs_small_buf_release(sess_data->iov[0].iov_base); @@ -1376,6 +1395,7 @@ out: int SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) { + struct smb_rqst rqst; struct smb2_logoff_req *req; /* response is also trivial struct */ int rc = 0; struct TCP_Server_Info *server; @@ -1413,7 +1433,11 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); /* * No tcon so can't do @@ -1443,6 +1467,7 @@ int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, struct cifs_tcon *tcon, const struct nls_table *cp) { + struct smb_rqst rqst; struct smb2_tree_connect_req *req; struct smb2_tree_connect_rsp *rsp = NULL; struct kvec iov[2]; @@ -1499,7 +1524,11 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, !smb3_encryption_required(tcon)) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; - rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 2; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base; @@ -1563,6 +1592,7 @@ tcon_error_exit: int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) { + struct smb_rqst rqst; struct smb2_tree_disconnect_req *req; /* response is trivial */ int rc = 0; struct cifs_ses *ses = tcon->ses; @@ -1593,7 +1623,11 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; 
+ rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE); @@ -1682,12 +1716,12 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, static int add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int *num_iovec, __u8 *oplock) + unsigned int *num_iovec, u8 *lease_key, __u8 *oplock) { struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; - iov[num].iov_base = server->ops->create_lease_buf(oplock+1, *oplock); + iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = server->vals->create_lease_size; @@ -1886,11 +1920,165 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, return 0; } +#ifdef CONFIG_CIFS_SMB311 +int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, + const char *full_path, + struct cifs_sb_info *cifs_sb) +{ + struct smb_rqst rqst; + struct smb2_create_req *req; + struct smb2_create_rsp *rsp; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + struct kvec iov[3]; /* make sure at least one for each open context */ + struct kvec rsp_iov = {NULL, 0}; + int resp_buftype; + int uni_path_len; + __le16 *copy_path = NULL; + int copy_size; + int rc = 0; + unsigned int n_iov = 2; + __u32 file_attributes = 0; + char *pc_buf = NULL; + int flags = 0; + unsigned int total_len; + __le16 *path = cifs_convert_path_to_utf16(full_path, cifs_sb); + + if (!path) + return -ENOMEM; + + cifs_dbg(FYI, "mkdir\n"); + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + rc = smb2_plain_req_init(SMB2_CREATE, tcon, (void **) &req, &total_len); + + if (rc) + return rc; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + + req->ImpersonationLevel = 
IL_IMPERSONATION; + req->DesiredAccess = cpu_to_le32(FILE_WRITE_ATTRIBUTES); + /* File attributes ignored on open (used in create though) */ + req->FileAttributes = cpu_to_le32(file_attributes); + req->ShareAccess = FILE_SHARE_ALL_LE; + req->CreateDisposition = cpu_to_le32(FILE_CREATE); + req->CreateOptions = cpu_to_le32(CREATE_NOT_FILE); + + iov[0].iov_base = (char *)req; + /* -1 since last byte is buf[0] which is sent below (path) */ + iov[0].iov_len = total_len - 1; + + req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)); + + /* [MS-SMB2] 2.2.13 NameOffset: + * If SMB2_FLAGS_DFS_OPERATIONS is set in the Flags field of + * the SMB2 header, the file name includes a prefix that will + * be processed during DFS name normalization as specified in + * section 3.3.5.9. Otherwise, the file name is relative to + * the share that is identified by the TreeId in the SMB2 + * header. + */ + if (tcon->share_flags & SHI1005_FLAGS_DFS) { + int name_len; + + req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; + rc = alloc_path_with_tree_prefix(&copy_path, &copy_size, + &name_len, + tcon->treeName, path); + if (rc) { + cifs_small_buf_release(req); + return rc; + } + req->NameLength = cpu_to_le16(name_len * 2); + uni_path_len = copy_size; + path = copy_path; + } else { + uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; + /* MUST set path len (NameLength) to 0 opening root of share */ + req->NameLength = cpu_to_le16(uni_path_len - 2); + if (uni_path_len % 8 != 0) { + copy_size = roundup(uni_path_len, 8); + copy_path = kzalloc(copy_size, GFP_KERNEL); + if (!copy_path) { + cifs_small_buf_release(req); + return -ENOMEM; + } + memcpy((char *)copy_path, (const char *)path, + uni_path_len); + uni_path_len = copy_size; + path = copy_path; + } + } + + iov[1].iov_len = uni_path_len; + iov[1].iov_base = path; + req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_NONE; + + if (tcon->posix_extensions) { + if (n_iov > 2) { + struct create_context *ccontext = + (struct create_context
*)iov[n_iov-1].iov_base; + ccontext->Next = + cpu_to_le32(iov[n_iov-1].iov_len); + } + + rc = add_posix_context(iov, &n_iov, mode); + if (rc) { + cifs_small_buf_release(req); + kfree(copy_path); + return rc; + } + pc_buf = iov[n_iov-1].iov_base; + } + + + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = n_iov; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, + &rsp_iov); + + cifs_small_buf_release(req); + rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; + + if (rc != 0) { + cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); + trace_smb3_posix_mkdir_err(xid, tcon->tid, ses->Suid, + CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES, rc); + goto smb311_mkdir_exit; + } else + trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid, + ses->Suid, CREATE_NOT_FILE, + FILE_WRITE_ATTRIBUTES); + + SMB2_close(xid, tcon, rsp->PersistentFileId, rsp->VolatileFileId); + + /* Eventually save off posix specific response info and timestaps */ + +smb311_mkdir_exit: + kfree(copy_path); + kfree(pc_buf); + free_rsp_buf(resp_buftype, rsp); + return rc; + +} +#endif /* SMB311 */ + int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, struct smb2_file_all_info *buf, struct kvec *err_iov, int *buftype) { + struct smb_rqst rqst; struct smb2_create_req *req; struct smb2_create_rsp *rsp; struct TCP_Server_Info *server; @@ -1993,7 +2181,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, *oplock == SMB2_OPLOCK_LEVEL_NONE) req->RequestedOplockLevel = *oplock; else { - rc = add_lease_context(server, iov, &n_iov, oplock); + rc = add_lease_context(server, iov, &n_iov, + oparms->fid->lease_key, oplock); if (rc) { cifs_small_buf_release(req); kfree(copy_path); @@ -2043,7 +2232,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } #endif /* SMB311 */ - rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags, + memset(&rqst, 0, sizeof(struct 
smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = n_iov; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; @@ -2099,6 +2292,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, char *in_data, u32 indatalen, char **out_data, u32 *plen /* returned data len */) { + struct smb_rqst rqst; struct smb2_ioctl_req *req; struct smb2_ioctl_rsp *rsp; struct cifs_ses *ses; @@ -2189,7 +2383,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; - rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags, + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = n_iov; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base; @@ -2274,6 +2472,7 @@ int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int flags) { + struct smb_rqst rqst; struct smb2_close_req *req; struct smb2_close_rsp *rsp; struct cifs_ses *ses = tcon->ses; @@ -2301,7 +2500,11 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_close_rsp *)rsp_iov.iov_base; @@ -2387,6 +2590,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, u32 additional_info, size_t output_len, size_t min_len, void **data, u32 *dlen) { + struct smb_rqst rqst; struct smb2_query_info_req *req; struct smb2_query_info_rsp *rsp = NULL; struct kvec iov[2]; @@ -2427,7 
+2631,11 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, /* 1 for Buffer */ iov[0].iov_len = total_len - 1; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; @@ -2594,11 +2802,10 @@ SMB2_echo(struct TCP_Server_Info *server) { struct smb2_echo_req *req; int rc = 0; - struct kvec iov[2]; + struct kvec iov[1]; struct smb_rqst rqst = { .rq_iov = iov, - .rq_nvec = 2 }; + .rq_nvec = 1 }; unsigned int total_len; - __be32 rfc1002_marker; cifs_dbg(FYI, "In echo request\n"); @@ -2614,11 +2821,8 @@ SMB2_echo(struct TCP_Server_Info *server) req->sync_hdr.CreditRequest = cpu_to_le16(1); - iov[0].iov_len = 4; - rfc1002_marker = cpu_to_be32(total_len); - iov[0].iov_base = &rfc1002_marker; - iov[1].iov_len = total_len; - iov[1].iov_base = (char *)req; + iov[0].iov_len = total_len; + iov[0].iov_base = (char *)req; rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL, server, CIFS_ECHO_OP); @@ -2633,6 +2837,7 @@ int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) { + struct smb_rqst rqst; struct smb2_flush_req *req; struct cifs_ses *ses = tcon->ses; struct kvec iov[1]; @@ -2660,7 +2865,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); if (rc != 0) { @@ -2848,10 +3057,9 @@ smb2_async_readv(struct cifs_readdata *rdata) struct smb2_sync_hdr *shdr; struct cifs_io_parms io_parms; struct smb_rqst rqst = 
{ .rq_iov = rdata->iov, - .rq_nvec = 2 }; + .rq_nvec = 1 }; struct TCP_Server_Info *server; unsigned int total_len; - __be32 req_len; cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", __func__, rdata->offset, rdata->bytes); @@ -2882,12 +3090,8 @@ smb2_async_readv(struct cifs_readdata *rdata) if (smb3_encryption_required(io_parms.tcon)) flags |= CIFS_TRANSFORM_REQ; - req_len = cpu_to_be32(total_len); - - rdata->iov[0].iov_base = &req_len; - rdata->iov[0].iov_len = sizeof(__be32); - rdata->iov[1].iov_base = buf; - rdata->iov[1].iov_len = total_len; + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = total_len; shdr = (struct smb2_sync_hdr *)buf; @@ -2926,6 +3130,7 @@ int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *buf_type) { + struct smb_rqst rqst; int resp_buftype, rc = -EACCES; struct smb2_read_plain_req *req = NULL; struct smb2_read_rsp *rsp = NULL; @@ -2946,7 +3151,11 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; @@ -3062,10 +3271,9 @@ smb2_async_writev(struct cifs_writedata *wdata, struct smb2_sync_hdr *shdr; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - struct kvec iov[2]; + struct kvec iov[1]; struct smb_rqst rqst = { }; unsigned int total_len; - __be32 rfc1002_marker; rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len); if (rc) { @@ -3137,15 +3345,11 @@ smb2_async_writev(struct cifs_writedata *wdata, v1->length = cpu_to_le32(wdata->mr->mr->length); } #endif - /* 4 for rfc1002 length field and 1 for Buffer */ - iov[0].iov_len = 4; 
- rfc1002_marker = cpu_to_be32(total_len - 1 + wdata->bytes); - iov[0].iov_base = &rfc1002_marker; - iov[1].iov_len = total_len - 1; - iov[1].iov_base = (char *)req; + iov[0].iov_len = total_len - 1; + iov[0].iov_base = (char *)req; rqst.rq_iov = iov; - rqst.rq_nvec = 2; + rqst.rq_nvec = 1; rqst.rq_pages = wdata->pages; rqst.rq_offset = wdata->page_offset; rqst.rq_npages = wdata->nr_pages; @@ -3153,7 +3357,7 @@ smb2_async_writev(struct cifs_writedata *wdata, rqst.rq_tailsz = wdata->tailsz; #ifdef CONFIG_CIFS_SMB_DIRECT if (wdata->mr) { - iov[1].iov_len += sizeof(struct smbd_buffer_descriptor_v1); + iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); rqst.rq_npages = 0; } #endif @@ -3210,6 +3414,7 @@ int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, struct kvec *iov, int n_vec) { + struct smb_rqst rqst; int rc = 0; struct smb2_write_req *req = NULL; struct smb2_write_rsp *rsp = NULL; @@ -3251,7 +3456,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, /* 1 for Buffer */ iov[0].iov_len = total_len - 1; - rc = smb2_send_recv(xid, io_parms->tcon->ses, iov, n_vec + 1, + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = n_vec + 1; + + rc = cifs_send_recv(xid, io_parms->tcon->ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; @@ -3323,6 +3532,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int index, struct cifs_search_info *srch_inf) { + struct smb_rqst rqst; struct smb2_query_directory_req *req; struct smb2_query_directory_rsp *rsp = NULL; struct kvec iov[2]; @@ -3395,7 +3605,11 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, iov[1].iov_base = (char *)(req->Buffer); iov[1].iov_len = len; - rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + 
rqst.rq_iov = iov; + rqst.rq_nvec = 2; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; @@ -3454,6 +3668,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, u8 info_type, u32 additional_info, unsigned int num, void **data, unsigned int *size) { + struct smb_rqst rqst; struct smb2_set_info_req *req; struct smb2_set_info_rsp *rsp = NULL; struct kvec *iov; @@ -3509,9 +3724,13 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, iov[i].iov_len = size[i]; } - rc = smb2_send_recv(xid, ses, iov, num, &resp_buftype, flags, + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = num; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); + cifs_buf_release(req); rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base; if (rc != 0) { @@ -3664,6 +3883,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, const u64 persistent_fid, const u64 volatile_fid, __u8 oplock_level) { + struct smb_rqst rqst; int rc; struct smb2_oplock_break *req = NULL; struct cifs_ses *ses = tcon->ses; @@ -3692,7 +3912,11 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { @@ -3755,6 +3979,7 @@ int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata) { + struct smb_rqst rqst; struct smb2_query_info_rsp *rsp = NULL; struct kvec iov; struct kvec rsp_iov; @@ -3773,7 +3998,11 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, if 
(smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = &iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -3798,6 +4027,7 @@ int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int level) { + struct smb_rqst rqst; struct smb2_query_info_rsp *rsp = NULL; struct kvec iov; struct kvec rsp_iov; @@ -3829,7 +4059,11 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = &iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -3868,6 +4102,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid, const __u32 num_lock, struct smb2_lock_element *buf) { + struct smb_rqst rqst; int rc = 0; struct smb2_lock_req *req = NULL; struct kvec iov[2]; @@ -3900,7 +4135,12 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, iov[1].iov_len = count; cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); - rc = smb2_send_recv(xid, tcon->ses, iov, 2, &resp_buf_type, flags, + + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 2; + + rc = cifs_send_recv(xid, tcon->ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { @@ -3934,6 +4174,7 @@ int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, __u8 *lease_key, const __le32 lease_state) { + struct smb_rqst 
rqst; int rc; struct smb2_lease_ack *req = NULL; struct cifs_ses *ses = tcon->ses; @@ -3964,7 +4205,11 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index a345560001ce..a671adcc44a6 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -678,16 +678,14 @@ struct create_context { #define SMB2_LEASE_KEY_SIZE 16 struct lease_context { - __le64 LeaseKeyLow; - __le64 LeaseKeyHigh; + u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; __le32 LeaseState; __le32 LeaseFlags; __le64 LeaseDuration; } __packed; struct lease_context_v2 { - __le64 LeaseKeyLow; - __le64 LeaseKeyHigh; + u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; __le32 LeaseState; __le32 LeaseFlags; __le64 LeaseDuration; @@ -851,8 +849,11 @@ struct validate_negotiate_info_rsp { __le16 Dialect; /* Dialect in use for the connection */ } __packed; -#define RSS_CAPABLE 0x00000001 -#define RDMA_CAPABLE 0x00000002 +#define RSS_CAPABLE cpu_to_le32(0x00000001) +#define RDMA_CAPABLE cpu_to_le32(0x00000002) + +#define INTERNETWORK cpu_to_le16(0x0002) +#define INTERNETWORKV6 cpu_to_le16(0x0017) struct network_interface_info_ioctl_rsp { __le32 Next; /* next interface. 
zero if this is last one */ @@ -860,7 +861,21 @@ struct network_interface_info_ioctl_rsp { __le32 Capability; /* RSS or RDMA Capable */ __le32 Reserved; __le64 LinkSpeed; - char SockAddr_Storage[128]; + __le16 Family; + __u8 Buffer[126]; +} __packed; + +struct iface_info_ipv4 { + __be16 Port; + __be32 IPv4Address; + __be64 Reserved; +} __packed; + +struct iface_info_ipv6 { + __be16 Port; + __be32 FlowInfo; + __u8 IPv6Address[16]; + __be32 ScopeId; } __packed; #define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index c84020057bd8..6e6a4f2ec890 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -79,6 +79,10 @@ extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, bool set_alloc); extern int smb2_set_file_info(struct inode *inode, const char *full_path, FILE_BASIC_INFO *buf, const unsigned int xid); +extern int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, + const char *full_path, + struct cifs_sb_info *cifs_sb); extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path, @@ -109,6 +113,8 @@ extern int smb2_unlock_range(struct cifsFileInfo *cfile, extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); extern void smb2_reconnect_server(struct work_struct *work); extern int smb3_crypto_aead_allocate(struct TCP_Server_Info *server); +extern unsigned long smb_rqst_len(struct TCP_Server_Info *server, + struct smb_rqst *rqst); /* * SMB2 Worker functions - most of protocol specific implementation details diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 349d5ccf854c..719d55e63d88 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -171,10 +171,10 @@ smb2_calc_signature(struct smb_rqst *rqst, 
struct TCP_Server_Info *server) unsigned char smb2_signature[SMB2_HMACSHA256_SIZE]; unsigned char *sigptr = smb2_signature; struct kvec *iov = rqst->rq_iov; - int iov_hdr_index = rqst->rq_nvec > 1 ? 1 : 0; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)iov[iov_hdr_index].iov_base; + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; struct cifs_ses *ses; + struct shash_desc *shash = &server->secmech.sdeschmacsha256->shash; + struct smb_rqst drqst; ses = smb2_find_smb_ses(server, shdr->SessionId); if (!ses) { @@ -192,21 +192,39 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) } rc = crypto_shash_setkey(server->secmech.hmacsha256, - ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); + ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } - rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); + rc = crypto_shash_init(shash); if (rc) { cifs_dbg(VFS, "%s: Could not init sha256", __func__); return rc; } - rc = __cifs_calc_signature(rqst, iov_hdr_index, server, sigptr, - &server->secmech.sdeschmacsha256->shash); + /* + * For SMB2+, __cifs_calc_signature() expects to sign only the actual + * data, that is, iov[0] should not contain a rfc1002 length. + * + * Sign the rfc1002 length prior to passing the data (iov[1-N]) down to + * __cifs_calc_signature(). 
+ */ + drqst = *rqst; + if (drqst.rq_nvec >= 2 && iov[0].iov_len == 4) { + rc = crypto_shash_update(shash, iov[0].iov_base, + iov[0].iov_len); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with payload\n", + __func__); + return rc; + } + drqst.rq_iov++; + drqst.rq_nvec--; + } + rc = __cifs_calc_signature(&drqst, server, sigptr, shash); if (!rc) memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE); @@ -410,14 +428,14 @@ generate_smb311signingkey(struct cifs_ses *ses) int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { - int rc = 0; + int rc; unsigned char smb3_signature[SMB2_CMACAES_SIZE]; unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; - int iov_hdr_index = rqst->rq_nvec > 1 ? 1 : 0; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)iov[iov_hdr_index].iov_base; + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; struct cifs_ses *ses; + struct shash_desc *shash = &server->secmech.sdesccmacaes->shash; + struct smb_rqst drqst; ses = smb2_find_smb_ses(server, shdr->SessionId); if (!ses) { @@ -429,8 +447,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); rc = crypto_shash_setkey(server->secmech.cmacaes, - ses->smb3signingkey, SMB2_CMACAES_SIZE); - + ses->smb3signingkey, SMB2_CMACAES_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); return rc; @@ -441,15 +458,33 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) * so unlike smb2 case we do not have to check here if secmech are * initialized */ - rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash); + rc = crypto_shash_init(shash); if (rc) { cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__); return rc; } - rc = __cifs_calc_signature(rqst, iov_hdr_index, server, sigptr, - &server->secmech.sdesccmacaes->shash); + /* + * For SMB2+, __cifs_calc_signature() expects to sign only the actual + * 
data, that is, iov[0] should not contain a rfc1002 length. + * + * Sign the rfc1002 length prior to passing the data (iov[1-N]) down to + * __cifs_calc_signature(). + */ + drqst = *rqst; + if (drqst.rq_nvec >= 2 && iov[0].iov_len == 4) { + rc = crypto_shash_update(shash, iov[0].iov_base, + iov[0].iov_len); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with payload\n", + __func__); + return rc; + } + drqst.rq_iov++; + drqst.rq_nvec--; + } + rc = __cifs_calc_signature(&drqst, server, sigptr, shash); if (!rc) memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE); @@ -462,7 +497,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) { int rc = 0; struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; + (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; if (!(shdr->Flags & SMB2_FLAGS_SIGNED) || server->tcpStatus == CifsNeedNegotiate) @@ -552,6 +587,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); memset(temp, 0, sizeof(struct mid_q_entry)); + kref_init(&temp->refcount); temp->mid = le64_to_cpu(shdr->MessageId); temp->pid = current->pid; temp->command = shdr->Command; /* Always LE */ @@ -635,7 +671,7 @@ smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) { int rc; struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; + (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; smb2_seq_num_into_buf(ses->server, shdr); @@ -656,7 +692,7 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) { int rc; struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; + (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; smb2_seq_num_into_buf(server, shdr); diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index e459c97151b3..c55ea4e6201b 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -18,6 +18,7 @@ #include "smbdirect.h" #include 
"cifs_debug.h" #include "cifsproto.h" +#include "smb2proto.h" static struct smbd_response *get_empty_queue_buffer( struct smbd_connection *info); @@ -2082,12 +2083,13 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) * rqst: the data to write * return value: 0 if successfully write, otherwise error code */ -int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) +int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst) { + struct smbd_connection *info = server->smbd_conn; struct kvec vec; int nvecs; int size; - unsigned int buflen = 0, remaining_data_length; + unsigned int buflen, remaining_data_length; int start, i, j; int max_iov_size = info->max_send_size - sizeof(struct smbd_data_transfer); @@ -2111,25 +2113,13 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) log_write(ERR, "expected the pdu length in 1st iov, but got %zu\n", rqst->rq_iov[0].iov_len); return -EINVAL; } - iov = &rqst->rq_iov[1]; - - /* total up iov array first */ - for (i = 0; i < rqst->rq_nvec-1; i++) { - buflen += iov[i].iov_len; - } /* * Add in the page array if there is one. 
The caller needs to set * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and * ends at page boundary */ - if (rqst->rq_npages) { - if (rqst->rq_npages == 1) - buflen += rqst->rq_tailsz; - else - buflen += rqst->rq_pagesz * (rqst->rq_npages - 1) - - rqst->rq_offset + rqst->rq_tailsz; - } + buflen = smb_rqst_len(server, rqst); if (buflen + sizeof(struct smbd_data_transfer) > info->max_fragmented_send_size) { @@ -2139,6 +2129,8 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) goto done; } + iov = &rqst->rq_iov[1]; + cifs_dbg(FYI, "Sending smb (RDMA): smb_len=%u\n", buflen); for (i = 0; i < rqst->rq_nvec-1; i++) dump_smb(iov[i].iov_base, iov[i].iov_len); diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h index 1e419c21dc60..a11096254f29 100644 --- a/fs/cifs/smbdirect.h +++ b/fs/cifs/smbdirect.h @@ -292,7 +292,7 @@ void smbd_destroy(struct smbd_connection *info); /* Interface for carrying upper layer I/O through send/recv */ int smbd_recv(struct smbd_connection *info, struct msghdr *msg); -int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst); +int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst); enum mr_state { MR_READY, @@ -332,7 +332,7 @@ static inline void *smbd_get_connection( static inline int smbd_reconnect(struct TCP_Server_Info *server) {return -1; } static inline void smbd_destroy(struct smbd_connection *info) {} static inline int smbd_recv(struct smbd_connection *info, struct msghdr *msg) {return -1; } -static inline int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) {return -1; } +static inline int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst) {return -1; } #endif #endif diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 61e74d455d90..67e413f6ee4d 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -378,7 +378,7 @@ DEFINE_EVENT(smb3_open_err_class, smb3_##name, \ TP_ARGS(xid, tid, sesid, create_options, desired_access, rc)) 
DEFINE_SMB3_OPEN_ERR_EVENT(open_err); - +DEFINE_SMB3_OPEN_ERR_EVENT(posix_mkdir_err); DECLARE_EVENT_CLASS(smb3_open_done_class, TP_PROTO(unsigned int xid, @@ -420,6 +420,7 @@ DEFINE_EVENT(smb3_open_done_class, smb3_##name, \ TP_ARGS(xid, fid, tid, sesid, create_options, desired_access)) DEFINE_SMB3_OPEN_DONE_EVENT(open_done); +DEFINE_SMB3_OPEN_DONE_EVENT(posix_mkdir_done); #endif /* _CIFS_TRACE_H */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 1f1a68f89110..a341ec839c83 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -61,6 +61,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); memset(temp, 0, sizeof(struct mid_q_entry)); + kref_init(&temp->refcount); temp->mid = get_mid(smb_buffer); temp->pid = current->pid; temp->command = cpu_to_le16(smb_buffer->Command); @@ -82,6 +83,21 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) return temp; } +static void _cifs_mid_q_entry_release(struct kref *refcount) +{ + struct mid_q_entry *mid = container_of(refcount, struct mid_q_entry, + refcount); + + mempool_free(mid, cifs_mid_poolp); +} + +void cifs_mid_q_entry_release(struct mid_q_entry *midEntry) +{ + spin_lock(&GlobalMid_Lock); + kref_put(&midEntry->refcount, _cifs_mid_q_entry_release); + spin_unlock(&GlobalMid_Lock); +} + void DeleteMidQEntry(struct mid_q_entry *midEntry) { @@ -110,7 +126,7 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) } } #endif - mempool_free(midEntry, cifs_mid_poolp); + cifs_mid_q_entry_release(midEntry); } void @@ -201,15 +217,25 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, return 0; } -static unsigned long -rqst_len(struct smb_rqst *rqst) +unsigned long +smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst) { unsigned int i; - struct kvec *iov = rqst->rq_iov; + struct kvec *iov; + int nvec; unsigned long buflen = 0; + if (server->vals->header_preamble_size == 0 && 
+ rqst->rq_nvec >= 2 && rqst->rq_iov[0].iov_len == 4) { + iov = &rqst->rq_iov[1]; + nvec = rqst->rq_nvec - 1; + } else { + iov = rqst->rq_iov; + nvec = rqst->rq_nvec; + } + /* total up iov array first */ - for (i = 0; i < rqst->rq_nvec; i++) + for (i = 0; i < nvec; i++) buflen += iov[i].iov_len; /* @@ -236,70 +262,88 @@ rqst_len(struct smb_rqst *rqst) } static int -__smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) +__smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, + struct smb_rqst *rqst) { - int rc; - struct kvec *iov = rqst->rq_iov; - int n_vec = rqst->rq_nvec; - unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base); - unsigned long send_length; - unsigned int i; + int rc = 0; + struct kvec *iov; + int n_vec; + unsigned int send_length = 0; + unsigned int i, j; size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; struct msghdr smb_msg; int val = 1; + __be32 rfc1002_marker; + if (cifs_rdma_enabled(server) && server->smbd_conn) { - rc = smbd_send(server->smbd_conn, rqst); + rc = smbd_send(server, rqst); goto smbd_done; } if (ssocket == NULL) return -ENOTSOCK; - /* sanity check send length */ - send_length = rqst_len(rqst); - if (send_length != smb_buf_length + 4) { - WARN(1, "Send length mismatch(send_length=%lu smb_buf_length=%u)\n", - send_length, smb_buf_length); - return -EIO; - } - - if (n_vec < 2) - return -EIO; - - cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length); - dump_smb(iov[0].iov_base, iov[0].iov_len); - dump_smb(iov[1].iov_base, iov[1].iov_len); - /* cork the socket */ kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, (char *)&val, sizeof(val)); - size = 0; - for (i = 0; i < n_vec; i++) - size += iov[i].iov_len; + for (j = 0; j < num_rqst; j++) + send_length += smb_rqst_len(server, &rqst[j]); + rfc1002_marker = cpu_to_be32(send_length); - iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, iov, n_vec, size); + /* Generate a rfc1002 marker for SMB2+ */ + if 
(server->vals->header_preamble_size == 0) { + struct kvec hiov = { + .iov_base = &rfc1002_marker, + .iov_len = 4 + }; + iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, &hiov, + 1, 4); + rc = smb_send_kvec(server, &smb_msg, &sent); + if (rc < 0) + goto uncork; - rc = smb_send_kvec(server, &smb_msg, &sent); - if (rc < 0) - goto uncork; + total_len += sent; + send_length += 4; + } - total_len += sent; + cifs_dbg(FYI, "Sending smb: smb_len=%u\n", send_length); - /* now walk the page array and send each page in it */ - for (i = 0; i < rqst->rq_npages; i++) { - struct bio_vec bvec; + for (j = 0; j < num_rqst; j++) { + iov = rqst[j].rq_iov; + n_vec = rqst[j].rq_nvec; + + size = 0; + for (i = 0; i < n_vec; i++) { + dump_smb(iov[i].iov_base, iov[i].iov_len); + size += iov[i].iov_len; + } - bvec.bv_page = rqst->rq_pages[i]; - rqst_page_get_length(rqst, i, &bvec.bv_len, &bvec.bv_offset); + iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, + iov, n_vec, size); - iov_iter_bvec(&smb_msg.msg_iter, WRITE | ITER_BVEC, - &bvec, 1, bvec.bv_len); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) - break; + goto uncork; total_len += sent; + + /* now walk the page array and send each page in it */ + for (i = 0; i < rqst[j].rq_npages; i++) { + struct bio_vec bvec; + + bvec.bv_page = rqst[j].rq_pages[i]; + rqst_page_get_length(&rqst[j], i, &bvec.bv_len, + &bvec.bv_offset); + + iov_iter_bvec(&smb_msg.msg_iter, WRITE | ITER_BVEC, + &bvec, 1, bvec.bv_len); + rc = smb_send_kvec(server, &smb_msg, &sent); + if (rc < 0) + break; + + total_len += sent; + } } uncork: @@ -308,9 +352,9 @@ uncork: kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, (char *)&val, sizeof(val)); - if ((total_len > 0) && (total_len != smb_buf_length + 4)) { + if ((total_len > 0) && (total_len != send_length)) { cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n", - smb_buf_length + 4, total_len); + send_length, total_len); /* * If we have only sent part of an SMB then the next SMB could * be 
taken as the remainder of this one. We need to kill the @@ -335,7 +379,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst, int flags) int rc; if (!(flags & CIFS_TRANSFORM_REQ)) - return __smb_send_rqst(server, rqst); + return __smb_send_rqst(server, 1, rqst); if (!server->ops->init_transform_rq || !server->ops->free_transform_rq) { @@ -347,7 +391,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst, int flags) if (rc) return rc; - rc = __smb_send_rqst(server, &cur_rqst); + rc = __smb_send_rqst(server, 1, &cur_rqst); server->ops->free_transform_rq(&cur_rqst); return rc; } @@ -365,7 +409,7 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, iov[1].iov_base = (char *)smb_buffer + 4; iov[1].iov_len = smb_buf_length; - return __smb_send_rqst(server, &rqst); + return __smb_send_rqst(server, 1, &rqst); } static int @@ -730,7 +774,6 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, * to the same server. We may make this configurable later or * use ses->maxReq. 
*/ - rc = wait_for_free_request(ses->server, timeout, optype); if (rc) return rc; @@ -766,8 +809,8 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, #ifdef CONFIG_CIFS_SMB311 if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) - smb311_update_preauth_hash(ses, rqst->rq_iov+1, - rqst->rq_nvec-1); + smb311_update_preauth_hash(ses, rqst->rq_iov, + rqst->rq_nvec); #endif if (timeout == CIFS_ASYNC_OP) @@ -812,8 +855,8 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, #ifdef CONFIG_CIFS_SMB311 if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { struct kvec iov = { - .iov_base = buf, - .iov_len = midQ->resp_buf_size + .iov_base = resp_iov->iov_base, + .iov_len = resp_iov->iov_len }; smb311_update_preauth_hash(ses, &iov, 1); } @@ -872,49 +915,6 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, return rc; } -/* Like SendReceive2 but iov[0] does not contain an rfc1002 header */ -int -smb2_send_recv(const unsigned int xid, struct cifs_ses *ses, - struct kvec *iov, int n_vec, int *resp_buf_type /* ret */, - const int flags, struct kvec *resp_iov) -{ - struct smb_rqst rqst; - struct kvec s_iov[CIFS_MAX_IOV_SIZE], *new_iov; - int rc; - int i; - __u32 count; - __be32 rfc1002_marker; - - if (n_vec + 1 > CIFS_MAX_IOV_SIZE) { - new_iov = kmalloc_array(n_vec + 1, sizeof(struct kvec), - GFP_KERNEL); - if (!new_iov) - return -ENOMEM; - } else - new_iov = s_iov; - - /* 1st iov is an RFC1002 Session Message length */ - memcpy(new_iov + 1, iov, (sizeof(struct kvec) * n_vec)); - - count = 0; - for (i = 1; i < n_vec + 1; i++) - count += new_iov[i].iov_len; - - rfc1002_marker = cpu_to_be32(count); - - new_iov[0].iov_base = &rfc1002_marker; - new_iov[0].iov_len = 4; - - memset(&rqst, 0, sizeof(struct smb_rqst)); - rqst.rq_iov = new_iov; - rqst.rq_nvec = n_vec + 1; - - rc = cifs_send_recv(xid, ses, &rqst, resp_buf_type, flags, resp_iov); - if (n_vec + 1 > CIFS_MAX_IOV_SIZE) - kfree(new_iov); - return rc; -} - int SendReceive(const unsigned int 
xid, struct cifs_ses *ses, struct smb_hdr *in_buf, struct smb_hdr *out_buf, diff --git a/fs/dcache.c b/fs/dcache.c index 0e8e5de3c48a..8d2ec4898c2b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -358,14 +358,11 @@ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; - bool hashed = !d_unhashed(dentry); - if (hashed) - raw_write_seqcount_begin(&dentry->d_seq); + raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); - if (hashed) - raw_write_seqcount_end(&dentry->d_seq); + raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) @@ -732,16 +729,16 @@ static inline bool fast_dput(struct dentry *dentry) if (dentry->d_lockref.count > 1) { dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); - return 1; + return true; } - return 0; + return false; } /* * If we weren't the last ref, we're done. */ if (ret) - return 1; + return true; /* * Careful, careful. The reference count went down @@ -770,7 +767,7 @@ static inline bool fast_dput(struct dentry *dentry) /* Nothing to do? Dropping the reference was all we needed? */ if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) - return 1; + return true; /* * Not the fast normal case? Get the lock. We've already decremented @@ -787,7 +784,7 @@ static inline bool fast_dput(struct dentry *dentry) */ if (dentry->d_lockref.count) { spin_unlock(&dentry->d_lock); - return 1; + return true; } /* @@ -796,7 +793,7 @@ static inline bool fast_dput(struct dentry *dentry) * set it to 1. 
*/ dentry->d_lockref.count = 1; - return 0; + return false; } @@ -1892,50 +1889,25 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode) spin_lock(&inode->i_lock); __d_instantiate(entry, inode); WARN_ON(!(inode->i_state & I_NEW)); - inode->i_state &= ~I_NEW; + inode->i_state &= ~I_NEW & ~I_CREATING; smp_mb(); wake_up_bit(&inode->i_state, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_instantiate_new); -/** - * d_instantiate_no_diralias - instantiate a non-aliased dentry - * @entry: dentry to complete - * @inode: inode to attach to this dentry - * - * Fill in inode information in the entry. If a directory alias is found, then - * return an error (and drop inode). Together with d_materialise_unique() this - * guarantees that a directory inode may never have more than one alias. - */ -int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode) -{ - BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); - - security_d_instantiate(entry, inode); - spin_lock(&inode->i_lock); - if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) { - spin_unlock(&inode->i_lock); - iput(inode); - return -EBUSY; - } - __d_instantiate(entry, inode); - spin_unlock(&inode->i_lock); - - return 0; -} -EXPORT_SYMBOL(d_instantiate_no_diralias); - struct dentry *d_make_root(struct inode *root_inode) { struct dentry *res = NULL; if (root_inode) { res = d_alloc_anon(root_inode->i_sb); - if (res) + if (res) { + res->d_flags |= DCACHE_RCUACCESS; d_instantiate(res, root_inode); - else + } else { iput(root_inode); + } } return res; } @@ -2676,33 +2648,6 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) } EXPORT_SYMBOL(d_exact_alias); -/** - * dentry_update_name_case - update case insensitive dentry with a new name - * @dentry: dentry to be updated - * @name: new name - * - * Update a case insensitive dentry with new case of name. - * - * dentry must have been returned by d_lookup with name @name. Old and new - * name lengths must match (ie. 
no d_compare which allows mismatched name - * lengths). - * - * Parent inode i_mutex must be held over d_lookup and into this call (to - * keep renames and concurrent inserts, and readdir(2) away). - */ -void dentry_update_name_case(struct dentry *dentry, const struct qstr *name) -{ - BUG_ON(!inode_is_locked(dentry->d_parent->d_inode)); - BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ - - spin_lock(&dentry->d_lock); - write_seqcount_begin(&dentry->d_seq); - memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); - write_seqcount_end(&dentry->d_seq); - spin_unlock(&dentry->d_lock); -} -EXPORT_SYMBOL(dentry_update_name_case); - static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 71fccccf317e..8c6ab6c95727 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -86,7 +86,9 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry, /* length of the variable name itself: remove GUID and separator */ namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1; - uuid_le_to_bin(dentry->d_name.name + namelen + 1, &var->var.VendorGuid); + err = guid_parse(dentry->d_name.name + namelen + 1, &var->var.VendorGuid); + if (err) + goto out; if (efivar_variable_is_removable(var->var.VendorGuid, dentry->d_name.name, namelen)) diff --git a/fs/eventfd.c b/fs/eventfd.c index ceb1031f1cac..08d3bd602f73 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -101,20 +101,14 @@ static int eventfd_release(struct inode *inode, struct file *file) return 0; } -static struct wait_queue_head * -eventfd_get_poll_head(struct file *file, __poll_t events) -{ - struct eventfd_ctx *ctx = file->private_data; - - return &ctx->wqh; -} - -static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask) +static __poll_t eventfd_poll(struct file *file, poll_table *wait) { struct eventfd_ctx *ctx = file->private_data; __poll_t events = 0; 
u64 count; + poll_wait(file, &ctx->wqh, wait); + /* * All writes to ctx->count occur within ctx->wqh.lock. This read * can be done outside ctx->wqh.lock because we know that poll_wait @@ -156,11 +150,11 @@ static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask) count = READ_ONCE(ctx->count); if (count > 0) - events |= (EPOLLIN & eventmask); + events |= EPOLLIN; if (count == ULLONG_MAX) events |= EPOLLERR; if (ULLONG_MAX - 1 > count) - events |= (EPOLLOUT & eventmask); + events |= EPOLLOUT; return events; } @@ -311,8 +305,7 @@ static const struct file_operations eventfd_fops = { .show_fdinfo = eventfd_show_fdinfo, #endif .release = eventfd_release, - .get_poll_head = eventfd_get_poll_head, - .poll_mask = eventfd_poll_mask, + .poll = eventfd_poll, .read = eventfd_read, .write = eventfd_write, .llseek = noop_llseek, diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ea4436f409fb..67db22fe99c5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -922,18 +922,14 @@ static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head return 0; } -static struct wait_queue_head *ep_eventpoll_get_poll_head(struct file *file, - __poll_t eventmask) -{ - struct eventpoll *ep = file->private_data; - return &ep->poll_wait; -} - -static __poll_t ep_eventpoll_poll_mask(struct file *file, __poll_t eventmask) +static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { struct eventpoll *ep = file->private_data; int depth = 0; + /* Insert inside our poll wait queue */ + poll_wait(file, &ep->poll_wait, wait); + /* * Proceed to find out if wanted events are really available inside * the ready list. 
@@ -972,8 +968,7 @@ static const struct file_operations eventpoll_fops = { .show_fdinfo = ep_show_fdinfo, #endif .release = ep_eventpoll_release, - .get_poll_head = ep_eventpoll_get_poll_head, - .poll_mask = ep_eventpoll_poll_mask, + .poll = ep_eventpoll_poll, .llseek = noop_llseek, }; diff --git a/fs/exec.c b/fs/exec.c index 2d4e0075bd24..bdd0eacefdf5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -290,15 +290,15 @@ static int __bprm_mm_init(struct linux_binprm *bprm) struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; - bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; + vma_set_anonymous(vma); if (down_write_killable(&mm->mmap_sem)) { err = -EINTR; goto err_free; } - vma->vm_mm = mm; /* * Place the stack at the largest stack address the architecture @@ -311,7 +311,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); if (err) @@ -326,7 +325,7 @@ err: up_write(&mm->mmap_sem); err_free: bprm->vma = NULL; - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return err; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index cc40802ddfa8..00e759f05161 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -748,7 +748,6 @@ extern void ext2_free_blocks (struct inode *, unsigned long, unsigned long); extern unsigned long ext2_count_free_blocks (struct super_block *); extern unsigned long ext2_count_dirs (struct super_block *); -extern void ext2_check_blocks_bitmap (struct super_block *); extern struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, unsigned int block_group, struct buffer_head ** bh); @@ -771,7 +770,6 @@ extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page extern struct inode * 
ext2_new_inode (struct inode *, umode_t, const struct qstr *); extern void ext2_free_inode (struct inode *); extern unsigned long ext2_count_free_inodes (struct super_block *); -extern void ext2_check_inodes_bitmap (struct super_block *); extern unsigned long ext2_count_free (struct buffer_head *, unsigned); /* inode.c */ diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 6484199b35d1..5c3d7b7e4975 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -611,8 +611,7 @@ fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; clear_nlink(inode); - unlock_new_inode(inode); - iput(inode); + discard_new_inode(inode); return ERR_PTR(err); fail: diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 152453a91877..0c26dcc5d850 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -45,8 +45,7 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) return 0; } inode_dec_link_count(inode); - unlock_new_inode(inode); - iput(inode); + discard_new_inode(inode); return err; } @@ -192,8 +191,7 @@ out: out_fail: inode_dec_link_count(inode); - unlock_new_inode(inode); - iput (inode); + discard_new_inode(inode); goto out; } @@ -261,8 +259,7 @@ out: out_fail: inode_dec_link_count(inode); inode_dec_link_count(inode); - unlock_new_inode(inode); - iput(inode); + discard_new_inode(inode); out_dir: inode_dec_link_count(dir); goto out; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 25ab1274090f..8ff53f8da3bc 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -557,6 +557,9 @@ static int parse_options(char *options, struct super_block *sb, set_opt (opts->s_mount_opt, NO_UID32); break; case Opt_nocheck: + ext2_msg(sb, KERN_WARNING, + "Option nocheck/check=none is deprecated and" + " will be removed in June 2020."); clear_opt (opts->s_mount_opt, CHECK); break; case Opt_debug: @@ -1335,9 +1338,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) new_opts.s_resgid = sbi->s_resgid; spin_unlock(&sbi->s_lock); - /* - * Allow the 
"check" option to be passed as a remount option. - */ if (!parse_options(data, sb, &new_opts)) return -EINVAL; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index b00481c475cb..aa52d87985aa 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -184,7 +184,6 @@ static int ext4_init_block_bitmap(struct super_block *sb, unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; - int flex_bg = 0; J_ASSERT_BH(bh, buffer_locked(bh)); @@ -207,22 +206,19 @@ static int ext4_init_block_bitmap(struct super_block *sb, start = ext4_group_first_block_no(sb, block_group); - if (ext4_has_feature_flex_bg(sb)) - flex_bg = 1; - /* Set bits for block and inode bitmaps, and inode table */ tmp = ext4_block_bitmap(sb, gdp); - if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_bitmap(sb, gdp); - if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_table(sb, gdp); for (; tmp < ext4_inode_table(sb, gdp) + sbi->s_itb_per_group; tmp++) { - if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } @@ -372,6 +368,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, return -EFSCORRUPTED; ext4_lock_group(sb, block_group); + if (buffer_verified(bh)) + goto verified; if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, desc, bh))) { ext4_unlock_group(sb, block_group); @@ -390,6 +388,7 @@ static int ext4_validate_block_bitmap(struct super_block *sb, return -EFSCORRUPTED; } set_buffer_verified(bh); +verified: ext4_unlock_group(sb, block_group); return 0; } @@ -442,7 +441,16 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) goto verify; } 
ext4_lock_group(sb, block_group); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + if (ext4_has_group_desc_csum(sb) && + (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + if (block_group == 0) { + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + ext4_error(sb, "Block bitmap for bg 0 marked " + "uninitialized"); + err = -EFSCORRUPTED; + goto out; + } err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0b127853c584..7c7123f265c2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1114,6 +1114,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ @@ -1507,11 +1508,6 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || - ino == EXT4_USR_QUOTA_INO || - ino == EXT4_GRP_QUOTA_INO || - ino == EXT4_BOOT_LOADER_INO || - ino == EXT4_JOURNAL_INO || - ino == EXT4_RESIZE_INO || (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } @@ -3018,9 +3014,6 @@ extern int ext4_inline_data_fiemap(struct inode *inode, struct iomap; extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); -extern int ext4_try_to_evict_inline_data(handle_t *handle, - struct inode *inode, - int needed); extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int 
ext4_convert_inline_data(struct inode *inode); diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 98fb0c119c68..adf6668b596f 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -91,6 +91,7 @@ struct ext4_extent_header { }; #define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) +#define EXT4_MAX_EXTENT_DEPTH 5 #define EXT4_EXTENT_TAIL_OFFSET(hdr) \ (sizeof(struct ext4_extent_header) + \ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0057fe3f248d..8ce6fd5b10dd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -869,6 +869,12 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_inode_hdr(inode); depth = ext_depth(inode); + if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) { + EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d", + depth); + ret = -EFSCORRUPTED; + goto err; + } if (path) { ext4_ext_drop_refs(path); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f525f909b559..f336cbc6e932 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -90,6 +90,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, return -EFSCORRUPTED; ext4_lock_group(sb, block_group); + if (buffer_verified(bh)) + goto verified; blk = ext4_inode_bitmap(sb, desc); if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, EXT4_INODES_PER_GROUP(sb) / 8)) { @@ -101,6 +103,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, return -EFSBADCRC; } set_buffer_verified(bh); +verified: ext4_unlock_group(sb, block_group); return 0; } @@ -150,7 +153,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) } ext4_lock_group(sb, block_group); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + if (ext4_has_group_desc_csum(sb) && + (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { + if (block_group == 0) { + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + ext4_error(sb, "Inode bitmap for bg 0 marked " + "uninitialized"); + err = -EFSCORRUPTED; + goto out; + } 
memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); @@ -994,7 +1006,8 @@ got: /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); @@ -1375,7 +1388,10 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, ext4_itable_unused_count(sb, gdp)), sbi->s_inodes_per_block); - if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { + if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group) || + ((group == 0) && ((EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp)) < + EXT4_FIRST_INO(sb)))) { ext4_error(sb, "Something is wrong with group %u: " "used itable blocks: %d; " "itable unused count: %u", diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 285ed1588730..3543fe80a3c4 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -437,6 +437,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); + memset(ei->i_data, 0, EXT4_MIN_INLINE_DATA_SIZE); if (ext4_has_feature_extents(inode->i_sb)) { if (S_ISDIR(inode->i_mode) || @@ -681,6 +682,10 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, goto convert; } + ret = ext4_journal_get_write_access(handle, iloc.bh); + if (ret) + goto out; + flags |= AOP_FLAG_NOFS; page = grab_cache_page_write_begin(mapping, 0, flags); @@ -709,7 +714,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, out_up_read: up_read(&EXT4_I(inode)->xattr_sem); out: - if (handle) + if (handle && (ret != 1)) ext4_journal_stop(handle); brelse(iloc.bh); return ret; @@ -751,6 +756,7 @@ int 
ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); + mark_inode_dirty(inode); out: return copied; } @@ -886,18 +892,17 @@ retry_journal: flags |= AOP_FLAG_NOFS; if (ret == -ENOSPC) { + ext4_journal_stop(handle); ret = ext4_da_convert_inline_data_to_extent(mapping, inode, flags, fsdata); - ext4_journal_stop(handle); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; goto out; } - page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { ret = -ENOMEM; @@ -915,6 +920,9 @@ retry_journal: if (ret < 0) goto out_release_page; } + ret = ext4_journal_get_write_access(handle, iloc.bh); + if (ret) + goto out_release_page; up_read(&EXT4_I(inode)->xattr_sem); *pagep = page; @@ -935,7 +943,6 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { - int i_size_changed = 0; int ret; ret = ext4_write_inline_data_end(inode, pos, len, copied, page); @@ -953,10 +960,8 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, * But it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. */ - if (pos+copied > inode->i_size) { + if (pos+copied > inode->i_size) i_size_write(inode, pos+copied); - i_size_changed = 1; - } unlock_page(page); put_page(page); @@ -966,8 +971,7 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, * ordering of page lock and transaction start for journaling * filesystems. */ - if (i_size_changed) - mark_inode_dirty(inode); + mark_inode_dirty(inode); return copied; } @@ -1890,42 +1894,6 @@ out: return (error < 0 ? error : 0); } -/* - * Called during xattr set, and if we can sparse space 'needed', - * just create the extent tree evict the data to the outer block. 
- * - * We use jbd2 instead of page cache to move data to the 1st block - * so that the whole transaction can be committed as a whole and - * the data isn't lost because of the delayed page cache write. - */ -int ext4_try_to_evict_inline_data(handle_t *handle, - struct inode *inode, - int needed) -{ - int error; - struct ext4_xattr_entry *entry; - struct ext4_inode *raw_inode; - struct ext4_iloc iloc; - - error = ext4_get_inode_loc(inode, &iloc); - if (error) - return error; - - raw_inode = ext4_raw_inode(&iloc); - entry = (struct ext4_xattr_entry *)((void *)raw_inode + - EXT4_I(inode)->i_inline_off); - if (EXT4_XATTR_LEN(entry->e_name_len) + - EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) { - error = -ENOSPC; - goto out; - } - - error = ext4_convert_inline_data_nolock(handle, inode, &iloc); -out: - brelse(iloc.bh); - return error; -} - int ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2ea07efbe016..4efe77286ecd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -402,9 +402,9 @@ static int __check_block_validity(struct inode *inode, const char *func, if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, - "lblock %lu mapped to illegal pblock " + "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, - map->m_len); + map->m_pblk, map->m_len); return -EFSCORRUPTED; } return 0; @@ -1389,9 +1389,10 @@ static int ext4_write_end(struct file *file, loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; + int inline_data = ext4_has_inline_data(inode); trace_ext4_write_end(inode, pos, len, copied); - if (ext4_has_inline_data(inode)) { + if (inline_data) { ret = ext4_write_inline_data_end(inode, pos, len, copied, page); if (ret < 0) { @@ -1419,7 +1420,7 @@ static int ext4_write_end(struct file *file, * ordering of page lock and transaction start 
for journaling * filesystems. */ - if (i_size_changed) + if (i_size_changed || inline_data) ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && ext4_can_truncate(inode)) @@ -1493,6 +1494,7 @@ static int ext4_journalled_write_end(struct file *file, int partial = 0; unsigned from, to; int size_changed = 0; + int inline_data = ext4_has_inline_data(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); @@ -1500,7 +1502,7 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (ext4_has_inline_data(inode)) { + if (inline_data) { ret = ext4_write_inline_data_end(inode, pos, len, copied, page); if (ret < 0) { @@ -1531,7 +1533,7 @@ static int ext4_journalled_write_end(struct file *file, if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); - if (size_changed) { + if (size_changed || inline_data) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; @@ -2028,11 +2030,7 @@ static int __ext4_journalled_writepage(struct page *page, } if (inline_data) { - BUFFER_TRACE(inode_bh, "get write access"); - ret = ext4_journal_get_write_access(handle, inode_bh); - - err = ext4_handle_dirty_metadata(handle, inode, inode_bh); - + ret = ext4_mark_inode_dirty(handle, inode); } else { ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, do_journal_get_write_access); @@ -4506,7 +4504,8 @@ static int __ext4_get_inode_loc(struct inode *inode, int inodes_per_block, inode_offset; iloc->bh = NULL; - if (!ext4_valid_inum(sb, inode->i_ino)) + if (inode->i_ino < EXT4_ROOT_INO || + inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) return -EFSCORRUPTED; iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6eae2b91aafa..f7ab34088162 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2423,7 +2423,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, * 
initialize bb_free to be able to skip * empty groups without initialization */ - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + if (ext4_has_group_desc_csum(sb) && + (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { meta_group_info[i]->bb_free = ext4_free_clusters_after_init(sb, group, desc); } else { @@ -2989,7 +2990,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, #endif ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 27b9a76a0dfa..638ad4743477 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -186,11 +186,8 @@ static int kmmpd(void *data) goto exit_thread; } - if (sb_rdonly(sb)) { - ext4_warning(sb, "kmmpd being stopped since filesystem " - "has been remounted as readonly."); - goto exit_thread; - } + if (sb_rdonly(sb)) + break; diff = jiffies - last_update_time; if (diff < mmp_update_interval * HZ) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0c4c2201b3aa..b7f7922061be 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -405,6 +405,9 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) static void ext4_handle_error(struct super_block *sb) { + if (test_opt(sb, WARN_ON_ERROR)) + WARN_ON_ONCE(1); + if (sb_rdonly(sb)) return; @@ -740,6 +743,9 @@ __acquires(bitlock) va_end(args); } + if (test_opt(sb, WARN_ON_ERROR)) + WARN_ON_ONCE(1); + if (test_opt(sb, ERRORS_CONT)) { ext4_commit_super(sb, 0); return; @@ -1371,7 +1377,8 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, - Opt_stripe, Opt_delalloc, 
Opt_nodelalloc, Opt_mblk_io_submit, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, + Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, @@ -1438,6 +1445,8 @@ static const match_table_t tokens = { {Opt_dax, "dax"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, + {Opt_warn_on_error, "warn_on_error"}, + {Opt_nowarn_on_error, "nowarn_on_error"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"}, @@ -1602,6 +1611,8 @@ static const struct mount_opts { MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_CLEAR}, + {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, + {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, @@ -2331,6 +2342,7 @@ static int ext4_check_descriptors(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); ext4_fsblk_t last_block; + ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0); ext4_fsblk_t block_bitmap; ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; @@ -2363,6 +2375,14 @@ static int ext4_check_descriptors(struct super_block *sb, if (!sb_rdonly(sb)) return 0; } + if (block_bitmap >= sb_block + 1 && + block_bitmap <= last_bg_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Block bitmap for group %u overlaps " + "block group descriptors", i); + if (!sb_rdonly(sb)) + return 0; + } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " @@ -2377,6 +2397,14 @@ static int 
ext4_check_descriptors(struct super_block *sb, if (!sb_rdonly(sb)) return 0; } + if (inode_bitmap >= sb_block + 1 && + inode_bitmap <= last_bg_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode bitmap for group %u overlaps " + "block group descriptors", i); + if (!sb_rdonly(sb)) + return 0; + } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " @@ -2391,6 +2419,14 @@ static int ext4_check_descriptors(struct super_block *sb, if (!sb_rdonly(sb)) return 0; } + if (inode_table >= sb_block + 1 && + inode_table <= last_bg_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode table for group %u overlaps " + "block group descriptors", i); + if (!sb_rdonly(sb)) + return 0; + } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -3097,6 +3133,9 @@ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *gdp = NULL; + if (!ext4_has_group_desc_csum(sb)) + return ngroups; + for (group = 0; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) @@ -3742,6 +3781,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) le32_to_cpu(es->s_log_block_size)); goto failed_mount; } + if (le32_to_cpu(es->s_log_cluster_size) > + (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log cluster size: %u", + le32_to_cpu(es->s_log_cluster_size)); + goto failed_mount; + } if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { ext4_msg(sb, KERN_ERR, @@ -3806,6 +3852,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } else { sbi->s_inode_size = le16_to_cpu(es->s_inode_size); sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if (sbi->s_first_ino 
< EXT4_GOOD_OLD_FIRST_INO) { + ext4_msg(sb, KERN_ERR, "invalid first ino: %u", + sbi->s_first_ino); + goto failed_mount; + } if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > blocksize)) { @@ -3882,13 +3933,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "block size (%d)", clustersize, blocksize); goto failed_mount; } - if (le32_to_cpu(es->s_log_cluster_size) > - (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Invalid log cluster size: %u", - le32_to_cpu(es->s_log_cluster_size)); - goto failed_mount; - } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); sbi->s_clusters_per_group = @@ -3909,10 +3953,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } else { if (clustersize != blocksize) { - ext4_warning(sb, "fragment/cluster size (%d) != " - "block size (%d)", clustersize, - blocksize); - clustersize = blocksize; + ext4_msg(sb, KERN_ERR, + "fragment/cluster size (%d) != " + "block size (%d)", clustersize, blocksize); + goto failed_mount; } if (sbi->s_blocks_per_group > blocksize * 8) { ext4_msg(sb, KERN_ERR, @@ -3966,6 +4010,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_blocks_count(es)); goto failed_mount; } + if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && + (sbi->s_cluster_ratio == 1)) { + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " + "block is 0 with a 1k block and cluster size"); + goto failed_mount; + } + blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + EXT4_BLOCKS_PER_GROUP(sb) - 1); @@ -4001,6 +4052,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ret = -ENOMEM; goto failed_mount; } + if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != + le32_to_cpu(es->s_inodes_count)) { + ext4_msg(sb, KERN_ERR, "inodes count not 
valid: %u vs %llu", + le32_to_cpu(es->s_inodes_count), + ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); + ret = -EINVAL; + goto failed_mount; + } bgl_lock_init(sbi->s_blockgroup_lock); @@ -4020,14 +4079,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } + sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); ret = -EFSCORRUPTED; goto failed_mount2; } - sbi->s_gdb_count = db_count; - timer_setup(&sbi->s_err_report, print_daily_error_info, 0); /* Register extent status tree shrinker */ @@ -4736,6 +4794,14 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (!sbh || block_device_ejected(sb)) return error; + + /* + * The superblock bh should be mapped, but it might not be if the + * device was hot-removed. Not much we can do but fail the I/O. + */ + if (!buffer_mapped(sbh)) + return error; + /* * If the file system is mounted read-only, don't update the * superblock write time. 
This avoids updating the superblock @@ -5140,6 +5206,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) ext4_mark_recovery_complete(sb, es); + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); } else { /* Make sure we can mount this feature set readwrite */ if (ext4_has_feature_readonly(sb) || diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fc4ced59c565..723df14f4084 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -230,12 +230,12 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, { int error = -EFSCORRUPTED; - if (buffer_verified(bh)) - return 0; - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) goto errout; + if (buffer_verified(bh)) + return 0; + error = -EFSBADCRC; if (!ext4_xattr_block_csum_verify(inode, bh)) goto errout; @@ -1560,7 +1560,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, handle_t *handle, struct inode *inode, bool is_block) { - struct ext4_xattr_entry *last; + struct ext4_xattr_entry *last, *next; struct ext4_xattr_entry *here = s->here; size_t min_offs = s->end - s->base, name_len = strlen(i->name); int in_inode = i->in_inode; @@ -1595,7 +1595,13 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, /* Compute min_offs and last. 
*/ last = s->first; - for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + for (; !IS_LAST_ENTRY(last); last = next) { + next = EXT4_XATTR_NEXT(last); + if ((void *)next >= s->end) { + EXT4_ERROR_INODE(inode, "corrupted xattr entries"); + ret = -EFSCORRUPTED; + goto out; + } if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) @@ -2206,23 +2212,8 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); - if (error) { - if (error == -ENOSPC && - ext4_has_inline_data(inode)) { - error = ext4_try_to_evict_inline_data(handle, inode, - EXT4_XATTR_LEN(strlen(i->name) + - EXT4_XATTR_SIZE(i->value_len))); - if (error) - return error; - error = ext4_xattr_ibody_find(inode, i, is); - if (error) - return error; - error = ext4_xattr_set_entry(i, s, handle, inode, - false /* is_block */); - } - if (error) - return error; - } + if (error) + return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); @@ -2651,6 +2642,11 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + /* never move system.data out of the inode */ + if ((last->e_name_len == 4) && + (last->e_name_index == EXT4_XATTR_INDEX_SYSTEM) && + !memcmp(last->e_name, "data", 4)) + continue; total_size = EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_inum) total_size += EXT4_XATTR_SIZE( diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 065dc919a0ce..bfd589ea74c0 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -707,13 +707,21 @@ static void fat_set_state(struct super_block *sb, brelse(bh); } +static void fat_reset_iocharset(struct 
fat_mount_options *opts) +{ + if (opts->iocharset != fat_default_iocharset) { + /* Note: opts->iocharset can be NULL here */ + kfree(opts->iocharset); + opts->iocharset = fat_default_iocharset; + } +} + static void delayed_free(struct rcu_head *p) { struct msdos_sb_info *sbi = container_of(p, struct msdos_sb_info, rcu); unload_nls(sbi->nls_disk); unload_nls(sbi->nls_io); - if (sbi->options.iocharset != fat_default_iocharset) - kfree(sbi->options.iocharset); + fat_reset_iocharset(&sbi->options); kfree(sbi); } @@ -1132,7 +1140,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, opts->fs_fmask = opts->fs_dmask = current_umask(); opts->allow_utime = -1; opts->codepage = fat_default_codepage; - opts->iocharset = fat_default_iocharset; + fat_reset_iocharset(opts); if (is_vfat) { opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95; opts->rodir = 0; @@ -1289,8 +1297,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, /* vfat specific */ case Opt_charset: - if (opts->iocharset != fat_default_iocharset) - kfree(opts->iocharset); + fat_reset_iocharset(opts); iocharset = match_strdup(&args[0]); if (!iocharset) return -ENOMEM; @@ -1881,8 +1888,7 @@ out_fail: iput(fat_inode); unload_nls(sbi->nls_io); unload_nls(sbi->nls_disk); - if (sbi->options.iocharset != fat_default_iocharset) - kfree(sbi->options.iocharset); + fat_reset_iocharset(&sbi->options); sb->s_fs_info = NULL; kfree(sbi); return error; diff --git a/fs/file_table.c b/fs/file_table.c index 7ec0b3e5f05d..d6eccd04d703 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -51,6 +51,7 @@ static void file_free_rcu(struct rcu_head *head) static inline void file_free(struct file *f) { + security_file_free(f); percpu_counter_dec(&nr_files); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -100,9 +101,8 @@ int proc_nr_files(struct ctl_table *table, int write, * done, you will imbalance int the mount's writer count * and a warning at __fput() time. 
*/ -struct file *get_empty_filp(void) +struct file *alloc_empty_file(int flags, const struct cred *cred) { - const struct cred *cred = current_cred(); static long old_max; struct file *f; int error; @@ -123,11 +123,10 @@ struct file *get_empty_filp(void) if (unlikely(!f)) return ERR_PTR(-ENOMEM); - percpu_counter_inc(&nr_files); f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { - file_free(f); + file_free_rcu(&f->f_u.fu_rcuhead); return ERR_PTR(error); } @@ -136,7 +135,10 @@ struct file *get_empty_filp(void) spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); eventpoll_init_file(f); + f->f_flags = flags; + f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ + percpu_counter_inc(&nr_files); return f; over: @@ -152,15 +154,15 @@ over: * alloc_file - allocate and initialize a 'struct file' * * @path: the (dentry, vfsmount) pair for the new file - * @mode: the mode with which the new file will be opened + * @flags: O_... flags with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ -struct file *alloc_file(const struct path *path, fmode_t mode, +static struct file *alloc_file(const struct path *path, int flags, const struct file_operations *fop) { struct file *file; - file = get_empty_filp(); + file = alloc_empty_file(flags, current_cred()); if (IS_ERR(file)) return file; @@ -168,19 +170,56 @@ struct file *alloc_file(const struct path *path, fmode_t mode, file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); - if ((mode & FMODE_READ) && + if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) - mode |= FMODE_CAN_READ; - if ((mode & FMODE_WRITE) && + file->f_mode |= FMODE_CAN_READ; + if ((file->f_mode & FMODE_WRITE) && likely(fop->write || fop->write_iter)) - mode |= FMODE_CAN_WRITE; - file->f_mode = mode; + file->f_mode |= FMODE_CAN_WRITE; + file->f_mode |= FMODE_OPENED; file->f_op = 
fop; - if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); return file; } -EXPORT_SYMBOL(alloc_file); + +struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, + const char *name, int flags, + const struct file_operations *fops) +{ + static const struct dentry_operations anon_ops = { + .d_dname = simple_dname + }; + struct qstr this = QSTR_INIT(name, strlen(name)); + struct path path; + struct file *file; + + path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); + if (!path.dentry) + return ERR_PTR(-ENOMEM); + if (!mnt->mnt_sb->s_d_op) + d_set_d_op(path.dentry, &anon_ops); + path.mnt = mntget(mnt); + d_instantiate(path.dentry, inode); + file = alloc_file(&path, flags, fops); + if (IS_ERR(file)) { + ihold(inode); + path_put(&path); + } + return file; +} +EXPORT_SYMBOL(alloc_file_pseudo); + +struct file *alloc_file_clone(struct file *base, int flags, + const struct file_operations *fops) +{ + struct file *f = alloc_file(&base->f_path, flags, fops); + if (!IS_ERR(f)) { + path_get(&f->f_path); + f->f_mapping = base->f_mapping; + } + return f; +} /* the real guts of fput() - releasing the last reference to file */ @@ -190,6 +229,9 @@ static void __fput(struct file *file) struct vfsmount *mnt = file->f_path.mnt; struct inode *inode = file->f_inode; + if (unlikely(!(file->f_mode & FMODE_OPENED))) + goto out; + might_sleep(); fsnotify_close(file); @@ -207,7 +249,6 @@ static void __fput(struct file *file) } if (file->f_op->release) file->f_op->release(inode, file); - security_file_free(file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && !(file->f_mode & FMODE_PATH))) { cdev_put(inode->i_cdev); @@ -220,12 +261,10 @@ static void __fput(struct file *file) put_write_access(inode); __mnt_drop_write(mnt); } - file->f_path.dentry = NULL; - file->f_path.mnt = NULL; - file->f_inode = NULL; - file_free(file); dput(dentry); mntput(mnt); +out: + 
file_free(file); } static LLIST_HEAD(delayed_fput_list); @@ -300,14 +339,6 @@ void __fput_sync(struct file *file) EXPORT_SYMBOL(fput); -void put_filp(struct file *file) -{ - if (atomic_long_dec_and_test(&file->f_count)) { - security_file_free(file); - file_free(file); - } -} - void __init files_init(void) { filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index c184c5a356ff..cdcb376ef8df 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -220,6 +220,7 @@ int fscache_add_cache(struct fscache_cache *cache, { struct fscache_cache_tag *tag; + ASSERTCMP(ifsdef->cookie, ==, &fscache_fsdef_index); BUG_ON(!cache->ops); BUG_ON(!ifsdef); @@ -248,7 +249,6 @@ int fscache_add_cache(struct fscache_cache *cache, if (!cache->kobj) goto error; - ifsdef->cookie = &fscache_fsdef_index; ifsdef->cache = cache; cache->fsdef = ifsdef; diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 97137d7ec5ee..83bfe04456b6 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -516,6 +516,7 @@ static int fscache_alloc_object(struct fscache_cache *cache, goto error; } + ASSERTCMP(object->cookie, ==, cookie); fscache_stat(&fscache_n_object_alloc); object->debug_id = atomic_inc_return(&fscache_object_debug_id); @@ -571,6 +572,8 @@ static int fscache_attach_object(struct fscache_cookie *cookie, _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); + ASSERTCMP(object->cookie, ==, cookie); + spin_lock(&cookie->lock); /* there may be multiple initial creations of this object, but we only @@ -610,9 +613,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, spin_unlock(&cache->object_list_lock); } - /* attach to the cookie */ - object->cookie = cookie; - fscache_cookie_get(cookie, fscache_cookie_get_attach_object); + /* Attach to the cookie. The object already has a ref on it. 
*/ hlist_add_head(&object->cookie_link, &cookie->backing_objects); fscache_objlist_add(object); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 20e0d0a4dc8c..9edc920f651f 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -327,6 +327,7 @@ void fscache_object_init(struct fscache_object *object, object->store_limit_l = 0; object->cache = cache; object->cookie = cookie; + fscache_cookie_get(cookie, fscache_cookie_get_attach_object); object->parent = NULL; #ifdef CONFIG_FSCACHE_OBJECT_LIST RB_CLEAR_NODE(&object->objlist_link); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index e30c5975ea58..8d265790374c 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -70,7 +70,8 @@ void fscache_enqueue_operation(struct fscache_operation *op) ASSERT(op->processor != NULL); ASSERT(fscache_object_is_available(op->object)); ASSERTCMP(atomic_read(&op->usage), >, 0); - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); + ASSERTIFCMP(op->state != FSCACHE_OP_ST_IN_PROGRESS, + op->state, ==, FSCACHE_OP_ST_CANCELLED); fscache_stat(&fscache_n_op_enqueue); switch (op->flags & FSCACHE_OP_TYPE) { @@ -499,7 +500,8 @@ void fscache_put_operation(struct fscache_operation *op) struct fscache_cache *cache; _enter("{OBJ%x OP%x,%d}", - op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + op->object ? 
op->object->debug_id : 0, + op->debug_id, atomic_read(&op->usage)); ASSERTCMP(atomic_read(&op->usage), >, 0); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 56231b31f806..d80aab0d5982 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -399,7 +399,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, */ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned flags, - umode_t mode, int *opened) + umode_t mode) { int err; struct inode *inode; @@ -469,7 +469,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); fuse_invalidate_attr(dir); - err = finish_open(file, entry, generic_file_open, opened); + err = finish_open(file, entry, generic_file_open); if (err) { fuse_sync_release(ff, flags); } else { @@ -489,7 +489,7 @@ out_err: static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t); static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned flags, - umode_t mode, int *opened) + umode_t mode) { int err; struct fuse_conn *fc = get_fuse_conn(dir); @@ -508,12 +508,12 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, goto no_open; /* Only creates */ - *opened |= FILE_CREATED; + file->f_mode |= FMODE_CREATED; if (fc->no_create) goto mknod; - err = fuse_create_open(dir, entry, file, flags, mode, opened); + err = fuse_create_open(dir, entry, file, flags, mode); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; @@ -539,6 +539,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, { struct fuse_entry_out outarg; struct inode *inode; + struct dentry *d; int err; struct fuse_forget_link *forget; @@ -570,11 +571,17 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, } kfree(forget); - err = d_instantiate_no_diralias(entry, inode); - if (err) - return err; + d_drop(entry); + d = 
d_splice_alias(inode, entry); + if (IS_ERR(d)) + return PTR_ERR(d); - fuse_change_entry_timeout(entry, &outarg); + if (d) { + fuse_change_entry_timeout(d, &outarg); + dput(d); + } else { + fuse_change_entry_timeout(entry, &outarg); + } fuse_invalidate_attr(dir); return 0; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index feda55f67050..648f0ca1ad57 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -580,7 +580,7 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct file *file, umode_t mode, dev_t dev, const char *symname, - unsigned int size, int excl, int *opened) + unsigned int size, int excl) { const struct qstr *name = &dentry->d_name; struct posix_acl *default_acl, *acl; @@ -626,7 +626,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = 0; if (file) { if (S_ISREG(inode->i_mode)) - error = finish_open(file, dentry, gfs2_open_common, opened); + error = finish_open(file, dentry, gfs2_open_common); else error = finish_no_open(file, NULL); } @@ -767,8 +767,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, mark_inode_dirty(inode); d_instantiate(dentry, inode); if (file) { - *opened |= FILE_CREATED; - error = finish_open(file, dentry, gfs2_open_common, opened); + file->f_mode |= FMODE_CREATED; + error = finish_open(file, dentry, gfs2_open_common); } gfs2_glock_dq_uninit(ghs); gfs2_glock_dq_uninit(ghs + 1); @@ -822,7 +822,7 @@ fail: static int gfs2_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL); + return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl); } /** @@ -830,14 +830,13 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry, * @dir: The directory inode * @dentry: The dentry of the new inode * @file: File to be opened - * @opened: atomic_open 
flags * * * Returns: errno */ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, - struct file *file, int *opened) + struct file *file) { struct inode *inode; struct dentry *d; @@ -866,7 +865,7 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, return d; } if (file && S_ISREG(inode->i_mode)) - error = finish_open(file, dentry, gfs2_open_common, opened); + error = finish_open(file, dentry, gfs2_open_common); gfs2_glock_dq_uninit(&gh); if (error) { @@ -879,7 +878,7 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { - return __gfs2_lookup(dir, dentry, NULL, NULL); + return __gfs2_lookup(dir, dentry, NULL); } /** @@ -1189,7 +1188,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, if (size >= gfs2_max_stuffed_size(GFS2_I(dir))) return -ENAMETOOLONG; - return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL); + return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0); } /** @@ -1204,7 +1203,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir)); - return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL); + return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0); } /** @@ -1219,7 +1218,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL); + return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0); } /** @@ -1229,14 +1228,13 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, * @file: 
The proposed new struct file * @flags: open flags * @mode: File mode - * @opened: Flag to say whether the file has been opened or not * * Returns: error code or 0 for success */ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, - umode_t mode, int *opened) + umode_t mode) { struct dentry *d; bool excl = !!(flags & O_EXCL); @@ -1244,13 +1242,13 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, if (!d_in_lookup(dentry)) goto skip_lookup; - d = __gfs2_lookup(dir, dentry, file, opened); + d = __gfs2_lookup(dir, dentry, file); if (IS_ERR(d)) return PTR_ERR(d); if (d != NULL) dentry = d; if (d_really_is_positive(dentry)) { - if (!(*opened & FILE_OPENED)) + if (!(file->f_mode & FMODE_OPENED)) return finish_no_open(file, d); dput(d); return 0; @@ -1262,7 +1260,7 @@ skip_lookup: if (!(flags & O_CREAT)) return -ENOENT; - return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened); + return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl); } /* diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 2a16111d312f..a2dfa1b2a89c 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -541,7 +541,7 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, HFS_I(inode)->rsrc_inode = dir; HFS_I(dir)->rsrc_inode = inode; igrab(dir); - hlist_add_fake(&inode->i_hash); + inode_fake_hash(inode); mark_inode_dirty(inode); dont_mount(dentry); out: diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2597b290c2a5..444c7b170359 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -610,33 +610,21 @@ static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, int err; inode = hostfs_iget(ino->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); + if (IS_ERR(inode)) goto out; - } err = -ENOMEM; name = dentry_name(dentry); - if (name == NULL) - goto out_put; - - err = read_name(inode, name); - - 
__putname(name); - if (err == -ENOENT) { + if (name) { + err = read_name(inode, name); + __putname(name); + } + if (err) { iput(inode); - inode = NULL; + inode = (err == -ENOENT) ? NULL : ERR_PTR(err); } - else if (err) - goto out_put; - - d_add(dentry, inode); - return NULL; - - out_put: - iput(inode); out: - return ERR_PTR(err); + return d_splice_alias(inode, dentry); } static int hostfs_link(struct dentry *to, struct inode *ino, diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index c83ece7facc5..d85230c84ef2 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -244,6 +244,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in result = iget_locked(dir->i_sb, ino); if (!result) { hpfs_error(dir->i_sb, "hpfs_lookup: can't get inode"); + result = ERR_PTR(-ENOMEM); goto bail1; } if (result->i_state & I_NEW) { @@ -266,6 +267,8 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in if (de->has_acl || de->has_xtd_perm) if (!sb_rdonly(dir->i_sb)) { hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures"); + iput(result); + result = ERR_PTR(-EINVAL); goto bail1; } @@ -301,29 +304,17 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in } } +bail1: hpfs_brelse4(&qbh); /* * Made it. */ - end: - end_add: +end: +end_add: hpfs_unlock(dir->i_sb); - d_add(dentry, result); - return NULL; - - /* - * Didn't. 
- */ - bail1: - - hpfs_brelse4(&qbh); - - /*bail:*/ - - hpfs_unlock(dir->i_sb); - return ERR_PTR(-ENOENT); + return d_splice_alias(result, dentry); } const struct file_operations hpfs_dir_ops = diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d508c7844681..346a146c7617 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -411,6 +411,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, bool truncate_op = (lend == LLONG_MAX); memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); + vma_init(&pseudo_vma, current->mm); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pagevec_init(&pvec); next = start; @@ -595,6 +596,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * as input to create an allocation policy. */ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); + vma_init(&pseudo_vma, mm); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; @@ -1308,10 +1310,6 @@ static int get_hstate_idx(int page_size_log) return h - hstates; } -static const struct dentry_operations anon_ops = { - .d_dname = simple_dname -}; - /* * Note that size should be aligned to proper hugepage size in caller side, * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. 
@@ -1320,19 +1318,18 @@ struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, struct user_struct **user, int creat_flags, int page_size_log) { - struct file *file = ERR_PTR(-ENOMEM); struct inode *inode; - struct path path; - struct super_block *sb; - struct qstr quick_string; + struct vfsmount *mnt; int hstate_idx; + struct file *file; hstate_idx = get_hstate_idx(page_size_log); if (hstate_idx < 0) return ERR_PTR(-ENODEV); *user = NULL; - if (!hugetlbfs_vfsmount[hstate_idx]) + mnt = hugetlbfs_vfsmount[hstate_idx]; + if (!mnt) return ERR_PTR(-ENOENT); if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { @@ -1348,45 +1345,28 @@ struct file *hugetlb_file_setup(const char *name, size_t size, } } - sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb; - quick_string.name = name; - quick_string.len = strlen(quick_string.name); - quick_string.hash = 0; - path.dentry = d_alloc_pseudo(sb, &quick_string); - if (!path.dentry) - goto out_shm_unlock; - - d_set_d_op(path.dentry, &anon_ops); - path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); file = ERR_PTR(-ENOSPC); - inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); + inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) - goto out_dentry; + goto out; if (creat_flags == HUGETLB_SHMFS_INODE) inode->i_flags |= S_PRIVATE; - file = ERR_PTR(-ENOMEM); - if (hugetlb_reserve_pages(inode, 0, - size >> huge_page_shift(hstate_inode(inode)), NULL, - acctflag)) - goto out_inode; - - d_instantiate(path.dentry, inode); inode->i_size = size; clear_nlink(inode); - file = alloc_file(&path, FMODE_WRITE | FMODE_READ, - &hugetlbfs_file_operations); - if (IS_ERR(file)) - goto out_dentry; /* inode is already attached */ - - return file; + if (hugetlb_reserve_pages(inode, 0, + size >> huge_page_shift(hstate_inode(inode)), NULL, + acctflag)) + file = ERR_PTR(-ENOMEM); + else + file = alloc_file_pseudo(inode, mnt, name, O_RDWR, + &hugetlbfs_file_operations); + if 
(!IS_ERR(file)) + return file; -out_inode: iput(inode); -out_dentry: - path_put(&path); -out_shm_unlock: +out: if (*user) { user_shm_unlock(size, *user); *user = NULL; diff --git a/fs/inode.c b/fs/inode.c index 2c300e981796..a06de4454232 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -804,6 +804,10 @@ repeat: __wait_on_freeing_inode(inode); goto repeat; } + if (unlikely(inode->i_state & I_CREATING)) { + spin_unlock(&inode->i_lock); + return ERR_PTR(-ESTALE); + } __iget(inode); spin_unlock(&inode->i_lock); return inode; @@ -831,6 +835,10 @@ repeat: __wait_on_freeing_inode(inode); goto repeat; } + if (unlikely(inode->i_state & I_CREATING)) { + spin_unlock(&inode->i_lock); + return ERR_PTR(-ESTALE); + } __iget(inode); spin_unlock(&inode->i_lock); return inode; @@ -961,13 +969,26 @@ void unlock_new_inode(struct inode *inode) lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); - inode->i_state &= ~I_NEW; + inode->i_state &= ~I_NEW & ~I_CREATING; smp_mb(); wake_up_bit(&inode->i_state, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); +void discard_new_inode(struct inode *inode) +{ + lockdep_annotate_inode_mutex_key(inode); + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW; + smp_mb(); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); + iput(inode); +} +EXPORT_SYMBOL(discard_new_inode); + /** * lock_two_nondirectories - take two i_mutexes on non-directory objects * @@ -1029,6 +1050,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; + bool creating = inode->i_state & I_CREATING; again: spin_lock(&inode_hash_lock); @@ -1039,6 +1061,8 @@ again: * Use the old inode instead of the preallocated one. 
*/ spin_unlock(&inode_hash_lock); + if (IS_ERR(old)) + return NULL; wait_on_inode(old); if (unlikely(inode_unhashed(old))) { iput(old); @@ -1060,6 +1084,8 @@ again: inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); spin_unlock(&inode->i_lock); + if (!creating) + inode_sb_list_add(inode); unlock: spin_unlock(&inode_hash_lock); @@ -1094,12 +1120,13 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, struct inode *inode = ilookup5(sb, hashval, test, data); if (!inode) { - struct inode *new = new_inode(sb); + struct inode *new = alloc_inode(sb); if (new) { + new->i_state = 0; inode = inode_insert5(new, hashval, test, set, data); if (unlikely(inode != new)) - iput(new); + destroy_inode(new); } } return inode; @@ -1128,6 +1155,8 @@ again: inode = find_inode_fast(sb, head, ino); spin_unlock(&inode_hash_lock); if (inode) { + if (IS_ERR(inode)) + return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); @@ -1165,6 +1194,8 @@ again: */ spin_unlock(&inode_hash_lock); destroy_inode(inode); + if (IS_ERR(old)) + return NULL; inode = old; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { @@ -1282,7 +1313,7 @@ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, inode = find_inode(sb, head, test, data); spin_unlock(&inode_hash_lock); - return inode; + return IS_ERR(inode) ? 
NULL : inode; } EXPORT_SYMBOL(ilookup5_nowait); @@ -1338,6 +1369,8 @@ again: spin_unlock(&inode_hash_lock); if (inode) { + if (IS_ERR(inode)) + return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); @@ -1421,12 +1454,17 @@ int insert_inode_locked(struct inode *inode) } if (likely(!old)) { spin_lock(&inode->i_lock); - inode->i_state |= I_NEW; + inode->i_state |= I_NEW | I_CREATING; hlist_add_head(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); return 0; } + if (unlikely(old->i_state & I_CREATING)) { + spin_unlock(&old->i_lock); + spin_unlock(&inode_hash_lock); + return -EBUSY; + } __iget(old); spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); @@ -1443,7 +1481,10 @@ EXPORT_SYMBOL(insert_inode_locked); int insert_inode_locked4(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct inode *old = inode_insert5(inode, hashval, test, NULL, data); + struct inode *old; + + inode->i_state |= I_CREATING; + old = inode_insert5(inode, hashval, test, NULL, data); if (old != inode) { iput(old); @@ -1999,8 +2040,14 @@ void inode_init_owner(struct inode *inode, const struct inode *dir, inode->i_uid = current_fsuid(); if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; + + /* Directories are special, and always inherit S_ISGID */ if (S_ISDIR(mode)) mode |= S_ISGID; + else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && + !in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(dir, CAP_FSETID)) + mode &= ~S_ISGID; } else inode->i_gid = current_fsgid(); inode->i_mode = mode; diff --git a/fs/internal.h b/fs/internal.h index 4a18bdbd2214..50a28fc71300 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -95,7 +95,7 @@ extern void chroot_fs_refs(const struct path *, const struct path *); /* * file_table.c */ -extern struct file *get_empty_filp(void); +extern struct file *alloc_empty_file(int, const struct cred *); /* * super.c @@ -127,9 +127,7 @@ 
int do_fchmodat(int dfd, const char __user *filename, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); -extern int open_check_o_direct(struct file *f); -extern int vfs_open(const struct path *, struct file *, const struct cred *); -extern struct file *filp_clone_open(struct file *); +extern int vfs_open(const struct path *, struct file *); /* * inode.c diff --git a/fs/iomap.c b/fs/iomap.c index 13cdcf33e6c0..530670608fc8 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1859,7 +1859,7 @@ iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops) { struct inode *inode = mapping->host; - loff_t pos = bno >> inode->i_blkbits; + loff_t pos = bno << inode->i_blkbits; unsigned blocksize = i_blocksize(inode); if (filemap_write_and_wait(mapping)) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 51dd68e67b0f..c0b66a7a795b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1361,6 +1361,13 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) { jbd_lock_bh_state(bh); + if (jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata) + pr_err("JBD2: assertion failure: h_type=%u " + "h_line_no=%u block_no=%llu jlist=%u\n", + handle->h_type, handle->h_line_no, + (unsigned long long) bh->b_blocknr, + jh->b_jlist); J_ASSERT_JH(jh, jh->b_transaction != transaction || jh->b_jlist == BJ_Metadata); jbd_unlock_bh_state(bh); @@ -1380,11 +1387,11 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * of the transaction. 
This needs to be done * once a transaction -bzzz */ - jh->b_modified = 1; if (handle->h_buffer_credits <= 0) { ret = -ENOSPC; goto out_unlock_bh; } + jh->b_modified = 1; handle->h_buffer_credits--; } diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h index 395c4c0d0f06..1682a87c00b2 100644 --- a/fs/jfs/jfs_dinode.h +++ b/fs/jfs/jfs_dinode.h @@ -115,6 +115,13 @@ struct dinode { dxd_t _dxd; /* 16: */ union { __le32 _rdev; /* 4: */ + /* + * The fast symlink area + * is expected to overflow + * into _inlineea when + * needed (which will clear + * INLINEEA). + */ u8 _fastsymlink[128]; } _u; u8 _inlineea[128]; diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index f36ef68905a7..93e8c590ff5c 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -491,13 +491,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) /* release the page */ release_metapage(mp); - /* - * __mark_inode_dirty expects inodes to be hashed. Since we don't - * want special inodes in the fileset inode space, we make them - * appear hashed, but do not put on any lists. hlist_del() - * will work fine and require no locking. 
- */ - hlist_add_fake(&ip->i_hash); + inode_fake_hash(ip); return (ip); } diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 1f26d1910409..9940a1e04cbf 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -87,6 +87,7 @@ struct jfs_inode_info { struct { unchar _unused[16]; /* 16: */ dxd_t _dxd; /* 16: */ + /* _inline may overflow into _inline_ea when needed */ unchar _inline[128]; /* 128: inline symlink */ /* _inline_ea may overlay the last part of * file._xtroot if maxentry = XTROOTINITSLOT diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 5e9b7bb3aabf..4572b7cf183d 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -61,8 +61,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) inode = new_inode(sb); if (!inode) { jfs_warn("ialloc: new_inode returned NULL!"); - rc = -ENOMEM; - goto fail; + return ERR_PTR(-ENOMEM); } jfs_inode = JFS_IP(inode); @@ -70,8 +69,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode) rc = diAlloc(parent, S_ISDIR(mode), inode); if (rc) { jfs_warn("ialloc: diAlloc returned %d!", rc); - if (rc == -EIO) - make_bad_inode(inode); goto fail_put; } @@ -141,9 +138,10 @@ fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; clear_nlink(inode); - unlock_new_inode(inode); + discard_new_inode(inode); + return ERR_PTR(rc); + fail_put: iput(inode); -fail: return ERR_PTR(rc); } diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 56c3fcbfe80e..14528c0ffe63 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -175,8 +175,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, if (rc) { free_ea_wmap(ip); clear_nlink(ip); - unlock_new_inode(ip); - iput(ip); + discard_new_inode(ip); } else { d_instantiate_new(dentry, ip); } @@ -309,8 +308,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) if (rc) { free_ea_wmap(ip); clear_nlink(ip); - unlock_new_inode(ip); - iput(ip); + discard_new_inode(ip); } else { d_instantiate_new(dentry, ip); } @@ -1054,8 
+1052,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, if (rc) { free_ea_wmap(ip); clear_nlink(ip); - unlock_new_inode(ip); - iput(ip); + discard_new_inode(ip); } else { d_instantiate_new(dentry, ip); } @@ -1441,8 +1438,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, if (rc) { free_ea_wmap(ip); clear_nlink(ip); - unlock_new_inode(ip); - iput(ip); + discard_new_inode(ip); } else { d_instantiate_new(dentry, ip); } diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 1b9264fd54b6..09da5cf14e27 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -581,7 +581,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) inode->i_ino = 0; inode->i_size = i_size_read(sb->s_bdev->bd_inode); inode->i_mapping->a_ops = &jfs_metapage_aops; - hlist_add_fake(&inode->i_hash); + inode_fake_hash(inode); mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); sbi->direct_inode = inode; @@ -967,8 +967,7 @@ static int __init init_jfs_fs(void) jfs_inode_cachep = kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, - offsetof(struct jfs_inode_info, i_inline), - sizeof_field(struct jfs_inode_info, i_inline), + offsetof(struct jfs_inode_info, i_inline), IDATASIZE, init_once); if (jfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index c60f3d32ee91..a6797986b625 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -491,15 +491,17 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) if (size > PSIZE) { /* * To keep the rest of the code simple. Allocate a - * contiguous buffer to work with + * contiguous buffer to work with. Make the buffer large + * enough to make use of the whole extent. 
*/ - ea_buf->xattr = kmalloc(size, GFP_KERNEL); + ea_buf->max_size = (size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + + ea_buf->xattr = kmalloc(ea_buf->max_size, GFP_KERNEL); if (ea_buf->xattr == NULL) return -ENOMEM; ea_buf->flag = EA_MALLOC; - ea_buf->max_size = (size + sb->s_blocksize - 1) & - ~(sb->s_blocksize - 1); if (ea_size == 0) return 0; diff --git a/fs/locks.c b/fs/locks.c index db7b6917d9c5..bc047a7edc47 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -202,10 +202,6 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); * we often hold the flc_lock as well. In certain cases, when reading the fields * protected by this lock, we can skip acquiring it iff we already hold the * flc_lock. - * - * In particular, adding an entry to the fl_block list requires that you hold - * both the flc_lock and the blocked_lock_lock (acquired in that order). - * Deleting an entry from the list however only requires the file_lock_lock. */ static DEFINE_SPINLOCK(blocked_lock_lock); @@ -990,6 +986,7 @@ out: if (new_fl) locks_free_lock(new_fl); locks_dispose_list(&dispose); + trace_flock_lock_inode(inode, request, error); return error; } @@ -2072,6 +2069,13 @@ static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns) return -1; if (IS_REMOTELCK(fl)) return fl->fl_pid; + /* + * If the flock owner process is dead and its pid has been already + * freed, the translation below won't work, but we still want to show + * flock owner pid number in init pidns. 
+ */ + if (ns == &init_pid_ns) + return (pid_t)fl->fl_pid; rcu_read_lock(); pid = find_pid_ns(fl->fl_pid, &init_pid_ns); @@ -2626,12 +2630,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, fl_pid = locks_translate_pid(fl, proc_pidns); /* - * If there isn't a fl_pid don't display who is waiting on - * the lock if we are called from locks_show, or if we are - * called from __show_fd_info - skip lock entirely + * If lock owner is dead (and pid is freed) or not visible in current + * pidns, zero is shown as a pid value. Check lock info from + * init_pid_ns to get saved lock pid value. */ - if (fl_pid == 0) - return; if (fl->fl_file != NULL) inode = locks_inode(fl->fl_file); diff --git a/fs/namei.c b/fs/namei.c index 734cef54fdf8..3cd396277cd3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2028,6 +2028,8 @@ static int link_path_walk(const char *name, struct nameidata *nd) { int err; + if (IS_ERR(name)) + return PTR_ERR(name); while (*name=='/') name++; if (!*name) @@ -2125,12 +2127,15 @@ OK: } } +/* must be paired with terminate_walk() */ static const char *path_init(struct nameidata *nd, unsigned flags) { const char *s = nd->name->name; if (!*s) flags &= ~LOOKUP_RCU; + if (flags & LOOKUP_RCU) + rcu_read_lock(); nd->last_type = LAST_ROOT; /* if there are only slashes... 
*/ nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; @@ -2143,7 +2148,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { - rcu_read_lock(); nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); nd->root_seq = nd->seq; nd->m_seq = read_seqbegin(&mount_lock); @@ -2159,21 +2163,15 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->m_seq = read_seqbegin(&mount_lock); if (*s == '/') { - if (flags & LOOKUP_RCU) - rcu_read_lock(); set_root(nd); if (likely(!nd_jump_root(nd))) return s; - nd->root.mnt = NULL; - rcu_read_unlock(); return ERR_PTR(-ECHILD); } else if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; - rcu_read_lock(); - do { seq = read_seqcount_begin(&fs->seq); nd->path = fs->pwd; @@ -2195,16 +2193,13 @@ static const char *path_init(struct nameidata *nd, unsigned flags) dentry = f.file->f_path.dentry; - if (*s) { - if (!d_can_lookup(dentry)) { - fdput(f); - return ERR_PTR(-ENOTDIR); - } + if (*s && unlikely(!d_can_lookup(dentry))) { + fdput(f); + return ERR_PTR(-ENOTDIR); } nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { - rcu_read_lock(); nd->inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { @@ -2272,24 +2267,15 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path const char *s = path_init(nd, flags); int err; - if (IS_ERR(s)) - return PTR_ERR(s); - - if (unlikely(flags & LOOKUP_DOWN)) { + if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) { err = handle_lookup_down(nd); - if (unlikely(err < 0)) { - terminate_walk(nd); - return err; - } + if (unlikely(err < 0)) + s = ERR_PTR(err); } while (!(err = link_path_walk(s, nd)) && ((err = lookup_last(nd)) > 0)) { s = trailing_symlink(nd); - if (IS_ERR(s)) { - err = PTR_ERR(s); - break; - } } if (!err) err = complete_walk(nd); @@ -2336,10 +2322,7 @@ static int 
path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { const char *s = path_init(nd, flags); - int err; - if (IS_ERR(s)) - return PTR_ERR(s); - err = link_path_walk(s, nd); + int err = link_path_walk(s, nd); if (!err) err = complete_walk(nd); if (!err) { @@ -2666,15 +2649,10 @@ path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); int err; - if (IS_ERR(s)) - return PTR_ERR(s); + while (!(err = link_path_walk(s, nd)) && (err = mountpoint_last(nd)) > 0) { s = trailing_symlink(nd); - if (IS_ERR(s)) { - err = PTR_ERR(s); - break; - } } if (!err) { *path = nd->path; @@ -3027,17 +3005,16 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m * Returns 0 if successful. The file will have been created and attached to * @file by the filesystem calling finish_open(). * - * Returns 1 if the file was looked up only or didn't need creating. The - * caller will need to perform the open themselves. @path will have been - * updated to point to the new dentry. This may be negative. + * If the file was looked up only or didn't need creating, FMODE_OPENED won't + * be set. The caller will need to perform the open themselves. @path will + * have been updated to point to the new dentry. This may be negative. * * Returns an error code otherwise. 
*/ static int atomic_open(struct nameidata *nd, struct dentry *dentry, struct path *path, struct file *file, const struct open_flags *op, - int open_flag, umode_t mode, - int *opened) + int open_flag, umode_t mode) { struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = nd->path.dentry->d_inode; @@ -3052,39 +3029,38 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, file->f_path.dentry = DENTRY_NOT_SET; file->f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, - open_to_namei_flags(open_flag), - mode, opened); + open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); if (!error) { - /* - * We didn't have the inode before the open, so check open - * permission here. - */ - int acc_mode = op->acc_mode; - if (*opened & FILE_CREATED) { - WARN_ON(!(open_flag & O_CREAT)); - fsnotify_create(dir, dentry); - acc_mode = 0; - } - error = may_open(&file->f_path, acc_mode, open_flag); - if (WARN_ON(error > 0)) - error = -EINVAL; - } else if (error > 0) { - if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { + if (file->f_mode & FMODE_OPENED) { + /* + * We didn't have the inode before the open, so check open + * permission here. 
+ */ + int acc_mode = op->acc_mode; + if (file->f_mode & FMODE_CREATED) { + WARN_ON(!(open_flag & O_CREAT)); + fsnotify_create(dir, dentry); + acc_mode = 0; + } + error = may_open(&file->f_path, acc_mode, open_flag); + if (WARN_ON(error > 0)) + error = -EINVAL; + } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; } else { if (file->f_path.dentry) { dput(dentry); dentry = file->f_path.dentry; } - if (*opened & FILE_CREATED) + if (file->f_mode & FMODE_CREATED) fsnotify_create(dir, dentry); if (unlikely(d_is_negative(dentry))) { error = -ENOENT; } else { path->dentry = dentry; path->mnt = nd->path.mnt; - return 1; + return 0; } } } @@ -3095,25 +3071,22 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, /* * Look up and maybe create and open the last component. * - * Must be called with i_mutex held on parent. - * - * Returns 0 if the file was successfully atomically created (if necessary) and - * opened. In this case the file will be returned attached to @file. - * - * Returns 1 if the file was not completely opened at this time, though lookups - * and creations will have been performed and the dentry returned in @path will - * be positive upon return if O_CREAT was specified. If O_CREAT wasn't - * specified then a negative dentry may be returned. + * Must be called with parent locked (exclusive in O_CREAT case). * - * An error code is returned otherwise. + * Returns 0 on success, that is, if + * the file was successfully atomically created (if necessary) and opened, or + * the file was not completely opened at this time, though lookups and + * creations were performed. + * These case are distinguished by presence of FMODE_OPENED on file->f_mode. + * In the latter case dentry returned in @path might be negative if O_CREAT + * hadn't been specified. * - * FILE_CREATE will be set in @*opened if the dentry was created and will be - * cleared otherwise prior to returning. + * An error code is returned on failure. 
*/ static int lookup_open(struct nameidata *nd, struct path *path, struct file *file, const struct open_flags *op, - bool got_write, int *opened) + bool got_write) { struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; @@ -3126,7 +3099,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, if (unlikely(IS_DEADDIR(dir_inode))) return -ENOENT; - *opened &= ~FILE_CREATED; + file->f_mode &= ~FMODE_CREATED; dentry = d_lookup(dir, &nd->last); for (;;) { if (!dentry) { @@ -3188,7 +3161,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, if (dir_inode->i_op->atomic_open) { error = atomic_open(nd, dentry, path, file, op, open_flag, - mode, opened); + mode); if (unlikely(error == -ENOENT) && create_error) error = create_error; return error; @@ -3211,7 +3184,7 @@ no_open: /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { - *opened |= FILE_CREATED; + file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { error = -EACCES; @@ -3230,7 +3203,7 @@ no_open: out_no_open: path->dentry = dentry; path->mnt = nd->path.mnt; - return 1; + return 0; out_dput: dput(dentry); @@ -3241,8 +3214,7 @@ out_dput: * Handle the last step of open() */ static int do_last(struct nameidata *nd, - struct file *file, const struct open_flags *op, - int *opened) + struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; @@ -3308,17 +3280,17 @@ static int do_last(struct nameidata *nd, inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); - error = lookup_open(nd, &path, file, op, got_write, opened); + error = lookup_open(nd, &path, file, op, got_write); if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); - if (error <= 0) { - if (error) - goto out; + if (error) + goto out; - if ((*opened & FILE_CREATED) || + if (file->f_mode & 
FMODE_OPENED) { + if ((file->f_mode & FMODE_CREATED) || !S_ISREG(file_inode(file)->i_mode)) will_truncate = false; @@ -3326,7 +3298,7 @@ static int do_last(struct nameidata *nd, goto opened; } - if (*opened & FILE_CREATED) { + if (file->f_mode & FMODE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; will_truncate = false; @@ -3395,20 +3367,15 @@ finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; - BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ - error = vfs_open(&nd->path, file, current_cred()); + BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ + error = vfs_open(&nd->path, file); if (error) goto out; - *opened |= FILE_OPENED; opened: - error = open_check_o_direct(file); - if (!error) - error = ima_file_check(file, op->acc_mode, *opened); + error = ima_file_check(file, op->acc_mode); if (!error && will_truncate) error = handle_truncate(file); out: - if (unlikely(error) && (*opened & FILE_OPENED)) - fput(file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; @@ -3458,7 +3425,7 @@ EXPORT_SYMBOL(vfs_tmpfile); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, - struct file *file, int *opened) + struct file *file) { struct dentry *child; struct path path; @@ -3480,12 +3447,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, if (error) goto out2; file->f_path.mnt = path.mnt; - error = finish_open(file, child, NULL, opened); - if (error) - goto out2; - error = open_check_o_direct(file); - if (error) - fput(file); + error = finish_open(file, child, NULL); out2: mnt_drop_write(path.mnt); out: @@ -3499,7 +3461,7 @@ static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) int error = path_lookupat(nd, flags, &path); if (!error) { audit_inode(nd->name, path.dentry, 0); - error = vfs_open(&path, file, current_cred()); + error = vfs_open(&path, file); path_put(&path); } 
return error; @@ -3508,59 +3470,40 @@ static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) static struct file *path_openat(struct nameidata *nd, const struct open_flags *op, unsigned flags) { - const char *s; struct file *file; - int opened = 0; int error; - file = get_empty_filp(); + file = alloc_empty_file(op->open_flag, current_cred()); if (IS_ERR(file)) return file; - file->f_flags = op->open_flag; - if (unlikely(file->f_flags & __O_TMPFILE)) { - error = do_tmpfile(nd, flags, op, file, &opened); - goto out2; - } - - if (unlikely(file->f_flags & O_PATH)) { + error = do_tmpfile(nd, flags, op, file); + } else if (unlikely(file->f_flags & O_PATH)) { error = do_o_path(nd, flags, file); - if (!error) - opened |= FILE_OPENED; - goto out2; - } - - s = path_init(nd, flags); - if (IS_ERR(s)) { - put_filp(file); - return ERR_CAST(s); - } - while (!(error = link_path_walk(s, nd)) && - (error = do_last(nd, file, op, &opened)) > 0) { - nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); - s = trailing_symlink(nd); - if (IS_ERR(s)) { - error = PTR_ERR(s); - break; + } else { + const char *s = path_init(nd, flags); + while (!(error = link_path_walk(s, nd)) && + (error = do_last(nd, file, op)) > 0) { + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + s = trailing_symlink(nd); } + terminate_walk(nd); } - terminate_walk(nd); -out2: - if (!(opened & FILE_OPENED)) { - BUG_ON(!error); - put_filp(file); + if (likely(!error)) { + if (likely(file->f_mode & FMODE_OPENED)) + return file; + WARN_ON(1); + error = -EINVAL; } - if (unlikely(error)) { - if (error == -EOPENSTALE) { - if (flags & LOOKUP_RCU) - error = -ECHILD; - else - error = -ESTALE; - } - file = ERR_PTR(error); + fput(file); + if (error == -EOPENSTALE) { + if (flags & LOOKUP_RCU) + error = -ECHILD; + else + error = -ESTALE; } - return file; + return ERR_PTR(error); } struct file *do_filp_open(int dfd, struct filename *pathname, @@ -4712,29 +4655,6 @@ out: return len; } -/* - * A helper for 
->readlink(). This should be used *ONLY* for symlinks that - * have ->get_link() not calling nd_jump_link(). Using (or not using) it - * for any given inode is up to filesystem. - */ -static int generic_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - DEFINE_DELAYED_CALL(done); - struct inode *inode = d_inode(dentry); - const char *link = inode->i_link; - int res; - - if (!link) { - link = inode->i_op->get_link(dentry, inode, &done); - if (IS_ERR(link)) - return PTR_ERR(link); - } - res = readlink_copy(buffer, buflen, link); - do_delayed_call(&done); - return res; -} - /** * vfs_readlink - copy symlink body into userspace buffer * @dentry: dentry on which to get symbolic link @@ -4748,6 +4668,9 @@ static int generic_readlink(struct dentry *dentry, char __user *buffer, int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = d_inode(dentry); + DEFINE_DELAYED_CALL(done); + const char *link; + int res; if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) @@ -4761,7 +4684,15 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) spin_unlock(&inode->i_lock); } - return generic_readlink(dentry, buffer, buflen); + link = inode->i_link; + if (!link) { + link = inode->i_op->get_link(dentry, inode, &done); + if (IS_ERR(link)) + return PTR_ERR(link); + } + res = readlink_copy(buffer, buflen, link); + do_delayed_call(&done); + return res; } EXPORT_SYMBOL(vfs_readlink); diff --git a/fs/namespace.c b/fs/namespace.c index 8ddd14806799..bd2f4c68506a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -659,12 +659,21 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); + smp_mb(); // see mntput_no_expire() if (likely(!read_seqretry(&mount_lock, seq))) return 0; if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { mnt_add_count(mnt, -1); return 1; } + lock_mount_hash(); + if 
(unlikely(bastard->mnt_flags & MNT_DOOMED)) { + mnt_add_count(mnt, -1); + unlock_mount_hash(); + return 1; + } + unlock_mount_hash(); + /* caller will mntput() */ return -1; } @@ -1195,12 +1204,27 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void mntput_no_expire(struct mount *mnt) { rcu_read_lock(); - mnt_add_count(mnt, -1); - if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ + if (likely(READ_ONCE(mnt->mnt_ns))) { + /* + * Since we don't do lock_mount_hash() here, + * ->mnt_ns can change under us. However, if it's + * non-NULL, then there's a reference that won't + * be dropped until after an RCU delay done after + * turning ->mnt_ns NULL. So if we observe it + * non-NULL under rcu_read_lock(), the reference + * we are dropping is not the final one. + */ + mnt_add_count(mnt, -1); rcu_read_unlock(); return; } lock_mount_hash(); + /* + * make sure that if __legitimize_mnt() has not seen us grab + * mount_lock, we'll see their refcount increment here. 
+ */ + smp_mb(); + mnt_add_count(mnt, -1); if (mnt_get_count(mnt)) { rcu_read_unlock(); unlock_mount_hash(); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index bbd0465535eb..f033f3a69a3b 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -883,8 +883,10 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { res = nfs_delegation_find_inode_server(server, fhandle); - if (res != ERR_PTR(-ENOENT)) + if (res != ERR_PTR(-ENOENT)) { + rcu_read_unlock(); return res; + } } rcu_read_unlock(); return ERR_PTR(-ENOENT); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7a9c14426855..d7f158c3efc8 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1434,12 +1434,11 @@ static int do_open(struct inode *inode, struct file *filp) static int nfs_finish_open(struct nfs_open_context *ctx, struct dentry *dentry, - struct file *file, unsigned open_flags, - int *opened) + struct file *file, unsigned open_flags) { int err; - err = finish_open(file, dentry, do_open, opened); + err = finish_open(file, dentry, do_open); if (err) goto out; if (S_ISREG(file->f_path.dentry->d_inode->i_mode)) @@ -1452,7 +1451,7 @@ out: int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, - umode_t mode, int *opened) + umode_t mode) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct nfs_open_context *ctx; @@ -1461,6 +1460,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct inode *inode; unsigned int lookup_flags = 0; bool switched = false; + int created = 0; int err; /* Expect a negative dentry */ @@ -1521,7 +1521,9 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, goto out; trace_nfs_atomic_open_enter(dir, ctx, open_flags); - inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened); + inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, &created); + if (created) + file->f_mode |= FMODE_CREATED; 
if (IS_ERR(inode)) { err = PTR_ERR(inode); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); @@ -1546,7 +1548,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, goto out; } - err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened); + err = nfs_finish_open(ctx, ctx->dentry, file, open_flags); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); put_nfs_open_context(ctx); out: @@ -1641,6 +1643,7 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, struct dentry *parent = dget_parent(dentry); struct inode *dir = d_inode(parent); struct inode *inode; + struct dentry *d; int error = -EACCES; d_drop(dentry); @@ -1662,10 +1665,12 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, goto out_error; } inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); - error = PTR_ERR(inode); - if (IS_ERR(inode)) + d = d_splice_alias(inode, dentry); + if (IS_ERR(d)) { + error = PTR_ERR(d); goto out_error; - d_add(dentry, inode); + } + dput(d); out: dput(parent); return 0; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index d4a07acad598..8f003792ccde 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1243,17 +1243,18 @@ static int ff_layout_read_done_cb(struct rpc_task *task, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); + clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); + clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); switch (err) { case -NFS4ERR_RESET_TO_PNFS: if (ff_layout_choose_best_ds_for_read(hdr->lseg, hdr->pgio_mirror_idx + 1, &hdr->pgio_mirror_idx)) goto out_eagain; - ff_layout_read_record_layoutstats_done(task, hdr); - pnfs_read_resend_pnfs(hdr); + set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - ff_layout_reset_read(hdr); + set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); return task->tk_status; case -EAGAIN: goto out_eagain; @@ -1403,6 +1404,10 @@ static void 
ff_layout_read_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_read_record_layoutstats_done(&hdr->task, hdr); + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + pnfs_read_resend_pnfs(hdr); + else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + ff_layout_reset_read(hdr); pnfs_generic_rw_release(data); } @@ -1423,12 +1428,14 @@ static int ff_layout_write_done_cb(struct rpc_task *task, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); + clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); + clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); switch (err) { case -NFS4ERR_RESET_TO_PNFS: - ff_layout_reset_write(hdr, true); + set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - ff_layout_reset_write(hdr, false); + set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); return task->tk_status; case -EAGAIN: return -EAGAIN; @@ -1575,6 +1582,10 @@ static void ff_layout_write_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_write_record_layoutstats_done(&hdr->task, hdr); + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + ff_layout_reset_write(hdr, true); + else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + ff_layout_reset_write(hdr, false); pnfs_generic_rw_release(data); } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 137e18abb7e7..51beb6e38c90 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -258,7 +258,7 @@ extern const struct dentry_operations nfs4_dentry_operations; /* dir.c */ int nfs_atomic_open(struct inode *, struct dentry *, struct file *, - unsigned, umode_t, int *); + unsigned, umode_t); /* super.c */ extern struct file_system_type nfs4_fs_type; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ed45090e4df6..b790976d3913 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2951,7 +2951,7 @@ static int _nfs4_do_open(struct inode *dir, } } if (opened && opendata->file_created) - *opened |= FILE_CREATED; + *opened = 1; if (pnfs_use_threshold(ctx_th, 
opendata->f_attr.mdsthreshold, server)) { *ctx_th = opendata->f_attr.mdsthreshold; @@ -3294,6 +3294,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct inode *inode = calldata->inode; + struct pnfs_layout_hdr *lo; bool is_rdonly, is_wronly, is_rdwr; int call_close = 0; @@ -3337,6 +3338,12 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_wait; } + lo = calldata->arg.lr_args ? calldata->arg.lr_args->layout : NULL; + if (lo && !pnfs_layout_is_valid(lo)) { + calldata->arg.lr_args = NULL; + calldata->res.lr_res = NULL; + } + if (calldata->arg.fmode == 0) task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; @@ -5972,12 +5979,19 @@ static void nfs4_delegreturn_release(void *calldata) static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) { struct nfs4_delegreturndata *d_data; + struct pnfs_layout_hdr *lo; d_data = (struct nfs4_delegreturndata *)data; if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) return; + lo = d_data->args.lr_args ? 
d_data->args.lr_args->layout : NULL; + if (lo && !pnfs_layout_is_valid(lo)) { + d_data->args.lr_args = NULL; + d_data->res.lr_res = NULL; + } + nfs4_setup_sequence(d_data->res.server->nfs_client, &d_data->args.seq_args, &d_data->res.seq_res, @@ -6452,34 +6466,34 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) if (data->arg.new_lock && !data->cancelled) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) - break; + goto out_restart; } - if (data->arg.new_lock_owner != 0) { nfs_confirm_seqid(&lsp->ls_seqid, 0); nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid); set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); - goto out_done; - } else if (nfs4_update_lock_stateid(lsp, &data->res.stateid)) - goto out_done; - + } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid)) + goto out_restart; break; case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: if (data->arg.new_lock_owner != 0) { - if (nfs4_stateid_match(&data->arg.open_stateid, + if (!nfs4_stateid_match(&data->arg.open_stateid, &lsp->ls_state->open_stateid)) - goto out_done; - } else if (nfs4_stateid_match(&data->arg.lock_stateid, + goto out_restart; + } else if (!nfs4_stateid_match(&data->arg.lock_stateid, &lsp->ls_stateid)) - goto out_done; + goto out_restart; } - if (!data->cancelled) - rpc_restart_call_prepare(task); out_done: dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); + return; +out_restart: + if (!data->cancelled) + rpc_restart_call_prepare(task); + goto out_done; } static void nfs4_lock_release(void *calldata) @@ -6488,7 +6502,7 @@ static void nfs4_lock_release(void *calldata) dprintk("%s: begin!\n", __func__); nfs_free_seqid(data->arg.open_seqid); - if (data->cancelled) { + if (data->cancelled && data->rpc_status == 0) { struct rpc_task *task; task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, data->arg.lock_seqid); @@ -8650,6 +8664,8 @@ 
nfs4_layoutget_handle_exception(struct rpc_task *task, dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); + nfs4_sequence_free_slot(&lgp->res.seq_res); + switch (nfs4err) { case 0: goto out; @@ -8714,7 +8730,6 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, goto out; } - nfs4_sequence_free_slot(&lgp->res.seq_res); err = nfs4_handle_exception(server, nfs4err, exception); if (!status) { if (exception->retry) @@ -8786,20 +8801,22 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) if (IS_ERR(task)) return ERR_CAST(task); status = rpc_wait_for_completion_task(task); - if (status == 0) { + if (status != 0) + goto out; + + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ + if (task->tk_status < 0 || lgp->res.layoutp->len == 0) { status = nfs4_layoutget_handle_exception(task, lgp, &exception); *timeout = exception.timeout; - } - + } else + lseg = pnfs_layout_process(lgp); +out: trace_nfs4_layoutget(lgp->args.ctx, &lgp->args.range, &lgp->res.range, &lgp->res.stateid, status); - /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ - if (status == 0 && lgp->res.layoutp->len) - lseg = pnfs_layout_process(lgp); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) @@ -8817,6 +8834,8 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) &lrp->args.seq_args, &lrp->res.seq_res, task); + if (!pnfs_layout_is_valid(lrp->args.layout)) + rpc_exit(task, 0); } static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index a8f5e6b16749..3fe81424337d 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -801,6 +801,11 @@ static inline void nfs4_lgopen_release(struct nfs4_layoutget *lgp) { } +static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo) +{ + return false; +} + #endif /* CONFIG_NFS_V4_1 */ #if IS_ENABLED(CONFIG_NFS_V4_2) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b0555d7d8200..55a099e47ba2 
100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -763,7 +763,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, goto out_nfserr; } - host_err = ima_file_check(file, may_flags, 0); + host_err = ima_file_check(file, may_flags); if (host_err) { fput(file); goto out_nfserr; diff --git a/fs/open.c b/fs/open.c index d0e955b558ad..d98e19239bb7 100644 --- a/fs/open.c +++ b/fs/open.c @@ -724,27 +724,13 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) return ksys_fchown(fd, user, group); } -int open_check_o_direct(struct file *f) -{ - /* NB: we're sure to have correct a_ops only after f_op->open */ - if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) - return -EINVAL; - } - return 0; -} - static int do_dentry_open(struct file *f, struct inode *inode, - int (*open)(struct inode *, struct file *), - const struct cred *cred) + int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; int error; - f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | - FMODE_PREAD | FMODE_PWRITE; - path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; @@ -753,7 +739,7 @@ static int do_dentry_open(struct file *f, f->f_wb_err = filemap_sample_wb_err(f->f_mapping); if (unlikely(f->f_flags & O_PATH)) { - f->f_mode = FMODE_PATH; + f->f_mode = FMODE_PATH | FMODE_OPENED; f->f_op = &empty_fops; return 0; } @@ -780,7 +766,7 @@ static int do_dentry_open(struct file *f, goto cleanup_all; } - error = security_file_open(f, cred); + error = security_file_open(f); if (error) goto cleanup_all; @@ -788,6 +774,8 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; + /* normally all 3 are set; ->open() can clear them if needed */ + f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; if (!open) open = f->f_op->open; if (open) { @@ -795,6 +783,7 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; } + f->f_mode |= FMODE_OPENED; 
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(inode); if ((f->f_mode & FMODE_READ) && @@ -809,9 +798,16 @@ static int do_dentry_open(struct file *f, file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); + /* NB: we're sure to have correct a_ops only after f_op->open */ + if (f->f_flags & O_DIRECT) { + if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) + return -EINVAL; + } return 0; cleanup_all: + if (WARN_ON_ONCE(error > 0)) + error = -EINVAL; fops_put(f->f_op); if (f->f_mode & FMODE_WRITER) { put_write_access(inode); @@ -847,19 +843,12 @@ cleanup_file: * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry, - int (*open)(struct inode *, struct file *), - int *opened) + int (*open)(struct inode *, struct file *)) { - int error; - BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ + BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; - error = do_dentry_open(file, d_backing_inode(dentry), open, - current_cred()); - if (!error) - *opened |= FILE_OPENED; - - return error; + return do_dentry_open(file, d_backing_inode(dentry), open); } EXPORT_SYMBOL(finish_open); @@ -874,13 +863,13 @@ EXPORT_SYMBOL(finish_open); * NB: unlike finish_open() this function does consume the dentry reference and * the caller need not dput() it. * - * Returns "1" which must be the return value of ->atomic_open() after having + * Returns "0" which must be the return value of ->atomic_open() after having * called this function. 
*/ int finish_no_open(struct file *file, struct dentry *dentry) { file->f_path.dentry = dentry; - return 1; + return 0; } EXPORT_SYMBOL(finish_no_open); @@ -896,8 +885,7 @@ EXPORT_SYMBOL(file_path); * @file: newly allocated file with f_flag initialized * @cred: credentials to use */ -int vfs_open(const struct path *path, struct file *file, - const struct cred *cred) +int vfs_open(const struct path *path, struct file *file) { struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0); @@ -905,7 +893,7 @@ int vfs_open(const struct path *path, struct file *file, return PTR_ERR(dentry); file->f_path = *path; - return do_dentry_open(file, d_backing_inode(dentry), NULL, cred); + return do_dentry_open(file, d_backing_inode(dentry), NULL); } struct file *dentry_open(const struct path *path, int flags, @@ -919,19 +907,11 @@ struct file *dentry_open(const struct path *path, int flags, /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); - f = get_empty_filp(); + f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { - f->f_flags = flags; - error = vfs_open(path, f, cred); - if (!error) { - /* from now on we need fput() to dispose of f */ - error = open_check_o_direct(f); - if (error) { - fput(f); - f = ERR_PTR(error); - } - } else { - put_filp(f); + error = vfs_open(path, f); + if (error) { + fput(f); f = ERR_PTR(error); } } @@ -1063,26 +1043,6 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(file_open_root); -struct file *filp_clone_open(struct file *oldfile) -{ - struct file *file; - int retval; - - file = get_empty_filp(); - if (IS_ERR(file)) - return file; - - file->f_flags = oldfile->f_flags; - retval = vfs_open(&oldfile->f_path, file, oldfile->f_cred); - if (retval) { - put_filp(file); - return ERR_PTR(retval); - } - - return file; -} -EXPORT_SYMBOL(filp_clone_open); - long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_flags op; diff --git a/fs/pipe.c 
b/fs/pipe.c index bb0840e234f3..bdc5d3c0977d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -509,22 +509,19 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } -static struct wait_queue_head * -pipe_get_poll_head(struct file *filp, __poll_t events) -{ - struct pipe_inode_info *pipe = filp->private_data; - - return &pipe->wait; -} - /* No kernel lock held - fine */ -static __poll_t pipe_poll_mask(struct file *filp, __poll_t events) +static __poll_t +pipe_poll(struct file *filp, poll_table *wait) { + __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; - int nrbufs = pipe->nrbufs; - __poll_t mask = 0; + int nrbufs; + + poll_wait(filp, &pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ + nrbufs = pipe->nrbufs; + mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0; if (!pipe->writers && filp->f_version != pipe->w_counter) @@ -744,54 +741,33 @@ fail_inode: int create_pipe_files(struct file **res, int flags) { - int err; struct inode *inode = get_pipe_inode(); struct file *f; - struct path path; if (!inode) return -ENFILE; - err = -ENOMEM; - path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &empty_name); - if (!path.dentry) - goto err_inode; - path.mnt = mntget(pipe_mnt); - - d_instantiate(path.dentry, inode); - - f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops); + f = alloc_file_pseudo(inode, pipe_mnt, "", + O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), + &pipefifo_fops); if (IS_ERR(f)) { - err = PTR_ERR(f); - goto err_dentry; + free_pipe_info(inode->i_pipe); + iput(inode); + return PTR_ERR(f); } - f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); f->private_data = inode->i_pipe; - res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops); + res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), + &pipefifo_fops); if (IS_ERR(res[0])) { - err = PTR_ERR(res[0]); - goto err_file; + put_pipe_info(inode, inode->i_pipe); + fput(f); + return PTR_ERR(res[0]); 
} - - path_get(&path); res[0]->private_data = inode->i_pipe; - res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK); res[1] = f; return 0; - -err_file: - put_filp(f); -err_dentry: - free_pipe_info(inode->i_pipe); - path_put(&path); - return err; - -err_inode: - free_pipe_info(inode->i_pipe); - iput(inode); - return err; } static int __do_pipe_flags(int *fd, struct file **files, int flags) @@ -1023,8 +999,7 @@ const struct file_operations pipefifo_fops = { .llseek = no_llseek, .read_iter = pipe_read, .write_iter = pipe_write, - .get_poll_head = pipe_get_poll_head, - .poll_mask = pipe_poll_mask, + .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, .fasync = pipe_fasync, diff --git a/fs/proc/base.c b/fs/proc/base.c index b6572944efc3..aaffc0c30216 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -235,6 +235,10 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, if (env_start != arg_end || env_start >= env_end) env_start = env_end = arg_end; + /* .. and limit it to a maximum of one page of slop */ + if (env_end >= arg_end + PAGE_SIZE) + env_end = arg_end + PAGE_SIZE - 1; + /* We're not going to care if "*ppos" has high bits set */ pos = arg_start + *ppos; @@ -254,10 +258,19 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, while (count) { int got; size_t size = min_t(size_t, PAGE_SIZE, count); + long offset; - got = access_remote_vm(mm, pos, page, size, FOLL_ANON); - if (got <= 0) + /* + * Are we already starting past the official end? + * We always include the last byte that is *supposed* + * to be NUL + */ + offset = (pos >= arg_end) ? 
pos - arg_end + 1 : 0; + + got = access_remote_vm(mm, pos - offset, page, size + offset, FOLL_ANON); + if (got <= offset) break; + got -= offset; /* Don't walk past a NUL character once you hit arg_end */ if (pos + got >= arg_end) { @@ -276,12 +289,17 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, n = arg_end - pos - 1; /* Cut off at first NUL after 'n' */ - got = n + strnlen(page+n, got-n); - if (!got) + got = n + strnlen(page+n, offset+got-n); + if (got < offset) break; + got -= offset; + + /* Include the NUL if it existed */ + if (got < size) + got++; } - got -= copy_to_user(buf, page, got); + got -= copy_to_user(buf, page+offset, got); if (unlikely(!got)) { if (!len) len = -EFAULT; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 6ac1c92997ea..bb1c1625b158 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -564,11 +564,20 @@ static int proc_seq_open(struct inode *inode, struct file *file) return seq_open(file, de->seq_ops); } +static int proc_seq_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + + if (de->state_size) + return seq_release_private(inode, file); + return seq_release(inode, file); +} + static const struct file_operations proc_seq_fops = { .open = proc_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = proc_seq_release, }; struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e9679016271f..dfd73a4616ce 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -831,7 +831,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); SEQ_PUT_DEC(" kB\nSwapPss: ", mss->swap_pss >> PSS_SHIFT); - SEQ_PUT_DEC(" kB\nLocked: ", mss->pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nLocked: ", + mss->pss_locked >> PSS_SHIFT); seq_puts(m, " kB\n"); } if (!rollup_mode) { diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c 
index d88231e3b2be..fc20e06c56ba 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -711,21 +711,18 @@ EXPORT_SYMBOL(dquot_quota_sync); static unsigned long dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct list_head *head; struct dquot *dquot; unsigned long freed = 0; spin_lock(&dq_list_lock); - head = free_dquots.prev; - while (head != &free_dquots && sc->nr_to_scan) { - dquot = list_entry(head, struct dquot, dq_free); + while (!list_empty(&free_dquots) && sc->nr_to_scan) { + dquot = list_first_entry(&free_dquots, struct dquot, dq_free); remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); do_destroy_dquot(dquot); sc->nr_to_scan--; freed++; - head = free_dquots.prev; } spin_unlock(&dq_list_lock); return freed; diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 7e288d97adcb..9fed1c05f1f4 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -76,83 +76,99 @@ static char *le_type(struct reiserfs_key *key) } /* %k */ -static void sprintf_le_key(char *buf, struct reiserfs_key *key) +static int scnprintf_le_key(char *buf, size_t size, struct reiserfs_key *key) { if (key) - sprintf(buf, "[%d %d %s %s]", le32_to_cpu(key->k_dir_id), - le32_to_cpu(key->k_objectid), le_offset(key), - le_type(key)); + return scnprintf(buf, size, "[%d %d %s %s]", + le32_to_cpu(key->k_dir_id), + le32_to_cpu(key->k_objectid), le_offset(key), + le_type(key)); else - sprintf(buf, "[NULL]"); + return scnprintf(buf, size, "[NULL]"); } /* %K */ -static void sprintf_cpu_key(char *buf, struct cpu_key *key) +static int scnprintf_cpu_key(char *buf, size_t size, struct cpu_key *key) { if (key) - sprintf(buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id, - key->on_disk_key.k_objectid, reiserfs_cpu_offset(key), - cpu_type(key)); + return scnprintf(buf, size, "[%d %d %s %s]", + key->on_disk_key.k_dir_id, + key->on_disk_key.k_objectid, + reiserfs_cpu_offset(key), cpu_type(key)); else - sprintf(buf, "[NULL]"); + return 
scnprintf(buf, size, "[NULL]"); } -static void sprintf_de_head(char *buf, struct reiserfs_de_head *deh) +static int scnprintf_de_head(char *buf, size_t size, + struct reiserfs_de_head *deh) { if (deh) - sprintf(buf, - "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", - deh_offset(deh), deh_dir_id(deh), deh_objectid(deh), - deh_location(deh), deh_state(deh)); + return scnprintf(buf, size, + "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", + deh_offset(deh), deh_dir_id(deh), + deh_objectid(deh), deh_location(deh), + deh_state(deh)); else - sprintf(buf, "[NULL]"); + return scnprintf(buf, size, "[NULL]"); } -static void sprintf_item_head(char *buf, struct item_head *ih) +static int scnprintf_item_head(char *buf, size_t size, struct item_head *ih) { if (ih) { - strcpy(buf, - (ih_version(ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*"); - sprintf_le_key(buf + strlen(buf), &(ih->ih_key)); - sprintf(buf + strlen(buf), ", item_len %d, item_location %d, " - "free_space(entry_count) %d", - ih_item_len(ih), ih_location(ih), ih_free_space(ih)); + char *p = buf; + char * const end = buf + size; + + p += scnprintf(p, end - p, "%s", + (ih_version(ih) == KEY_FORMAT_3_6) ? + "*3.6* " : "*3.5*"); + + p += scnprintf_le_key(p, end - p, &ih->ih_key); + + p += scnprintf(p, end - p, + ", item_len %d, item_location %d, free_space(entry_count) %d", + ih_item_len(ih), ih_location(ih), + ih_free_space(ih)); + return p - buf; } else - sprintf(buf, "[NULL]"); + return scnprintf(buf, size, "[NULL]"); } -static void sprintf_direntry(char *buf, struct reiserfs_dir_entry *de) +static int scnprintf_direntry(char *buf, size_t size, + struct reiserfs_dir_entry *de) { char name[20]; memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen); name[de->de_namelen > 19 ? 
19 : de->de_namelen] = 0; - sprintf(buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid); + return scnprintf(buf, size, "\"%s\"==>[%d %d]", + name, de->de_dir_id, de->de_objectid); } -static void sprintf_block_head(char *buf, struct buffer_head *bh) +static int scnprintf_block_head(char *buf, size_t size, struct buffer_head *bh) { - sprintf(buf, "level=%d, nr_items=%d, free_space=%d rdkey ", - B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh)); + return scnprintf(buf, size, + "level=%d, nr_items=%d, free_space=%d rdkey ", + B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh)); } -static void sprintf_buffer_head(char *buf, struct buffer_head *bh) +static int scnprintf_buffer_head(char *buf, size_t size, struct buffer_head *bh) { - sprintf(buf, - "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", - bh->b_bdev, bh->b_size, - (unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)), - bh->b_state, bh->b_page, - buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE", - buffer_dirty(bh) ? "DIRTY" : "CLEAN", - buffer_locked(bh) ? "LOCKED" : "UNLOCKED"); + return scnprintf(buf, size, + "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", + bh->b_bdev, bh->b_size, + (unsigned long long)bh->b_blocknr, + atomic_read(&(bh->b_count)), + bh->b_state, bh->b_page, + buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE", + buffer_dirty(bh) ? "DIRTY" : "CLEAN", + buffer_locked(bh) ? 
"LOCKED" : "UNLOCKED"); } -static void sprintf_disk_child(char *buf, struct disk_child *dc) +static int scnprintf_disk_child(char *buf, size_t size, struct disk_child *dc) { - sprintf(buf, "[dc_number=%d, dc_size=%u]", dc_block_number(dc), - dc_size(dc)); + return scnprintf(buf, size, "[dc_number=%d, dc_size=%u]", + dc_block_number(dc), dc_size(dc)); } static char *is_there_reiserfs_struct(char *fmt, int *what) @@ -189,55 +205,60 @@ static void prepare_error_buf(const char *fmt, va_list args) char *fmt1 = fmt_buf; char *k; char *p = error_buf; + char * const end = &error_buf[sizeof(error_buf)]; int what; spin_lock(&error_lock); - strcpy(fmt1, fmt); + if (WARN_ON(strscpy(fmt_buf, fmt, sizeof(fmt_buf)) < 0)) { + strscpy(error_buf, "format string too long", end - error_buf); + goto out_unlock; + } while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) { *k = 0; - p += vsprintf(p, fmt1, args); + p += vscnprintf(p, end - p, fmt1, args); switch (what) { case 'k': - sprintf_le_key(p, va_arg(args, struct reiserfs_key *)); + p += scnprintf_le_key(p, end - p, + va_arg(args, struct reiserfs_key *)); break; case 'K': - sprintf_cpu_key(p, va_arg(args, struct cpu_key *)); + p += scnprintf_cpu_key(p, end - p, + va_arg(args, struct cpu_key *)); break; case 'h': - sprintf_item_head(p, va_arg(args, struct item_head *)); + p += scnprintf_item_head(p, end - p, + va_arg(args, struct item_head *)); break; case 't': - sprintf_direntry(p, - va_arg(args, - struct reiserfs_dir_entry *)); + p += scnprintf_direntry(p, end - p, + va_arg(args, struct reiserfs_dir_entry *)); break; case 'y': - sprintf_disk_child(p, - va_arg(args, struct disk_child *)); + p += scnprintf_disk_child(p, end - p, + va_arg(args, struct disk_child *)); break; case 'z': - sprintf_block_head(p, - va_arg(args, struct buffer_head *)); + p += scnprintf_block_head(p, end - p, + va_arg(args, struct buffer_head *)); break; case 'b': - sprintf_buffer_head(p, - va_arg(args, struct buffer_head *)); + p += 
scnprintf_buffer_head(p, end - p, + va_arg(args, struct buffer_head *)); break; case 'a': - sprintf_de_head(p, - va_arg(args, - struct reiserfs_de_head *)); + p += scnprintf_de_head(p, end - p, + va_arg(args, struct reiserfs_de_head *)); break; } - p += strlen(p); fmt1 = k + 2; } - vsprintf(p, fmt1, args); + p += vscnprintf(p, end - p, fmt1, args); +out_unlock: spin_unlock(&error_lock); } diff --git a/fs/select.c b/fs/select.c index 317891ff8165..4a6b6e4b21cb 100644 --- a/fs/select.c +++ b/fs/select.c @@ -34,29 +34,6 @@ #include <linux/uaccess.h> -__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) -{ - if (file->f_op->poll) { - return file->f_op->poll(file, pt); - } else if (file_has_poll_mask(file)) { - unsigned int events = poll_requested_events(pt); - struct wait_queue_head *head; - - if (pt && pt->_qproc) { - head = file->f_op->get_poll_head(file, events); - if (!head) - return DEFAULT_POLLMASK; - if (IS_ERR(head)) - return EPOLLERR; - pt->_qproc(file, head, pt); - } - - return file->f_op->poll_mask(file, events); - } else { - return DEFAULT_POLLMASK; - } -} -EXPORT_SYMBOL_GPL(vfs_poll); /* * Estimate expected accuracy in ns from a timeval. 
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2751476e6b6e..f098b9f1c396 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -167,6 +167,8 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, } if (compressed) { + if (!msblk->stream) + goto read_failure; length = squashfs_decompress(msblk, bh, b, offset, length, output); if (length < 0) diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 23813c078cc9..0839efa720b3 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -350,6 +350,9 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer, TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); + if (unlikely(length < 0)) + return -EIO; + while (length) { entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); if (entry->error) { diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 13d80947bf9e..f1c1430ae721 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -194,7 +194,11 @@ static long long read_indexes(struct super_block *sb, int n, } for (i = 0; i < blocks; i++) { - int size = le32_to_cpu(blist[i]); + int size = squashfs_block_size(blist[i]); + if (size < 0) { + err = size; + goto failure; + } block += SQUASHFS_COMPRESSED_SIZE_BLOCK(size); } n -= blocks; @@ -367,7 +371,24 @@ static int read_blocklist(struct inode *inode, int index, u64 *block) sizeof(size)); if (res < 0) return res; - return le32_to_cpu(size); + return squashfs_block_size(size); +} + +void squashfs_fill_page(struct page *page, struct squashfs_cache_entry *buffer, int offset, int avail) +{ + int copied; + void *pageaddr; + + pageaddr = kmap_atomic(page); + copied = squashfs_copy_data(pageaddr, buffer, offset, avail); + memset(pageaddr + copied, 0, PAGE_SIZE - copied); + kunmap_atomic(pageaddr); + + flush_dcache_page(page); + if (copied == avail) + SetPageUptodate(page); + else + SetPageError(page); } /* Copy data into page cache */ @@ -376,7 +397,6 @@ void squashfs_copy_cache(struct page 
*page, struct squashfs_cache_entry *buffer, { struct inode *inode = page->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - void *pageaddr; int i, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; int start_index = page->index & ~mask, end_index = start_index | mask; @@ -402,12 +422,7 @@ void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer, if (PageUptodate(push_page)) goto skip_page; - pageaddr = kmap_atomic(push_page); - squashfs_copy_data(pageaddr, buffer, offset, avail); - memset(pageaddr + avail, 0, PAGE_SIZE - avail); - kunmap_atomic(pageaddr); - flush_dcache_page(push_page); - SetPageUptodate(push_page); + squashfs_fill_page(push_page, buffer, offset, avail); skip_page: unlock_page(push_page); if (i != page->index) @@ -416,10 +431,9 @@ skip_page: } /* Read datablock stored packed inside a fragment (tail-end packed block) */ -static int squashfs_readpage_fragment(struct page *page) +static int squashfs_readpage_fragment(struct page *page, int expected) { struct inode *inode = page->mapping->host; - struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, squashfs_i(inode)->fragment_block, squashfs_i(inode)->fragment_size); @@ -430,23 +444,16 @@ static int squashfs_readpage_fragment(struct page *page) squashfs_i(inode)->fragment_block, squashfs_i(inode)->fragment_size); else - squashfs_copy_cache(page, buffer, i_size_read(inode) & - (msblk->block_size - 1), + squashfs_copy_cache(page, buffer, expected, squashfs_i(inode)->fragment_offset); squashfs_cache_put(buffer); return res; } -static int squashfs_readpage_sparse(struct page *page, int index, int file_end) +static int squashfs_readpage_sparse(struct page *page, int expected) { - struct inode *inode = page->mapping->host; - struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int bytes = index == file_end ? 
- (i_size_read(inode) & (msblk->block_size - 1)) : - msblk->block_size; - - squashfs_copy_cache(page, NULL, bytes, 0); + squashfs_copy_cache(page, NULL, expected, 0); return 0; } @@ -456,6 +463,9 @@ static int squashfs_readpage(struct file *file, struct page *page) struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; int index = page->index >> (msblk->block_log - PAGE_SHIFT); int file_end = i_size_read(inode) >> msblk->block_log; + int expected = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; int res; void *pageaddr; @@ -474,11 +484,11 @@ static int squashfs_readpage(struct file *file, struct page *page) goto error_out; if (bsize == 0) - res = squashfs_readpage_sparse(page, index, file_end); + res = squashfs_readpage_sparse(page, expected); else - res = squashfs_readpage_block(page, block, bsize); + res = squashfs_readpage_block(page, block, bsize, expected); } else - res = squashfs_readpage_fragment(page); + res = squashfs_readpage_fragment(page, expected); if (!res) return 0; diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c index f2310d2a2019..a9ba8d96776a 100644 --- a/fs/squashfs/file_cache.c +++ b/fs/squashfs/file_cache.c @@ -20,7 +20,7 @@ #include "squashfs.h" /* Read separately compressed datablock and memcopy into page cache */ -int squashfs_readpage_block(struct page *page, u64 block, int bsize) +int squashfs_readpage_block(struct page *page, u64 block, int bsize, int expected) { struct inode *i = page->mapping->host; struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, @@ -31,7 +31,7 @@ int squashfs_readpage_block(struct page *page, u64 block, int bsize) ERROR("Unable to read page, block %llx, size %x\n", block, bsize); else - squashfs_copy_cache(page, buffer, buffer->length, 0); + squashfs_copy_cache(page, buffer, expected, 0); squashfs_cache_put(buffer); return res; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index cb485d8e0e91..80db1b86a27c 100644 
--- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -21,10 +21,11 @@ #include "page_actor.h" static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page); + int pages, struct page **page, int bytes); /* Read separately compressed datablock directly into page cache */ -int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) +int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, + int expected) { struct inode *inode = target_page->mapping->host; @@ -83,7 +84,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) * using an intermediate buffer. */ res = squashfs_read_cache(target_page, block, bsize, pages, - page); + page, expected); if (res < 0) goto mark_errored; @@ -95,6 +96,11 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) if (res < 0) goto mark_errored; + if (res != expected) { + res = -EIO; + goto mark_errored; + } + /* Last page may have trailing bytes not filled */ bytes = res % PAGE_SIZE; if (bytes) { @@ -138,13 +144,12 @@ out: static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page) + int pages, struct page **page, int bytes) { struct inode *i = target_page->mapping->host; struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, block, bsize); - int bytes = buffer->length, res = buffer->error, n, offset = 0; - void *pageaddr; + int res = buffer->error, n, offset = 0; if (res) { ERROR("Unable to read page, block %llx, size %x\n", block, @@ -159,12 +164,7 @@ static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, if (page[n] == NULL) continue; - pageaddr = kmap_atomic(page[n]); - squashfs_copy_data(pageaddr, buffer, offset, avail); - memset(pageaddr + avail, 0, PAGE_SIZE - avail); - kunmap_atomic(pageaddr); - flush_dcache_page(page[n]); - SetPageUptodate(page[n]); + squashfs_fill_page(page[n], buffer, 
offset, avail); unlock_page(page[n]); if (page[n] != target_page) put_page(page[n]); diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c index 0ed6edbc5c71..0681feab4a84 100644 --- a/fs/squashfs/fragment.c +++ b/fs/squashfs/fragment.c @@ -49,11 +49,16 @@ int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment, u64 *fragment_block) { struct squashfs_sb_info *msblk = sb->s_fs_info; - int block = SQUASHFS_FRAGMENT_INDEX(fragment); - int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment); - u64 start_block = le64_to_cpu(msblk->fragment_index[block]); + int block, offset, size; struct squashfs_fragment_entry fragment_entry; - int size; + u64 start_block; + + if (fragment >= msblk->fragments) + return -EIO; + block = SQUASHFS_FRAGMENT_INDEX(fragment); + offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment); + + start_block = le64_to_cpu(msblk->fragment_index[block]); size = squashfs_read_metadata(sb, &fragment_entry, &start_block, &offset, sizeof(fragment_entry)); @@ -61,9 +66,7 @@ int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment, return size; *fragment_block = le64_to_cpu(fragment_entry.start_block); - size = le32_to_cpu(fragment_entry.size); - - return size; + return squashfs_block_size(fragment_entry.size); } diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 887d6d270080..f89f8a74c6ce 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -67,11 +67,12 @@ extern __le64 *squashfs_read_fragment_index_table(struct super_block *, u64, u64, unsigned int); /* file.c */ +void squashfs_fill_page(struct page *, struct squashfs_cache_entry *, int, int); void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int, int); /* file_xxx.c */ -extern int squashfs_readpage_block(struct page *, u64, int); +extern int squashfs_readpage_block(struct page *, u64, int, int); /* id.c */ extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); diff --git a/fs/squashfs/squashfs_fs.h 
b/fs/squashfs/squashfs_fs.h index 24d12fd14177..4e6853f084d0 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -129,6 +129,12 @@ #define SQUASHFS_COMPRESSED_BLOCK(B) (!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK)) +static inline int squashfs_block_size(__le32 raw) +{ + u32 size = le32_to_cpu(raw); + return (size >> 25) ? -EIO : size; +} + /* * Inode number ops. Inodes consist of a compressed block number, and an * uncompressed offset within that block diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 1da565cb50c3..ef69c31947bf 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -75,6 +75,7 @@ struct squashfs_sb_info { unsigned short block_log; long long bytes_used; unsigned int inodes; + unsigned int fragments; int xattr_ids; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 8a73b97217c8..40e657386fa5 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -175,6 +175,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) msblk->inode_table = le64_to_cpu(sblk->inode_table_start); msblk->directory_table = le64_to_cpu(sblk->directory_table_start); msblk->inodes = le32_to_cpu(sblk->inodes); + msblk->fragments = le32_to_cpu(sblk->fragments); flags = le16_to_cpu(sblk->flags); TRACE("Found valid superblock on %pg\n", sb->s_bdev); @@ -185,7 +186,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) TRACE("Filesystem size %lld bytes\n", msblk->bytes_used); TRACE("Block size %d\n", msblk->block_size); TRACE("Number of inodes %d\n", msblk->inodes); - TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments)); + TRACE("Number of fragments %d\n", msblk->fragments); TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids)); TRACE("sblk->inode_table_start %llx\n", msblk->inode_table); TRACE("sblk->directory_table_start %llx\n", msblk->directory_table); @@ -272,7 +273,7 @@ allocate_id_index_table: sb->s_export_op = 
&squashfs_export_ops; handle_fragments: - fragments = le32_to_cpu(sblk->fragments); + fragments = msblk->fragments; if (fragments == 0) goto check_directory_table; diff --git a/fs/timerfd.c b/fs/timerfd.c index d84a2bee4f82..d69ad801eb80 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -66,7 +66,7 @@ static void timerfd_triggered(struct timerfd_ctx *ctx) spin_lock_irqsave(&ctx->wqh.lock, flags); ctx->expired = 1; ctx->ticks++; - wake_up_locked(&ctx->wqh); + wake_up_locked_poll(&ctx->wqh, EPOLLIN); spin_unlock_irqrestore(&ctx->wqh.lock, flags); } @@ -107,7 +107,7 @@ void timerfd_clock_was_set(void) if (ctx->moffs != moffs) { ctx->moffs = KTIME_MAX; ctx->ticks++; - wake_up_locked(&ctx->wqh); + wake_up_locked_poll(&ctx->wqh, EPOLLIN); } spin_unlock_irqrestore(&ctx->wqh.lock, flags); } @@ -226,20 +226,21 @@ static int timerfd_release(struct inode *inode, struct file *file) kfree_rcu(ctx, rcu); return 0; } - -static struct wait_queue_head *timerfd_get_poll_head(struct file *file, - __poll_t eventmask) + +static __poll_t timerfd_poll(struct file *file, poll_table *wait) { struct timerfd_ctx *ctx = file->private_data; + __poll_t events = 0; + unsigned long flags; - return &ctx->wqh; -} + poll_wait(file, &ctx->wqh, wait); -static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask) -{ - struct timerfd_ctx *ctx = file->private_data; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->ticks) + events |= EPOLLIN; + spin_unlock_irqrestore(&ctx->wqh.lock, flags); - return ctx->ticks ? 
EPOLLIN : 0; + return events; } static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, @@ -344,7 +345,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg spin_lock_irq(&ctx->wqh.lock); if (!timerfd_canceled(ctx)) { ctx->ticks = ticks; - wake_up_locked(&ctx->wqh); + wake_up_locked_poll(&ctx->wqh, EPOLLIN); } else ret = -ECANCELED; spin_unlock_irq(&ctx->wqh.lock); @@ -363,8 +364,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg static const struct file_operations timerfd_fops = { .release = timerfd_release, - .get_poll_head = timerfd_get_poll_head, - .poll_mask = timerfd_poll_mask, + .poll = timerfd_poll, .read = timerfd_read, .llseek = noop_llseek, .show_fdinfo = timerfd_show, @@ -533,8 +533,8 @@ static int do_timerfd_gettime(int ufd, struct itimerspec64 *t) } SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, - const struct itimerspec __user *, utmr, - struct itimerspec __user *, otmr) + const struct __kernel_itimerspec __user *, utmr, + struct __kernel_itimerspec __user *, otmr) { struct itimerspec64 new, old; int ret; @@ -550,7 +550,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, return ret; } -SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) +SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *, otmr) { struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); @@ -559,7 +559,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) return put_itimerspec64(&kotmr, otmr) ? 
-EFAULT : 0; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, const struct compat_itimerspec __user *, utmr, struct compat_itimerspec __user *, otmr) diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 1b961b1d9699..fcda0fc97b90 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -533,8 +533,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, udf_write_aext(table, &epos, &eloc, (etype << 30) | elen, 1); } else - udf_delete_aext(table, epos, eloc, - (etype << 30) | elen); + udf_delete_aext(table, epos); } else { alloc_count = 0; } @@ -630,7 +629,7 @@ static udf_pblk_t udf_table_new_block(struct super_block *sb, if (goal_elen) udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); else - udf_delete_aext(table, goal_epos, goal_eloc, goal_elen); + udf_delete_aext(table, goal_epos); brelse(goal_epos.bh); udf_add_free_space(sb, partition, -1); diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 0a98a2369738..d9523013096f 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -141,10 +141,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, fibh->ebh->b_data, sizeof(struct fileIdentDesc) + fibh->soffset); - fi_len = (sizeof(struct fileIdentDesc) + - cfi->lengthFileIdent + - le16_to_cpu(cfi->lengthOfImpUse) + 3) & ~3; - + fi_len = udf_dir_entry_len(cfi); *nf_pos += fi_len - (fibh->eoffset - fibh->soffset); fibh->eoffset = fibh->soffset + fi_len; } else { @@ -152,6 +149,9 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, sizeof(struct fileIdentDesc)); } } + /* Got last entry outside of dir size - fs is corrupted! 
*/ + if (*nf_pos > dir->i_size) + return NULL; return fi; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7f39d17352c9..9915a58fbabd 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1147,8 +1147,7 @@ static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr if (startnum > endnum) { for (i = 0; i < (startnum - endnum); i++) - udf_delete_aext(inode, *epos, laarr[i].extLocation, - laarr[i].extLength); + udf_delete_aext(inode, *epos); } else if (startnum < endnum) { for (i = 0; i < (endnum - startnum); i++) { udf_insert_aext(inode, *epos, laarr[i].extLocation, @@ -2176,14 +2175,15 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, return (nelen >> 30); } -int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, - struct kernel_lb_addr eloc, uint32_t elen) +int8_t udf_delete_aext(struct inode *inode, struct extent_position epos) { struct extent_position oepos; int adsize; int8_t etype; struct allocExtDesc *aed; struct udf_inode_info *iinfo; + struct kernel_lb_addr eloc; + uint32_t elen; if (epos.bh) { get_bh(epos.bh); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index c586026508db..58cc2414992b 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -351,8 +351,6 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, loff_t f_pos; loff_t size = udf_ext0_offset(dir) + dir->i_size; int nfidlen; - uint8_t lfi; - uint16_t liu; udf_pblk_t block; struct kernel_lb_addr eloc; uint32_t elen = 0; @@ -383,7 +381,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, namelen = 0; } - nfidlen = (sizeof(struct fileIdentDesc) + namelen + 3) & ~3; + nfidlen = ALIGN(sizeof(struct fileIdentDesc) + namelen, UDF_NAME_PAD); f_pos = udf_ext0_offset(dir); @@ -424,12 +422,8 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, goto out_err; } - liu = le16_to_cpu(cfi->lengthOfImpUse); - lfi = cfi->lengthFileIdent; - if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { - if 
(((sizeof(struct fileIdentDesc) + - liu + lfi + 3) & ~3) == nfidlen) { + if (udf_dir_entry_len(cfi) == nfidlen) { cfi->descTag.tagSerialNum = cpu_to_le16(1); cfi->fileVersionNum = cpu_to_le16(1); cfi->fileCharacteristics = 0; @@ -608,8 +602,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (unlikely(!fi)) { inode_dec_link_count(inode); - unlock_new_inode(inode); - iput(inode); + discard_new_inode(inode); return err; } cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); @@ -700,8 +693,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); if (!fi) { inode_dec_link_count(inode); - unlock_new_inode(inode); - iput(inode); + discard_new_inode(inode); goto out; } set_nlink(inode, 2); @@ -719,8 +711,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (!fi) { clear_nlink(inode); mark_inode_dirty(inode); - unlock_new_inode(inode); - iput(inode); + discard_new_inode(inode); goto out; } cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); @@ -1047,8 +1038,7 @@ out: out_no_entry: up_write(&iinfo->i_data_sem); inode_dec_link_count(inode); - unlock_new_inode(inode); - iput(inode); + discard_new_inode(inode); goto out; } @@ -1201,9 +1191,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, if (dir_fi) { dir_fi->icb.extLocation = cpu_to_lelb(UDF_I(new_dir)->i_location); - udf_update_tag((char *)dir_fi, - (sizeof(struct fileIdentDesc) + - le16_to_cpu(dir_fi->lengthOfImpUse) + 3) & ~3); + udf_update_tag((char *)dir_fi, udf_dir_entry_len(dir_fi)); if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) mark_inode_dirty(old_inode); else diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index bae311b59400..84c47dde4d26 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -132,6 +132,12 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb, extern 
int udf_write_fi(struct inode *inode, struct fileIdentDesc *, struct fileIdentDesc *, struct udf_fileident_bh *, uint8_t *, uint8_t *); +static inline unsigned int udf_dir_entry_len(struct fileIdentDesc *cfi) +{ + return ALIGN(sizeof(struct fileIdentDesc) + + le16_to_cpu(cfi->lengthOfImpUse) + cfi->lengthFileIdent, + UDF_NAME_PAD); +} /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); @@ -167,8 +173,7 @@ extern int udf_add_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); extern void udf_write_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); -extern int8_t udf_delete_aext(struct inode *, struct extent_position, - struct kernel_lb_addr, uint32_t); +extern int8_t udf_delete_aext(struct inode *, struct extent_position); extern int8_t udf_next_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t *, int); extern int8_t udf_current_aext(struct inode *, struct extent_position *, diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index e1ef0f0a1353..02c0a4be4212 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -343,8 +343,7 @@ cg_found: fail_remove_inode: mutex_unlock(&sbi->s_lock); clear_nlink(inode); - unlock_new_inode(inode); - iput(inode); + discard_new_inode(inode); UFSD("EXIT (FAILED): err %d\n", err); return ERR_PTR(err); failed: diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index d5f43ba76c59..9ef40f100415 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -43,8 +43,7 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) return 0; } inode_dec_link_count(inode); - unlock_new_inode(inode); - iput(inode); + discard_new_inode(inode); return err; } @@ -142,8 +141,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, out_fail: inode_dec_link_count(inode); - unlock_new_inode(inode); - iput(inode); + discard_new_inode(inode); return err; } @@ -198,8 +196,7 @@ static int ufs_mkdir(struct inode * 
dir, struct dentry * dentry, umode_t mode) out_fail: inode_dec_link_count(inode); inode_dec_link_count(inode); - unlock_new_inode(inode); - iput (inode); + discard_new_inode(inode); out_dir: inode_dec_link_count(dir); return err; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 123bf7d516fc..bad9cea37f12 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -222,24 +222,26 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, unsigned long reason) { struct mm_struct *mm = ctx->mm; - pte_t *pte; + pte_t *ptep, pte; bool ret = true; VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); - if (!pte) + ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); + + if (!ptep) goto out; ret = false; + pte = huge_ptep_get(ptep); /* * Lockless access: we're in a wait_event so it's ok if it * changes under us. */ - if (huge_pte_none(*pte)) + if (huge_pte_none(pte)) ret = true; - if (!huge_pte_write(*pte) && (reason & VM_UFFD_WP)) + if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) ret = true; out: return ret; @@ -631,8 +633,10 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) + if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + } up_write(&mm->mmap_sem); userfaultfd_ctx_put(release_new_ctx); diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 84db76e0e3e3..fecd187fcf2c 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -157,6 +157,7 @@ __xfs_ag_resv_free( error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true); resv->ar_reserved = 0; resv->ar_asked = 0; + resv->ar_orig_reserved = 0; if (error) 
trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno, @@ -189,13 +190,34 @@ __xfs_ag_resv_init( struct xfs_mount *mp = pag->pag_mount; struct xfs_ag_resv *resv; int error; - xfs_extlen_t reserved; + xfs_extlen_t hidden_space; if (used > ask) ask = used; - reserved = ask - used; - error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true); + switch (type) { + case XFS_AG_RESV_RMAPBT: + /* + * Space taken by the rmapbt is not subtracted from fdblocks + * because the rmapbt lives in the free space. Here we must + * subtract the entire reservation from fdblocks so that we + * always have blocks available for rmapbt expansion. + */ + hidden_space = ask; + break; + case XFS_AG_RESV_METADATA: + /* + * Space taken by all other metadata btrees are accounted + * on-disk as used space. We therefore only hide the space + * that is reserved but not used by the trees. + */ + hidden_space = ask - used; + break; + default: + ASSERT(0); + return -EINVAL; + } + error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true); if (error) { trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, error, _RET_IP_); @@ -216,7 +238,8 @@ __xfs_ag_resv_init( resv = xfs_perag_resv(pag, type); resv->ar_asked = ask; - resv->ar_reserved = resv->ar_orig_reserved = reserved; + resv->ar_orig_reserved = hidden_space; + resv->ar_reserved = ask - used; trace_xfs_ag_resv_init(pag, type, ask); return 0; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index eef466260d43..75dbdc14c45f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -223,12 +223,13 @@ xfs_alloc_get_rec( error = xfs_btree_get_rec(cur, &rec, stat); if (error || !(*stat)) return error; - if (rec->alloc.ar_blockcount == 0) - goto out_bad_rec; *bno = be32_to_cpu(rec->alloc.ar_startblock); *len = be32_to_cpu(rec->alloc.ar_blockcount); + if (*len == 0) + goto out_bad_rec; + /* check for valid extent range, including overflow */ if (!xfs_verify_agbno(mp, agno, *bno)) goto out_bad_rec; diff --git 
a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 01628f0c9a0c..7205268b30bc 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5780,6 +5780,32 @@ del_cursor: return error; } +/* Make sure we won't be right-shifting an extent past the maximum bound. */ +int +xfs_bmap_can_insert_extents( + struct xfs_inode *ip, + xfs_fileoff_t off, + xfs_fileoff_t shift) +{ + struct xfs_bmbt_irec got; + int is_empty; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_last_extent(NULL, ip, XFS_DATA_FORK, &got, &is_empty); + if (!error && !is_empty && got.br_startoff >= off && + ((got.br_startoff + shift) & BMBT_STARTOFF_MASK) < got.br_startoff) + error = -EINVAL; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} + int xfs_bmap_insert_extents( struct xfs_trans *tp, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 99dddbd0fcc6..9b49ddf99c41 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -227,6 +227,8 @@ int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops); +int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off, + xfs_fileoff_t shift); int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 1c5a8aaf2bfc..059bc44c27e8 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -962,6 +962,9 @@ typedef enum xfs_dinode_fmt { XFS_DFORK_DSIZE(dip, mp) : \ XFS_DFORK_ASIZE(dip, mp)) +#define XFS_DFORK_MAXEXT(dip, mp, w) \ + (XFS_DFORK_SIZE(dip, mp, w) / sizeof(struct xfs_bmbt_rec)) + /* * Return pointers 
to the data or attribute forks. */ @@ -1526,6 +1529,8 @@ typedef struct xfs_bmdr_block { #define BMBT_STARTBLOCK_BITLEN 52 #define BMBT_BLOCKCOUNT_BITLEN 21 +#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) + typedef struct xfs_bmbt_rec { __be64 l0, l1; } xfs_bmbt_rec_t; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index d38d724534c4..30d1d60f1d46 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -374,6 +374,47 @@ xfs_log_dinode_to_disk( } } +static xfs_failaddr_t +xfs_dinode_verify_fork( + struct xfs_dinode *dip, + struct xfs_mount *mp, + int whichfork) +{ + uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); + + switch (XFS_DFORK_FORMAT(dip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (whichfork == XFS_DATA_FORK) { + if (S_ISREG(be16_to_cpu(dip->di_mode))) + return __this_address; + if (be64_to_cpu(dip->di_size) > + XFS_DFORK_SIZE(dip, mp, whichfork)) + return __this_address; + } + if (di_nextents) + return __this_address; + break; + case XFS_DINODE_FMT_EXTENTS: + if (di_nextents > XFS_DFORK_MAXEXT(dip, mp, whichfork)) + return __this_address; + break; + case XFS_DINODE_FMT_BTREE: + if (whichfork == XFS_ATTR_FORK) { + if (di_nextents > MAXAEXTNUM) + return __this_address; + } else if (di_nextents > MAXEXTNUM) { + return __this_address; + } + break; + default: + return __this_address; + } + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -441,24 +482,9 @@ xfs_dinode_verify( case S_IFREG: case S_IFLNK: case S_IFDIR: - switch (dip->di_format) { - case XFS_DINODE_FMT_LOCAL: - /* - * no local regular files yet - */ - if (S_ISREG(mode)) - return __this_address; - if (di_size > XFS_DFORK_DSIZE(dip, mp)) - return __this_address; - if (dip->di_nextents) - return __this_address; - /* fall through */ - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - break; - default: - return __this_address; - } + fa = 
xfs_dinode_verify_fork(dip, mp, XFS_DATA_FORK); + if (fa) + return fa; break; case 0: /* Uninitialized inode ok. */ @@ -468,17 +494,9 @@ xfs_dinode_verify( } if (XFS_DFORK_Q(dip)) { - switch (dip->di_aformat) { - case XFS_DINODE_FMT_LOCAL: - if (dip->di_anextents) - return __this_address; - /* fall through */ - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - break; - default: - return __this_address; - } + fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK); + if (fa) + return fa; } else { /* * If there is no fork offset, this may be a freshly-made inode @@ -713,7 +731,8 @@ xfs_inode_validate_extsize( if ((hint_flag || inherit_flag) && extsize == 0) return __this_address; - if (!(hint_flag || inherit_flag) && extsize != 0) + /* free inodes get flags set to zero but extsize remains */ + if (mode && !(hint_flag || inherit_flag) && extsize != 0) return __this_address; if (extsize_bytes % blocksize_bytes) @@ -759,7 +778,8 @@ xfs_inode_validate_cowextsize( if (hint_flag && cowextsize == 0) return __this_address; - if (!hint_flag && cowextsize != 0) + /* free inodes get flags set to zero but cowextsize remains */ + if (mode && !hint_flag && cowextsize != 0) return __this_address; if (hint_flag && rt_flag) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 65fc4ed2e9a1..b228c821bae6 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1029,8 +1029,8 @@ xfs_rtalloc_query_range( if (low_rec->ar_startext >= mp->m_sb.sb_rextents || low_rec->ar_startext == high_rec->ar_startext) return 0; - if (high_rec->ar_startext >= mp->m_sb.sb_rextents) - high_rec->ar_startext = mp->m_sb.sb_rextents - 1; + if (high_rec->ar_startext > mp->m_sb.sb_rextents) + high_rec->ar_startext = mp->m_sb.sb_rextents; /* Iterate the bitmap, looking for discrepancies. 
*/ rtstart = low_rec->ar_startext; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index c35009a86699..83b1e8c6c18f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -685,12 +685,10 @@ out_unlock_iolock: } /* - * dead simple method of punching delalyed allocation blocks from a range in - * the inode. Walks a block at a time so will be slow, but is only executed in - * rare error cases so the overhead is not critical. This will always punch out - * both the start and end blocks, even if the ranges only partially overlap - * them, so it is up to the caller to ensure that partial blocks are not - * passed in. + * Dead simple method of punching delalyed allocation blocks from a range in + * the inode. This will always punch out both the start and end blocks, even + * if the ranges only partially overlap them, so it is up to the caller to + * ensure that partial blocks are not passed in. */ int xfs_bmap_punch_delalloc_range( @@ -698,63 +696,44 @@ xfs_bmap_punch_delalloc_range( xfs_fileoff_t start_fsb, xfs_fileoff_t length) { - xfs_fileoff_t remaining = length; + struct xfs_ifork *ifp = &ip->i_df; + xfs_fileoff_t end_fsb = start_fsb + length; + struct xfs_bmbt_irec got, del; + struct xfs_iext_cursor icur; int error = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - do { - int done; - xfs_bmbt_irec_t imap; - int nimaps = 1; - xfs_fsblock_t firstblock; - struct xfs_defer_ops dfops; + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } - /* - * Map the range first and check that it is a delalloc extent - * before trying to unmap the range. Otherwise we will be - * trying to remove a real extent (which requires a - * transaction) or a hole, which is probably a bad idea... 
- */ - error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, - XFS_BMAPI_ENTIRE); + if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) + return 0; - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "Failed delalloc mapping lookup ino %lld fsb %lld.", - ip->i_ino, start_fsb); - } - break; - } - if (!nimaps) { - /* nothing there */ - goto next_block; - } - if (imap.br_startblock != DELAYSTARTBLOCK) { - /* been converted, ignore */ - goto next_block; - } - WARN_ON(imap.br_blockcount == 0); + while (got.br_startoff + got.br_blockcount > start_fsb) { + del = got; + xfs_trim_extent(&del, start_fsb, length); /* - * Note: while we initialise the firstblock/dfops pair, they - * should never be used because blocks should never be - * allocated or freed for a delalloc extent and hence we need - * don't cancel or finish them after the xfs_bunmapi() call. + * A delete can push the cursor forward. Step back to the + * previous extent on non-delalloc or extents outside the + * target range. 
*/ - xfs_defer_init(&dfops, &firstblock); - error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, - &dfops, &done); - if (error) - break; + if (!del.br_blockcount || + !isnullstartblock(del.br_startblock)) { + if (!xfs_iext_prev_extent(ifp, &icur, &got)) + break; + continue; + } - ASSERT(!xfs_defer_has_unfinished_work(&dfops)); -next_block: - start_fsb++; - remaining--; - } while(remaining > 0); + error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, + &got, &del); + if (error || !xfs_iext_get_extent(ifp, &icur, &got)) + break; + } return error; } @@ -1208,7 +1187,22 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - return iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops); + error = iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops); + if (error) + return error; + + /* + * If we zeroed right up to EOF and EOF straddles a page boundary we + * must make sure that the post-EOF area is also zeroed because the + * page could be mmap'd and iomap_zero_range doesn't do that for us. + * Writeback of the eof page will do this, albeit clumsily. 
+ */ + if (offset + len >= XFS_ISIZE(ip) && ((offset + len) & PAGE_MASK)) { + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + (offset + len) & ~PAGE_MASK, LLONG_MAX); + } + + return error; } /* @@ -1404,6 +1398,10 @@ xfs_insert_file_space( trace_xfs_insert_file_space(ip); + error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb); + if (error) + return error; + error = xfs_prepare_shift(ip, offset); if (error) return error; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index c34fa9c342f2..c7157bc48bd1 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -513,8 +513,8 @@ xfs_getfsmap_rtdev_rtbitmap_query( struct xfs_trans *tp, struct xfs_getfsmap_info *info) { - struct xfs_rtalloc_rec alow; - struct xfs_rtalloc_rec ahigh; + struct xfs_rtalloc_rec alow = { 0 }; + struct xfs_rtalloc_rec ahigh = { 0 }; int error; xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a7afcad6b711..3f2bd6032cf8 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -387,7 +387,7 @@ xfs_reserve_blocks( do { free = percpu_counter_sum(&mp->m_fdblocks) - mp->m_alloc_set_aside; - if (!free) + if (free <= 0) break; delta = request - mp->m_resblks; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7a96c4e0ab5c..5df4de666cc1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3236,7 +3236,6 @@ xfs_iflush_cluster( struct xfs_inode *cip; int nr_found; int clcount = 0; - int bufwasdelwri; int i; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -3360,37 +3359,22 @@ cluster_corrupt_out: * inode buffer and shut down the filesystem. */ rcu_read_unlock(); - /* - * Clean up the buffer. If it was delwri, just release it -- - * brelse can handle it with no problems. If not, shut down the - * filesystem before releasing the buffer. 
- */ - bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q); - if (bufwasdelwri) - xfs_buf_relse(bp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - if (!bufwasdelwri) { - /* - * Just like incore_relse: if we have b_iodone functions, - * mark the buffer as an error and call them. Otherwise - * mark it as stale and brelse. - */ - if (bp->b_iodone) { - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend(bp); - } else { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - } - } - /* - * Unlocks the flush lock + * We'll always have an inode attached to the buffer for completion + * process by the time we are called from xfs_iflush(). Hence we have + * always need to do IO completion processing to abort the inodes + * attached to the buffer. handle them just like the shutdown case in + * xfs_buf_submit(). */ + ASSERT(bp->b_iodone); + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_ioend(bp); + + /* abort the corrupt inode, as it was not attached to the buffer */ xfs_iflush_abort(cip, false); kmem_free(cilist); xfs_perag_put(pag); @@ -3486,12 +3470,17 @@ xfs_iflush( xfs_log_force(mp, 0); /* - * inode clustering: - * see if other inodes can be gathered into this write + * inode clustering: try to gather other inodes into this write + * + * Note: Any error during clustering will result in the filesystem + * being shut down and completion callbacks run on the cluster buffer. + * As we have already flushed and attached this inode to the buffer, + * it has already been aborted and released by xfs_iflush_cluster() and + * so we have no further error handling to do here. 
*/ error = xfs_iflush_cluster(ip, bp); if (error) - goto cluster_corrupt_out; + return error; *bpp = bp; return 0; @@ -3500,12 +3489,8 @@ corrupt_out: if (bp) xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -cluster_corrupt_out: - error = -EFSCORRUPTED; abort_out: - /* - * Unlocks the flush lock - */ + /* abort the corrupt inode, as it was not attached to the buffer */ xfs_iflush_abort(ip, false); return error; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 8a3613d576af..e08a84d9ee72 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -963,12 +963,13 @@ xfs_ilock_for_iomap( unsigned *lockmode) { unsigned mode = XFS_ILOCK_SHARED; + bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); /* * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) { + if (xfs_is_reflink_inode(ip) && is_write) { /* * FIXME: It could still overwrite on unshared extents and not * need allocation. @@ -989,6 +990,7 @@ xfs_ilock_for_iomap( mode = XFS_ILOCK_EXCL; } +relock: if (flags & IOMAP_NOWAIT) { if (!xfs_ilock_nowait(ip, mode)) return -EAGAIN; @@ -996,6 +998,17 @@ xfs_ilock_for_iomap( xfs_ilock(ip, mode); } + /* + * The reflink iflag could have changed since the earlier unlocked + * check, so if we got ILOCK_SHARED for a write and but we're now a + * reflink inode we have to switch to ILOCK_EXCL and relock. 
+ */ + if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) { + xfs_iunlock(ip, mode); + mode = XFS_ILOCK_EXCL; + goto relock; + } + *lockmode = mode; return 0; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 0fa29f39d658..3a75de777843 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1253,7 +1253,7 @@ xfs_setup_inode( inode_sb_list_add(inode); /* make the inode look hashed for the writeback code */ - hlist_add_fake(&inode->i_hash); + inode_fake_hash(inode); inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index e040af120b69..524f543c5b82 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -258,7 +258,12 @@ xfs_trans_alloc( if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); - WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + /* + * Zero-reservation ("empty") transactions can't modify anything, so + * they're allowed to run while we're frozen. + */ + WARN_ON(resp->tr_logres > 0 && + mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); tp = kmem_zone_zalloc(xfs_trans_zone, |