diff options
Diffstat (limited to 'fs')
78 files changed, 886 insertions, 562 deletions
diff --git a/fs/afs/cell.c b/fs/afs/cell.c index cf445dbd5f2e..9de46116c749 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -173,6 +173,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, rcu_assign_pointer(cell->vl_servers, vllist); cell->dns_expiry = TIME64_MAX; + __clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags); } else { cell->dns_expiry = ktime_get_real_seconds(); } diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 0568fd986821..e432bd27a2e7 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -208,7 +208,7 @@ again: /* The new front of the queue now owns the state variables. */ next = list_entry(vnode->pending_locks.next, struct file_lock, fl_u.afs.link); - vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_key = key_get(afs_file_key(next->fl_file)); vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; goto again; @@ -413,7 +413,7 @@ static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl) /* The new front of the queue now owns the state variables. */ next = list_entry(vnode->pending_locks.next, struct file_lock, fl_u.afs.link); - vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_key = key_get(afs_file_key(next->fl_file)); vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; afs_lock_may_be_available(vnode); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 6b17d3620414..1a4ce07fb406 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -414,7 +414,6 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { valid = true; } else { - vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; vnode->cb_v_break = vnode->volume->cb_v_break; valid = false; } @@ -546,6 +545,8 @@ void afs_evict_inode(struct inode *inode) #endif afs_put_permits(rcu_access_pointer(vnode->permit_cache)); + key_put(vnode->lock_key); + vnode->lock_key = NULL; _leave(""); } diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h index 07bc10f076aa..d443e2bfa094 100644 --- a/fs/afs/protocol_yfs.h +++ b/fs/afs/protocol_yfs.h @@ -161,3 +161,14 @@ struct yfs_xdr_YFSStoreVolumeStatus { struct yfs_xdr_u64 max_quota; struct yfs_xdr_u64 file_quota; } __packed; + +enum yfs_lock_type { + yfs_LockNone = -1, + yfs_LockRead = 0, + yfs_LockWrite = 1, + yfs_LockExtend = 2, + yfs_LockRelease = 3, + yfs_LockMandatoryRead = 0x100, + yfs_LockMandatoryWrite = 0x101, + yfs_LockMandatoryExtend = 0x102, +}; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index a7b44863d502..2c588f9bbbda 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -23,6 +23,7 @@ struct workqueue_struct *afs_async_calls; static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); +static void afs_delete_async_call(struct work_struct *); static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); @@ -203,20 +204,26 @@ void afs_put_call(struct afs_call *call) } } +static struct afs_call *afs_get_call(struct afs_call *call, + enum afs_call_trace why) +{ + int u = atomic_inc_return(&call->usage); + + trace_afs_call(call, why, u, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); + return call; +} + /* * Queue the call for actual work. */ static void afs_queue_call_work(struct afs_call *call) { if (call->type->work) { - int u = atomic_inc_return(&call->usage); - - trace_afs_call(call, afs_call_trace_work, u, - atomic_read(&call->net->nr_outstanding_calls), - __builtin_return_address(0)); - INIT_WORK(&call->work, call->type->work); + afs_get_call(call, afs_call_trace_work); if (!queue_work(afs_wq, &call->work)) afs_put_call(call); } @@ -398,6 +405,12 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, } } + /* If the call is going to be asynchronous, we need an extra ref for + * the call to hold itself so the caller need not hang on to its ref. + */ + if (call->async) + afs_get_call(call, afs_call_trace_get); + /* create a call */ rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key, (unsigned long)call, @@ -438,15 +451,17 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, goto error_do_abort; } - /* at this point, an async call may no longer exist as it may have - * already completed */ - if (call->async) + /* Note that at this point, we may have received the reply or an abort + * - and an asynchronous call may already have completed. + */ + if (call->async) { + afs_put_call(call); return -EINPROGRESS; + } return afs_wait_for_call_to_complete(call, ac); error_do_abort: - call->state = AFS_CALL_COMPLETE; if (ret != -ECONNABORTED) { rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, "KSD"); @@ -463,8 +478,24 @@ error_do_abort: error_kill_call: if (call->type->done) call->type->done(call); - afs_put_call(call); + + /* We need to dispose of the extra ref we grabbed for an async call. + * The call, however, might be queued on afs_async_calls and we need to + * make sure we don't get any more notifications that might requeue it. + */ + if (call->rxcall) { + rxrpc_kernel_end_call(call->net->socket, call->rxcall); + call->rxcall = NULL; + } + if (call->async) { + if (cancel_work_sync(&call->async_work)) + afs_put_call(call); + afs_put_call(call); + } + ac->error = ret; + call->state = AFS_CALL_COMPLETE; + afs_put_call(call); _leave(" = %d", ret); return ret; } diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 95d0761cdb34..155dc14caef9 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -42,9 +42,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, if (vldb->fs_mask[i] & type_mask) nr_servers++; - slist = kzalloc(sizeof(struct afs_server_list) + - sizeof(struct afs_server_entry) * nr_servers, - GFP_KERNEL); + slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL); if (!slist) goto error; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 12658c1363ae..5aa57929e8c2 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -803,7 +803,7 @@ int yfs_fs_create_file(struct afs_fs_cursor *fc, bp = xdr_encode_YFSFid(bp, &vnode->fid); bp = xdr_encode_string(bp, name, namesz); bp = xdr_encode_YFSStoreStatus_mode(bp, mode); - bp = xdr_encode_u32(bp, 0); /* ViceLockType */ + bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */ yfs_check_req(call, bp); afs_use_fs_server(call, fc->cbi); @@ -167,9 +167,13 @@ struct kioctx { unsigned id; }; +/* + * First field must be the file pointer in all the + * iocb unions! See also 'struct kiocb' in <linux/fs.h> + */ struct fsync_iocb { - struct work_struct work; struct file *file; + struct work_struct work; bool datasync; }; @@ -183,8 +187,15 @@ struct poll_iocb { struct work_struct work; }; +/* + * NOTE! Each of the iocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'ki_filp' in this struct. + */ struct aio_kiocb { union { + struct file *ki_filp; struct kiocb rw; struct fsync_iocb fsync; struct poll_iocb poll; @@ -1060,6 +1071,8 @@ static inline void iocb_put(struct aio_kiocb *iocb) { if (refcount_read(&iocb->ki_refcnt) == 0 || refcount_dec_and_test(&iocb->ki_refcnt)) { + if (iocb->ki_filp) + fput(iocb->ki_filp); percpu_ref_put(&iocb->ki_ctx->reqs); kmem_cache_free(kiocb_cachep, iocb); } @@ -1424,7 +1437,6 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) file_end_write(kiocb->ki_filp); } - fput(kiocb->ki_filp); aio_complete(iocb, res, res2); } @@ -1432,10 +1444,8 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) { int ret; - req->ki_filp = fget(iocb->aio_fildes); - if (unlikely(!req->ki_filp)) - return -EBADF; req->ki_complete = aio_complete_rw; + req->private = NULL; req->ki_pos = iocb->aio_offset; req->ki_flags = iocb_flags(req->ki_filp); if (iocb->aio_flags & IOCB_FLAG_RESFD) @@ -1450,7 +1460,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); - goto out_fput; + return ret; } req->ki_ioprio = iocb->aio_reqprio; @@ -1459,14 +1469,10 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) - goto out_fput; + return ret; req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ return 0; - -out_fput: - fput(req->ki_filp); - return ret; } static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, @@ -1520,24 +1526,19 @@ static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb, if (ret) return ret; file = req->ki_filp; - - ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_READ))) - goto out_fput; + return -EBADF; ret = -EINVAL; if (unlikely(!file->f_op->read_iter)) - goto out_fput; + return -EINVAL; ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); if (ret) - goto out_fput; + return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) aio_rw_done(req, call_read_iter(file, req, &iter)); kfree(iovec); -out_fput: - if (unlikely(ret)) - fput(file); return ret; } @@ -1554,16 +1555,14 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, return ret; file = req->ki_filp; - ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_WRITE))) - goto out_fput; - ret = -EINVAL; + return -EBADF; if (unlikely(!file->f_op->write_iter)) - goto out_fput; + return -EINVAL; ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); if (ret) - goto out_fput; + return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { /* @@ -1581,9 +1580,6 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, aio_rw_done(req, call_write_iter(file, req, &iter)); } kfree(iovec); -out_fput: - if (unlikely(ret)) - fput(file); return ret; } @@ -1593,7 +1589,6 @@ static void aio_fsync_work(struct work_struct *work) int ret; ret = vfs_fsync(req->file, req->datasync); - fput(req->file); aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); } @@ -1604,13 +1599,8 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, iocb->aio_rw_flags)) return -EINVAL; - req->file = fget(iocb->aio_fildes); - if (unlikely(!req->file)) - return -EBADF; - if (unlikely(!req->file->f_op->fsync)) { - fput(req->file); + if (unlikely(!req->file->f_op->fsync)) return -EINVAL; - } req->datasync = datasync; INIT_WORK(&req->work, aio_fsync_work); @@ -1620,10 +1610,7 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) { - struct file *file = iocb->poll.file; - aio_complete(iocb, mangle_poll(mask), 0); - fput(file); } static void aio_poll_complete_work(struct work_struct *work) @@ -1748,9 +1735,6 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) INIT_WORK(&req->work, aio_poll_complete_work); req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; - req->file = fget(iocb->aio_fildes); - if (unlikely(!req->file)) - return -EBADF; req->head = NULL; req->woken = false; @@ -1793,10 +1777,8 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) spin_unlock_irq(&ctx->ctx_lock); out: - if (unlikely(apt.error)) { - fput(req->file); + if (unlikely(apt.error)) return apt.error; - } if (mask) aio_poll_complete(aiocb, mask); @@ -1834,6 +1816,11 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, if (unlikely(!req)) goto out_put_reqs_available; + req->ki_filp = fget(iocb->aio_fildes); + ret = -EBADF; + if (unlikely(!req->ki_filp)) + goto out_put_req; + if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index d441244b79df..28d9c2b1b3bb 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -596,7 +596,6 @@ int autofs_expire_run(struct super_block *sb, pkt.len = dentry->d_name.len; memcpy(pkt.name, dentry->d_name.name, pkt.len); pkt.name[pkt.len] = '\0'; - dput(dentry); if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire))) ret = -EFAULT; @@ -609,6 +608,8 @@ int autofs_expire_run(struct super_block *sb, complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); + dput(dentry); + return ret; } diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 0e8ea2d9a2bb..078992eee299 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -266,8 +266,10 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) } root_inode = autofs_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); - if (!root) + if (!root) { + ret = -ENOMEM; goto fail_ino; + } pipe = NULL; root->d_fsdata = ino; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index d0078cbb718b..e996174cbfc0 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -14,13 +14,30 @@ #include <linux/err.h> #include <linux/fs.h> +static inline bool spacetab(char c) { return c == ' ' || c == '\t'; } +static inline char *next_non_spacetab(char *first, const char *last) +{ + for (; first <= last; first++) + if (!spacetab(*first)) + return first; + return NULL; +} +static inline char *next_terminator(char *first, const char *last) +{ + for (; first <= last; first++) + if (spacetab(*first) || !*first) + return first; + return NULL; +} + static int load_script(struct linux_binprm *bprm) { const char *i_arg, *i_name; - char *cp; + char *cp, *buf_end; struct file *file; int retval; + /* Not ours to exec if we don't start with "#!". */ if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) return -ENOEXEC; @@ -33,23 +50,41 @@ static int load_script(struct linux_binprm *bprm) if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) return -ENOENT; - /* - * This section does the #! interpretation. - * Sorta complicated, but hopefully it will work. -TYT - */ - + /* Release since we are not mapping a binary into memory. */ allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; - for (cp = bprm->buf+2;; cp++) { - if (cp >= bprm->buf + BINPRM_BUF_SIZE) + /* + * This section handles parsing the #! line into separate + * interpreter path and argument strings. We must be careful + * because bprm->buf is not yet guaranteed to be NUL-terminated + * (though the buffer will have trailing NUL padding when the + * file size was smaller than the buffer size). + * + * We do not want to exec a truncated interpreter path, so either + * we find a newline (which indicates nothing is truncated), or + * we find a space/tab/NUL after the interpreter path (which + * itself may be preceded by spaces/tabs). Truncating the + * arguments is fine: the interpreter can re-read the script to + * parse them on its own. + */ + buf_end = bprm->buf + sizeof(bprm->buf) - 1; + cp = strnchr(bprm->buf, sizeof(bprm->buf), '\n'); + if (!cp) { + cp = next_non_spacetab(bprm->buf + 2, buf_end); + if (!cp) + return -ENOEXEC; /* Entire buf is spaces/tabs */ + /* + * If there is no later space/tab/NUL we must assume the + * interpreter path is truncated. + */ + if (!next_terminator(cp, buf_end)) return -ENOEXEC; - if (!*cp || (*cp == '\n')) - break; + cp = buf_end; } + /* NUL-terminate the buffer and any trailing spaces/tabs. */ *cp = '\0'; - while (cp > bprm->buf) { cp--; if ((*cp == ' ') || (*cp == '\t')) diff --git a/fs/block_dev.c b/fs/block_dev.c index c546cdce77e6..58a4c1217fa8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -104,6 +104,20 @@ void invalidate_bdev(struct block_device *bdev) } EXPORT_SYMBOL(invalidate_bdev); +static void set_init_blocksize(struct block_device *bdev) +{ + unsigned bsize = bdev_logical_block_size(bdev); + loff_t size = i_size_read(bdev->bd_inode); + + while (bsize < PAGE_SIZE) { + if (size & bsize) + break; + bsize <<= 1; + } + bdev->bd_block_size = bsize; + bdev->bd_inode->i_blkbits = blksize_bits(bsize); +} + int set_blocksize(struct block_device *bdev, int size) { /* Size must be a power of two, and between 512 and PAGE_SIZE */ @@ -1431,18 +1445,9 @@ EXPORT_SYMBOL(check_disk_change); void bd_set_size(struct block_device *bdev, loff_t size) { - unsigned bsize = bdev_logical_block_size(bdev); - inode_lock(bdev->bd_inode); i_size_write(bdev->bd_inode, size); inode_unlock(bdev->bd_inode); - while (bsize < PAGE_SIZE) { - if (size & bsize) - break; - bsize <<= 1; - } - bdev->bd_block_size = bsize; - bdev->bd_inode->i_blkbits = blksize_bits(bsize); } EXPORT_SYMBOL(bd_set_size); @@ -1519,8 +1524,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + set_init_blocksize(bdev); + } /* * If the device is invalidated, rescan partition @@ -1555,6 +1562,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + set_init_blocksize(bdev); } if (bdev->bd_bdi == &noop_backing_dev_info) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f64aad613727..5a6c39b44c84 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -968,6 +968,48 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return 0; } +static struct extent_buffer *alloc_tree_block_no_bg_flush( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent_start, + const struct btrfs_disk_key *disk_key, + int level, + u64 hint, + u64 empty_size) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *ret; + + /* + * If we are COWing a node/leaf from the extent, chunk, device or free + * space trees, make sure that we do not finish block group creation of + * pending block groups. We do this to avoid a deadlock. + * COWing can result in allocation of a new chunk, and flushing pending + * block groups (btrfs_create_pending_block_groups()) can be triggered + * when finishing allocation of a new chunk. Creation of a pending block + * group modifies the extent, chunk, device and free space trees, + * therefore we could deadlock with ourselves since we are holding a + * lock on an extent buffer that btrfs_create_pending_block_groups() may + * try to COW later. + * For similar reasons, we also need to delay flushing pending block + * groups when splitting a leaf or node, from one of those trees, since + * we are holding a write lock on it and its parent or when inserting a + * new root node for one of those trees. + */ + if (root == fs_info->extent_root || + root == fs_info->chunk_root || + root == fs_info->dev_root || + root == fs_info->free_space_root) + trans->can_flush_pending_bgs = false; + + ret = btrfs_alloc_tree_block(trans, root, parent_start, + root->root_key.objectid, disk_key, level, + hint, empty_size); + trans->can_flush_pending_bgs = true; + + return ret; +} + /* * does the dirty work in cow of a single block. The parent block (if * supplied) is updated to point to the new cow copy. The new buffer is marked @@ -1015,28 +1057,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) parent_start = parent->start; - /* - * If we are COWing a node/leaf from the extent, chunk, device or free - * space trees, make sure that we do not finish block group creation of - * pending block groups. We do this to avoid a deadlock. - * COWing can result in allocation of a new chunk, and flushing pending - * block groups (btrfs_create_pending_block_groups()) can be triggered - * when finishing allocation of a new chunk. Creation of a pending block - * group modifies the extent, chunk, device and free space trees, - * therefore we could deadlock with ourselves since we are holding a - * lock on an extent buffer that btrfs_create_pending_block_groups() may - * try to COW later. - */ - if (root == fs_info->extent_root || - root == fs_info->chunk_root || - root == fs_info->dev_root || - root == fs_info->free_space_root) - trans->can_flush_pending_bgs = false; - - cow = btrfs_alloc_tree_block(trans, root, parent_start, - root->root_key.objectid, &disk_key, level, - search_start, empty_size); - trans->can_flush_pending_bgs = true; + cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, + level, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -3345,8 +3367,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, else btrfs_node_key(lower, &lower_key, 0); - c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &lower_key, level, root->node->start, 0); + c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, + root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -3475,8 +3497,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, mid = (c_nritems + 1) / 2; btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, level, c->start, 0); + split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, + c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -4260,8 +4282,8 @@ again: else btrfs_item_key(l, &disk_key, mid); - right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0); + right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, + l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0a68cf7032f5..7a2a2621f0d9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -35,6 +35,7 @@ struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; +struct btrfs_delayed_ref_root; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; @@ -786,6 +787,9 @@ enum { * main phase. The fs_info::balance_ctl is initialized. */ BTRFS_FS_BALANCE_RUNNING, + + /* Indicate that the cleaner thread is awake and doing something. */ + BTRFS_FS_CLEANER_RUNNING, }; struct btrfs_fs_info { @@ -2661,6 +2665,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, unsigned long count, u64 transid, int wait); +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8da2f380d3c0..6a2a2a951705 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1682,6 +1682,8 @@ static int cleaner_kthread(void *arg) while (1) { again = 0; + set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); + /* Make the cleaner go to sleep early. */ if (btrfs_need_cleaner_sleep(fs_info)) goto sleep; @@ -1728,6 +1730,7 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(fs_info); sleep: + clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); if (kthread_should_park()) kthread_parkme(); if (kthread_should_stop()) @@ -4201,6 +4204,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); + + /* + * We need this here because if we've been flipped read-only we won't + * get sync() from the umount, so we need to make sure any ordered + * extents that haven't had their dirty pages IO start writeout yet + * actually get run and error out properly. + */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, @@ -4265,6 +4276,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (pin_bytes) btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes, 1); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b15afeae16df..d81035b7ea7d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2456,12 +2456,10 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, return ret ? ret : 1; } -static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_ref_root *delayed_refs = - &trans->transaction->delayed_refs; int nr_items = 1; /* Dropping this ref head update. */ if (head->total_ref_mod < 0) { @@ -2544,7 +2542,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - cleanup_ref_head_accounting(trans, head); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); @@ -4954,6 +4952,15 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = 0; break; case COMMIT_TRANS: + /* + * If we have pending delayed iputs then we could free up a + * bunch of pinned space, so make sure we run the iputs before + * we do our pinned bytes check below. + */ + mutex_lock(&fs_info->cleaner_delayed_iput_mutex); + btrfs_run_delayed_iputs(fs_info); + mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); + ret = may_commit_transaction(fs_info, space_info); break; default: @@ -7188,7 +7195,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; - cleanup_ref_head_accounting(trans, head); + btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 43eb4535319d..5c349667c761 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3129,9 +3129,6 @@ out: /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - /* Try to release some metadata so we don't get an OOM but don't wait */ - btrfs_btree_balance_dirty_nodelay(fs_info); - return ret; } @@ -3254,6 +3251,8 @@ void btrfs_add_delayed_iput(struct inode *inode) ASSERT(list_empty(&binode->delayed_iput)); list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); + if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c5586ffd1426..0a3f122dd61f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1621,6 +1621,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, flags | SB_RDONLY, device_name, data); if (IS_ERR(mnt_root)) { root = ERR_CAST(mnt_root); + kfree(subvol_name); goto out; } @@ -1630,12 +1631,14 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error < 0) { root = ERR_PTR(error); mntput(mnt_root); + kfree(subvol_name); goto out; } } } if (IS_ERR(mnt_root)) { root = ERR_CAST(mnt_root); + kfree(subvol_name); goto out; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 127fa1535f58..4ec2b660d014 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -850,14 +850,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_chunk_metadata(trans); - if (lock && should_end_transaction(trans) && - READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { - spin_lock(&info->trans_lock); - if (cur_trans->state == TRANS_STATE_RUNNING) - cur_trans->state = TRANS_STATE_BLOCKED; - spin_unlock(&info->trans_lock); - } - if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { if (throttle) return btrfs_commit_transaction(trans); @@ -1879,6 +1871,21 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) kmem_cache_free(btrfs_trans_handle_cachep, trans); } +/* + * Release reserved delayed ref space of all pending block groups of the + * transaction and remove them from the list + */ +static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *block_group, *tmp; + + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { + btrfs_delayed_refs_rsv_release(fs_info, 1); + list_del_init(&block_group->bg_list); + } +} + static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { /* @@ -2270,6 +2277,7 @@ scrub_continue: btrfs_scrub_continue(fs_info); cleanup_transaction: btrfs_trans_release_metadata(trans); + btrfs_cleanup_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; btrfs_warn(fs_info, "Skipping commit of aborted transaction."); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3e4f8f88353e..15561926ab32 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -957,11 +957,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, else fs_devices = alloc_fs_devices(disk_super->fsid, NULL); - fs_devices->fsid_change = fsid_change_in_progress; - if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); + fs_devices->fsid_change = fsid_change_in_progress; + mutex_lock(&fs_devices->device_list_mutex); list_add(&fs_devices->fs_list, &fs_uuids); diff --git a/fs/buffer.c b/fs/buffer.c index 52d024bfdbc1..48318fb74938 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -200,6 +200,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) struct buffer_head *head; struct page *page; int all_mapped = 1; + static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1); index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED); @@ -227,15 +228,15 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) * file io on the block device and getblk. It gets dealt with * elsewhere, don't buffer_error if we had some unmapped buffers */ - if (all_mapped) { - printk("__find_get_block_slow() failed. " - "block=%llu, b_blocknr=%llu\n", - (unsigned long long)block, - (unsigned long long)bh->b_blocknr); - printk("b_state=0x%08lx, b_size=%zu\n", - bh->b_state, bh->b_size); - printk("device %pg blocksize: %d\n", bdev, - 1 << bd_inode->i_blkbits); + ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE); + if (all_mapped && __ratelimit(&last_warned)) { + printk("__find_get_block_slow() failed. block=%llu, " + "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, " + "device %pg blocksize: %d\n", + (unsigned long long)block, + (unsigned long long)bh->b_blocknr, + bh->b_state, bh->b_size, bdev, + 1 << bd_inode->i_blkbits); } out_unlock: spin_unlock(&bd_mapping->private_lock); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 94c026bba2c2..bba28a5034ba 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1035,6 +1035,8 @@ static void drop_inode_snap_realm(struct ceph_inode_info *ci) list_del_init(&ci->i_snap_realm_item); ci->i_snap_realm_counter++; ci->i_snap_realm = NULL; + if (realm->ino == ci->i_vino.ino) + realm->inode = NULL; spin_unlock(&realm->inodes_with_caps_lock); ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc, realm); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 03f4d24db8fe..9455d3aef0c3 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -3,19 +3,6 @@ * quota.c - CephFS quota * * Copyright (C) 2017-2018 SUSE - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <linux/statfs.h> diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 041c27ea8de1..f74193da0e09 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -616,7 +616,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->size); spin_lock(&mdsc->snap_flush_lock); - list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + if (list_empty(&ci->i_snap_flush_item)) + list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); spin_unlock(&mdsc->snap_flush_lock); return 1; /* caller may want to ceph_flush_snaps */ } diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 593fb422d0f3..e92a2fee3c57 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -252,6 +252,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_printf(m, ",ACL"); #endif seq_putc(m, '\n'); + seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); seq_printf(m, "Servers:"); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index d1f9c2f3f575..7652551a1fc4 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.16" +#define CIFS_VERSION "2.17" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index e18915415e13..bb54ccf8481c 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1549,18 +1549,26 @@ cifs_discard_remaining_data(struct TCP_Server_Info *server) } static int -cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid, + bool malformed) { int length; - struct cifs_readdata *rdata = mid->callback_data; length = cifs_discard_remaining_data(server); - dequeue_mid(mid, rdata->result); + dequeue_mid(mid, malformed); mid->resp_buf = server->smallbuf; server->smallbuf = NULL; return length; } +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + + return __cifs_readv_discard(server, mid, rdata->result); +} + int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) { @@ -1602,12 +1610,23 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return -1; } + /* set up first two iov for signature check and to get credits */ + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = 4; + rdata->iov[1].iov_base = buf + 4; + rdata->iov[1].iov_len = server->total_read - 4; + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", + rdata->iov[1].iov_base, rdata->iov[1].iov_len); + /* Was the SMB read successful? */ rdata->result = server->ops->map_error(buf, false); if (rdata->result != 0) { cifs_dbg(FYI, "%s: server returned error %d\n", __func__, rdata->result); - return cifs_readv_discard(server, mid); + /* normal error on read response */ + return __cifs_readv_discard(server, mid, false); } /* Is there enough to get to the rest of the READ_RSP header? */ @@ -1651,14 +1670,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) server->total_read += length; } - /* set up first iov for signature check */ - rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; - rdata->iov[1].iov_len = server->total_read - 4; - cifs_dbg(FYI, "0: iov_base=%p iov_len=%u\n", - rdata->iov[0].iov_base, server->total_read); - /* how much data is in the response? */ #ifdef CONFIG_CIFS_SMB_DIRECT use_rdma_mr = rdata->mr; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 683310f26171..8463c940e0e5 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -720,6 +720,21 @@ server_unresponsive(struct TCP_Server_Info *server) return false; } +static inline bool +zero_credits(struct TCP_Server_Info *server) +{ + int val; + + spin_lock(&server->req_lock); + val = server->credits + server->echo_credits + server->oplock_credits; + if (server->in_flight == 0 && val == 0) { + spin_unlock(&server->req_lock); + return true; + } + spin_unlock(&server->req_lock); + return false; +} + static int cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) { @@ -732,6 +747,12 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) for (total_read = 0; msg_data_left(smb_msg); total_read += length) { try_to_freeze(); + /* reconnect if no credits and no requests in flight */ + if (zero_credits(server)) { + cifs_reconnect(server); + return -ECONNABORTED; + } + if (server_unresponsive(server)) return -ECONNABORTED; if (cifs_rdma_enabled(server) && server->smbd_conn) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 2c7689f3998d..659ce1b92c44 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2696,6 +2696,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, rc = cifs_write_allocate_pages(wdata->pages, nr_pages); if (rc) { + kvfree(wdata->pages); kfree(wdata); add_credits_and_wake_if(server, credits, 0); break; @@ -2707,6 +2708,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, if (rc) { for (i = 0; i < nr_pages; i++) put_page(wdata->pages[i]); + kvfree(wdata->pages); kfree(wdata); add_credits_and_wake_if(server, credits, 0); break; @@ -3386,8 +3388,12 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, } rc = cifs_read_allocate_pages(rdata, npages); - if (rc) - goto error; + if (rc) { + kvfree(rdata->pages); + kfree(rdata); + add_credits_and_wake_if(server, credits, 0); + break; + } rdata->tailsz = PAGE_SIZE; } @@ -3407,7 +3413,6 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, if (!rdata->cfile->invalidHandle || !(rc = cifs_reopen_file(rdata->cfile, true))) rc = server->ops->async_readv(rdata); -error: if (rc) { add_credits_and_wake_if(server, rdata->credits, 0); kref_put(&rdata->refcount, diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index f14533da3a93..01a76bccdb8d 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -293,6 +293,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_file_all_info *smb2_data; __u32 create_options = 0; + struct cifs_fid fid; + bool no_cached_open = tcon->nohandlecache; *adjust_tz = false; *symlink = false; @@ -301,6 +303,21 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, GFP_KERNEL); if (smb2_data == NULL) return -ENOMEM; + + /* If it is a root and its handle is cached then use it */ + if (!strlen(full_path) && !no_cached_open) { + rc = open_shroot(xid, tcon, &fid); + if (rc) + goto out; + rc = SMB2_query_info(xid, tcon, fid.persistent_fid, + fid.volatile_fid, smb2_data); + close_shroot(&tcon->crfid); + if (rc) + goto out; + move_smb2_info_to_cifs(data, smb2_data); + goto out; + } + if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 6a9c47541c53..7b8b58fb4d3f 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -648,6 +648,13 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK) return false; + if (rsp->sync_hdr.CreditRequest) { + spin_lock(&server->req_lock); + server->credits += le16_to_cpu(rsp->sync_hdr.CreditRequest); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + } + if (rsp->StructureSize != smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { if (le16_to_cpu(rsp->StructureSize) == 44) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index cf7eb891804f..6f96e2292856 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -34,6 +34,7 @@ #include "cifs_ioctl.h" #include "smbdirect.h" +/* Change credits for different ops and return the total number of credits */ static int change_conf(struct TCP_Server_Info *server) { @@ -41,17 +42,15 @@ change_conf(struct TCP_Server_Info *server) server->oplock_credits = server->echo_credits = 0; switch (server->credits) { case 0: - return -1; + return 0; case 1: server->echoes = false; server->oplocks = false; - cifs_dbg(VFS, "disabling echoes and oplocks\n"); break; case 2: server->echoes = true; server->oplocks = false; server->echo_credits = 1; - cifs_dbg(FYI, "disabling oplocks\n"); break; default: server->echoes = true; @@ -64,14 +63,15 @@ change_conf(struct TCP_Server_Info *server) server->echo_credits = 1; } server->credits -= server->echo_credits + server->oplock_credits; - return 0; + return server->credits + server->echo_credits + server->oplock_credits; } static void smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, const int optype) { - int *val, rc = 0; + int *val, rc = -1; + spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); @@ -101,8 +101,26 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, } spin_unlock(&server->req_lock); wake_up(&server->request_q); - if (rc) - cifs_reconnect(server); + + if (server->tcpStatus == CifsNeedReconnect) + return; + + switch (rc) { + case -1: + /* change_conf hasn't been executed */ + break; + case 0: + cifs_dbg(VFS, "Possible client or server bug - zero credits\n"); + break; + case 1: + cifs_dbg(VFS, "disabling echoes and oplocks\n"); + break; + case 2: + cifs_dbg(FYI, "disabling oplocks\n"); + break; + default: + cifs_dbg(FYI, "add %u credits total=%d\n", add, rc); + } } static void @@ -136,7 +154,11 @@ smb2_get_credits(struct mid_q_entry *mid) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf; - return le16_to_cpu(shdr->CreditRequest); + if (mid->mid_state == MID_RESPONSE_RECEIVED + || mid->mid_state == MID_RESPONSE_MALFORMED) + return le16_to_cpu(shdr->CreditRequest); + + return 0; } static int @@ -165,14 +187,14 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, scredits = server->credits; /* can deadlock with reopen */ - if (scredits == 1) { + if (scredits <= 8) { *num = SMB2_MAX_BUFFER_SIZE; *credits = 0; break; } - /* leave one credit for a possible reopen */ - scredits--; + /* leave some credits for reopen and other ops */ + scredits -= 8; *num = min_t(unsigned int, size, scredits * SMB2_MAX_BUFFER_SIZE); @@ -844,7 +866,9 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, FILE_READ_EA, FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, - SMB2_MAX_EA_BUF, + CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE, &rsp_iov, &buftype, cifs_sb); if (rc) { /* @@ -3189,11 +3213,23 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, server->ops->is_status_pending(buf, server, 0)) return -1; - rdata->result = server->ops->map_error(buf, false); + /* set up first two iov to get credits */ + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = 4; + rdata->iov[1].iov_base = buf + 4; + rdata->iov[1].iov_len = + min_t(unsigned int, buf_len, server->vals->read_rsp_size) - 4; + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", + rdata->iov[1].iov_base, rdata->iov[1].iov_len); + + rdata->result = server->ops->map_error(buf, true); if (rdata->result != 0) { cifs_dbg(FYI, "%s: server returned error %d\n", __func__, rdata->result); - dequeue_mid(mid, rdata->result); + /* normal error on read response */ + dequeue_mid(mid, false); return 0; } @@ -3266,14 +3302,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - /* set up first iov for signature check */ - rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; - rdata->iov[1].iov_len = server->vals->read_rsp_size - 4; - cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", - rdata->iov[0].iov_base, server->vals->read_rsp_size); - length = rdata->copy_into_pages(server, rdata, &iter); kfree(bvec); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 50811a7dc0e0..77b3aaa39b35 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2816,6 +2816,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, int resp_buftype = CIFS_NO_BUFFER; struct cifs_ses *ses = tcon->ses; int flags = 0; + bool allocated = false; cifs_dbg(FYI, "Query Info\n"); @@ -2855,14 +2856,21 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, "Error %d allocating memory for acl\n", rc); *dlen = 0; + rc = -ENOMEM; goto qinf_exit; } + allocated = true; } } rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, min_len, *data); + if (rc && allocated) { + kfree(*data); + *data = NULL; + *dlen = 0; + } qinf_exit: SMB2_query_info_free(&rqst); @@ -2916,9 +2924,10 @@ smb2_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf; - unsigned int credits_received = 1; + unsigned int credits_received = 0; - if (mid->mid_state == MID_RESPONSE_RECEIVED) + if (mid->mid_state == MID_RESPONSE_RECEIVED + || mid->mid_state == MID_RESPONSE_MALFORMED) credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); DeleteMidQEntry(mid); @@ -3175,7 +3184,7 @@ smb2_readv_callback(struct mid_q_entry *mid) struct TCP_Server_Info *server = tcon->ses->server; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)rdata->iov[0].iov_base; - unsigned int credits_received = 1; + unsigned int credits_received = 0; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, .rq_pages = rdata->pages, @@ -3214,6 +3223,9 @@ smb2_readv_callback(struct mid_q_entry *mid) task_io_account_read(rdata->got_bytes); cifs_stats_bytes_read(tcon, rdata->got_bytes); break; + case MID_RESPONSE_MALFORMED: + credits_received = le16_to_cpu(shdr->CreditRequest); + /* fall through */ default: if (rdata->result != -ENODATA) rdata->result = -EIO; @@ -3229,8 +3241,17 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->mr = NULL; } #endif - if (rdata->result) + if (rdata->result && rdata->result != -ENODATA) { cifs_stats_fail_inc(tcon, SMB2_READ_HE); + trace_smb3_read_err(0 /* xid */, + rdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, rdata->offset, + rdata->bytes, rdata->result); + } else + trace_smb3_read_done(0 /* xid */, + rdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->offset, rdata->got_bytes); queue_work(cifsiod_wq, &rdata->work); DeleteMidQEntry(mid); @@ -3305,13 +3326,11 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); - trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid, - io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length); - } else - trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid, - io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length); + trace_smb3_read_err(0 /* xid */, io_parms.persistent_fid, + io_parms.tcon->tid, + io_parms.tcon->ses->Suid, + io_parms.offset, io_parms.length, rc); + } cifs_small_buf_release(buf); return rc; @@ -3355,10 +3374,11 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc != -ENODATA) { cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); cifs_dbg(VFS, "Send error in read = %d\n", rc); + trace_smb3_read_err(xid, req->PersistentFileId, + io_parms->tcon->tid, ses->Suid, + io_parms->offset, io_parms->length, + rc); } - trace_smb3_read_err(rc, xid, req->PersistentFileId, - io_parms->tcon->tid, ses->Suid, - io_parms->offset, io_parms->length); free_rsp_buf(resp_buftype, rsp_iov.iov_base); return rc == -ENODATA ? 0 : rc; } else @@ -3399,7 +3419,7 @@ smb2_writev_callback(struct mid_q_entry *mid) struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; - unsigned int credits_received = 1; + unsigned int credits_received = 0; switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -3427,6 +3447,9 @@ smb2_writev_callback(struct mid_q_entry *mid) case MID_RETRY_NEEDED: wdata->result = -EAGAIN; break; + case MID_RESPONSE_MALFORMED: + credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); + /* fall through */ default: wdata->result = -EIO; break; @@ -3444,8 +3467,17 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->mr = NULL; } #endif - if (wdata->result) + if (wdata->result) { cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); + trace_smb3_write_err(0 /* no xid */, + wdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, wdata->offset, + wdata->bytes, wdata->result); + } else + trace_smb3_write_done(0 /* no xid */, + wdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + wdata->offset, wdata->bytes); queue_work(cifsiod_wq, &wdata->work); DeleteMidQEntry(mid); @@ -3587,10 +3619,7 @@ smb2_async_writev(struct cifs_writedata *wdata, wdata->bytes, rc); kref_put(&wdata->refcount, release); cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); - } else - trace_smb3_write_done(0 /* no xid */, req->PersistentFileId, - tcon->tid, tcon->ses->Suid, wdata->offset, - wdata->bytes); + } async_writev_out: cifs_small_buf_release(req); @@ -3816,8 +3845,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { srch_inf->endOfSearch = true; rc = 0; - } - cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); + } else + cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); goto qdir_exit; } @@ -4412,8 +4441,8 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); - please_key_low = (__u64 *)req->LeaseKey; - please_key_high = (__u64 *)(req->LeaseKey+8); + please_key_low = (__u64 *)lease_key; + please_key_high = (__u64 *)(lease_key+8); if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid, diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 7a2d0a2255e6..538e2299805f 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -84,8 +84,9 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 -/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */ -#define MAX_SMB2_HDR_SIZE 0x00b0 +/* 52 transform hdr + 64 hdr + 88 create rsp */ +#define SMB2_TRANSFORM_HEADER_SIZE 52 +#define MAX_SMB2_HDR_SIZE 204 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) #define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) @@ -648,6 +649,13 @@ struct smb2_create_req { __u8 Buffer[0]; } __packed; +/* + * Maximum size of a SMB2_CREATE response is 64 (smb2 header) + + * 88 (fixed part of create response) + 520 (path) + 150 (contexts) + + * 2 bytes of padding. + */ +#define MAX_SMB2_CREATE_RESPONSE_SIZE 824 + struct smb2_create_rsp { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 89 */ @@ -996,6 +1004,11 @@ struct smb2_close_req { __u64 VolatileFileId; /* opaque endianness */ } __packed; +/* + * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data) + */ +#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124 + struct smb2_close_rsp { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* 60 */ @@ -1398,8 +1411,6 @@ struct smb2_file_link_info { /* encoding of request for level 11 */ char FileName[0]; /* Name to be assigned to new link */ } __packed; /* level 11 Set */ -#define SMB2_MAX_EA_BUF 65536 - struct smb2_file_full_ea_info { /* encoding of response for level 15 */ __le32 next_entry_offset; __u8 flags; diff --git a/fs/cifs/trace.c b/fs/cifs/trace.c index bd4a546feec1..465483787193 100644 --- a/fs/cifs/trace.c +++ b/fs/cifs/trace.c @@ -3,16 +3,6 @@ * Copyright (C) 2018, Microsoft Corporation. * * Author(s): Steve French <stfrench@microsoft.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index fb049809555f..59be48206932 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -3,16 +3,6 @@ * Copyright (C) 2018, Microsoft Corporation. * * Author(s): Steve French <stfrench@microsoft.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cifs diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 202e0e84efdd..53532bd3f50d 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -786,17 +786,8 @@ static void cifs_compound_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->server; - unsigned int optype = mid->optype; - unsigned int credits_received = 0; - if (mid->mid_state == MID_RESPONSE_RECEIVED) { - if (mid->resp_buf) - credits_received = server->ops->get_credits(mid); - else - cifs_dbg(FYI, "Bad state for cancelled MID\n"); - } - - add_credits(server, credits_received, optype); + add_credits(server, server->ops->get_credits(mid), mid->optype); } static void diff --git a/fs/dcache.c b/fs/dcache.c index 2593153471cf..aac41adf4743 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -119,6 +119,7 @@ struct dentry_stat_t dentry_stat = { static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); +static DEFINE_PER_CPU(long, nr_dentry_negative); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) @@ -152,11 +153,22 @@ static long get_nr_dentry_unused(void) return sum < 0 ? 0 : sum; } +static long get_nr_dentry_negative(void) +{ + int i; + long sum = 0; + + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry_negative, i); + return sum < 0 ? 0 : sum; +} + int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); + dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif @@ -317,6 +329,8 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; + if (dentry->d_flags & DCACHE_LRU_LIST) + this_cpu_inc(nr_dentry_negative); } static void dentry_free(struct dentry *dentry) @@ -371,6 +385,11 @@ static void dentry_unlink_inode(struct dentry * dentry) * The per-cpu "nr_dentry_unused" counters are updated with * the DCACHE_LRU_LIST bit. * + * The per-cpu "nr_dentry_negative" counters are only updated + * when deleted from or added to the per-superblock LRU list, not + * from/to the shrink list. That is to avoid an unneeded dec/inc + * pair when moving from LRU to shrink list in select_collect(). + * * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. */ @@ -380,6 +399,8 @@ static void d_lru_add(struct dentry *dentry) D_FLAG_VERIFY(dentry, 0); dentry->d_flags |= DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_inc(nr_dentry_negative); WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } @@ -388,6 +409,8 @@ static void d_lru_del(struct dentry *dentry) D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } @@ -418,6 +441,8 @@ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); list_lru_isolate(lru, &dentry->d_lru); } @@ -426,6 +451,8 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); list_lru_isolate_move(lru, &dentry->d_lru, list); } @@ -1188,15 +1215,11 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, */ void shrink_dcache_sb(struct super_block *sb) { - long freed; - do { LIST_HEAD(dispose); - freed = list_lru_walk(&sb->s_dentry_lru, + list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); - - this_cpu_sub(nr_dentry_unused, freed); shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0); } @@ -1820,6 +1843,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) WARN_ON(d_in_lookup(dentry)); spin_lock(&dentry->d_lock); + /* + * Decrement negative dentry count if it was in the LRU list. + */ + if (dentry->d_flags & DCACHE_LRU_LIST) + this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 13b01351dd1c..29c68c5d44d5 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -324,7 +324,7 @@ static struct dentry *failed_creating(struct dentry *dentry) inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); - return NULL; + return ERR_PTR(-ENOMEM); } static struct dentry *end_creating(struct dentry *dentry) @@ -347,7 +347,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, dentry = start_creating(name, parent); if (IS_ERR(dentry)) - return NULL; + return dentry; inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) @@ -386,7 +386,8 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. @@ -464,7 +465,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. @@ -495,7 +497,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. @@ -506,7 +509,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) struct inode *inode; if (IS_ERR(dentry)) - return NULL; + return dentry; inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) @@ -545,7 +548,7 @@ struct dentry *debugfs_create_automount(const char *name, struct inode *inode; if (IS_ERR(dentry)) - return NULL; + return dentry; inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) @@ -581,8 +584,8 @@ EXPORT_SYMBOL(debugfs_create_automount); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the symbolic * link is to be removed (no automatic cleanup happens if your module is - * unloaded, you are responsible here.) If an error occurs, %NULL will be - * returned. + * unloaded, you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) + * will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. @@ -594,12 +597,12 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, struct inode *inode; char *link = kstrdup(target, GFP_KERNEL); if (!link) - return NULL; + return ERR_PTR(-ENOMEM); dentry = start_creating(name, parent); if (IS_ERR(dentry)) { kfree(link); - return NULL; + return dentry; } inode = debugfs_get_inode(dentry->d_sb); @@ -787,6 +790,13 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *dentry = NULL, *trap; struct name_snapshot old_name; + if (IS_ERR(old_dir)) + return old_dir; + if (IS_ERR(new_dir)) + return new_dir; + if (IS_ERR_OR_NULL(old_dentry)) + return old_dentry; + trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? */ if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir)) @@ -820,7 +830,9 @@ exit: if (dentry && !IS_ERR(dentry)) dput(dentry); unlock_rename(new_dir, old_dir); - return NULL; + if (IS_ERR(dentry)) + return dentry; + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(debugfs_rename); diff --git a/fs/direct-io.c b/fs/direct-io.c index dbc1a1f080ce..ec2fb6fe6d37 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -679,6 +679,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, unsigned long fs_count; /* Number of filesystem-sized blocks */ int create; unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; + loff_t i_size; /* * If there was a memory error and we've overwritten all the @@ -708,8 +709,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, */ create = dio->op == REQ_OP_WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (fs_startblk <= ((i_size_read(dio->inode) - 1) >> - i_blkbits)) + i_size = i_size_read(dio->inode); + if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits) create = 0; } diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 82377017130f..d31b6c72b476 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -21,8 +21,13 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); + /* + * We must skip inodes in unusual state. We may also skip + * inodes without pages but we deliberately won't in case + * we need to reschedule to avoid softlockups. + */ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (inode->i_mapping->nrpages == 0)) { + (inode->i_mapping->nrpages == 0 && !need_resched())) { spin_unlock(&inode->i_lock); continue; } @@ -30,6 +35,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); + cond_resched(); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 712f00995390..5508baa11bb6 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -116,16 +116,8 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } - ret = file_write_and_wait_range(file, start, end); - if (ret) - return ret; - if (!journal) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL - }; - - ret = ext4_write_inode(inode, &wbc); + ret = __generic_file_fsync(file, start, end, datasync); if (!ret) ret = ext4_sync_parent(inode); if (test_opt(inode->i_sb, BARRIER)) @@ -133,6 +125,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } + ret = file_write_and_wait_range(file, start, end); + if (ret) + return ret; /* * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b40168fcc94a..36855c1f8daf 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -331,11 +331,22 @@ struct inode_switch_wbs_context { struct work_struct work; }; +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) +{ + down_write(&bdi->wb_switch_rwsem); +} + +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) +{ + up_write(&bdi->wb_switch_rwsem); +} + static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = container_of(work, struct inode_switch_wbs_context, work); struct inode *inode = isw->inode; + struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; @@ -344,6 +355,12 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) bool switched = false; /* + * If @inode switches cgwb membership while sync_inodes_sb() is + * being issued, sync_inodes_sb() might miss it. Synchronize. + */ + down_read(&bdi->wb_switch_rwsem); + + /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions * between unlocked_inode_to_wb_begin/end() are guaranteed to be @@ -428,6 +445,8 @@ skip_switch: spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); + up_read(&bdi->wb_switch_rwsem); + if (switched) { wb_wakeup(new_wb); wb_put(old_wb); @@ -468,9 +487,18 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (inode->i_state & I_WB_SWITCH) return; + /* + * Avoid starting new switches while sync_inodes_sb() is in + * progress. Otherwise, if the down_write protected issue path + * blocks heavily, we might end up starting a large number of + * switches which will block on the rwsem. + */ + if (!down_read_trylock(&bdi->wb_switch_rwsem)) + return; + isw = kzalloc(sizeof(*isw), GFP_ATOMIC); if (!isw) - return; + goto out_unlock; /* find and pin the new wb */ rcu_read_lock(); @@ -504,12 +532,14 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); - return; + goto out_unlock; out_free: if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); +out_unlock: + up_read(&bdi->wb_switch_rwsem); } /** @@ -887,6 +917,9 @@ fs_initcall(cgroup_writeback_init); #else /* CONFIG_CGROUP_WRITEBACK */ +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } + static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) @@ -2413,8 +2446,11 @@ void sync_inodes_sb(struct super_block *sb) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); + /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ + bdi_down_write_wb_switch_rwsem(bdi); bdi_split_work_to_wbs(bdi, &work, false); wb_wait_for_completion(bdi, &done); + bdi_up_write_wb_switch_rwsem(bdi); wait_sb_inodes(sb); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a5e516a40e7a..809c0f2f9942 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1742,7 +1742,6 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, req->in.h.nodeid = outarg->nodeid; req->in.numargs = 2; req->in.argpages = 1; - req->page_descs[0].offset = offset; req->end = fuse_retrieve_end; index = outarg->offset >> PAGE_SHIFT; @@ -1757,6 +1756,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, this_num = min_t(unsigned, num, PAGE_SIZE - offset); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].offset = offset; req->page_descs[req->num_pages].length = this_num; req->num_pages++; @@ -2077,8 +2077,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); + pipe_lock(pipe); for (idx = 0; idx < nbuf; idx++) pipe_buf_release(pipe, &bufs[idx]); + pipe_unlock(pipe); out: kvfree(bufs); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ffaffe18352a..a59c16bd90ac 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1782,7 +1782,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, spin_unlock(&fc->lock); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK_TEMP); + dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 76baaa6be393..c2d4099429be 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -628,6 +628,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); + fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; } EXPORT_SYMBOL_GPL(fuse_conn_init); @@ -1162,7 +1163,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = max_t(unsigned, 4096, d.max_read); - fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; /* Used by get_root_inode() */ sb->s_fs_info = fc; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index f15b4c57c4bd..78510ab91835 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,7 +28,6 @@ #include "util.h" #include "trans.h" #include "dir.h" -#include "lops.h" struct workqueue_struct *gfs2_freeze_wq; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 5bfaf381921a..b8830fda51e8 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -733,7 +733,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags); log_flush_wait(sdp); } @@ -810,7 +810,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 94dcab655bc0..2295042bc625 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -17,9 +17,7 @@ #include <linux/bio.h> #include <linux/fs.h> #include <linux/list_sort.h> -#include <linux/blkdev.h> -#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -195,6 +193,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, /** * gfs2_end_log_write - end of i/o to the log * @bio: The bio + * @error: Status of i/o request * * Each bio_vec contains either data from the pagecache or data * relating to the log itself. Here we iterate over the bio_vec @@ -231,19 +230,20 @@ static void gfs2_end_log_write(struct bio *bio) /** * gfs2_log_submit_bio - Submit any pending log bio * @biop: Address of the bio pointer - * @opf: REQ_OP | op_flags + * @op: REQ_OP + * @op_flags: req_flag_bits * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. */ -void gfs2_log_submit_bio(struct bio **biop, int opf) +void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags) { struct bio *bio = *biop; if (bio) { struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - bio->bi_opf = opf; + bio_set_op_attrs(bio, op, op_flags); submit_bio(bio); *biop = NULL; } @@ -304,7 +304,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk && !flush) return bio; - gfs2_log_submit_bio(biop, op); + gfs2_log_submit_bio(biop, op, 0); } *biop = gfs2_log_alloc_bio(sdp, blkno, end_io); @@ -375,184 +375,6 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) gfs2_log_bmap(sdp)); } -/** - * gfs2_end_log_read - end I/O callback for reads from the log - * @bio: The bio - * - * Simply unlock the pages in the bio. The main thread will wait on them and - * process them in order as necessary. - */ - -static void gfs2_end_log_read(struct bio *bio) -{ - struct page *page; - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - page = bvec->bv_page; - if (bio->bi_status) { - int err = blk_status_to_errno(bio->bi_status); - - SetPageError(page); - mapping_set_error(page->mapping, err); - } - unlock_page(page); - } - - bio_put(bio); -} - -/** - * gfs2_jhead_pg_srch - Look for the journal head in a given page. - * @jd: The journal descriptor - * @page: The page to look in - * - * Returns: 1 if found, 0 otherwise. - */ - -static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head, - struct page *page) -{ - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - struct gfs2_log_header_host uninitialized_var(lh); - void *kaddr = kmap_atomic(page); - unsigned int offset; - bool ret = false; - - for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { - if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { - if (lh.lh_sequence > head->lh_sequence) - *head = lh; - else { - ret = true; - break; - } - } - } - kunmap_atomic(kaddr); - return ret; -} - -/** - * gfs2_jhead_process_page - Search/cleanup a page - * @jd: The journal descriptor - * @index: Index of the page to look into - * @done: If set, perform only cleanup, else search and set if found. - * - * Find the page with 'index' in the journal's mapping. Search the page for - * the journal head if requested (cleanup == false). Release refs on the - * page so the page cache can reclaim it (put_page() twice). We grabbed a - * reference on this page two times, first when we did a find_or_create_page() - * to obtain the page to add it to the bio and second when we do a - * find_get_page() here to get the page to wait on while I/O on it is being - * completed. - * This function is also used to free up a page we might've grabbed but not - * used. Maybe we added it to a bio, but not submitted it for I/O. Or we - * submitted the I/O, but we already found the jhead so we only need to drop - * our references to the page. - */ - -static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, - struct gfs2_log_header_host *head, - bool *done) -{ - struct page *page; - - page = find_get_page(jd->jd_inode->i_mapping, index); - wait_on_page_locked(page); - - if (PageError(page)) - *done = true; - - if (!*done) - *done = gfs2_jhead_pg_srch(jd, head, page); - - put_page(page); /* Once for find_get_page */ - put_page(page); /* Once more for find_or_create_page */ -} - -/** - * gfs2_find_jhead - find the head of a log - * @jd: The journal descriptor - * @head: The log descriptor for the head of the log is returned here - * - * Do a search of a journal by reading it in large chunks using bios and find - * the valid log entry with the highest sequence number. (i.e. the log head) - * - * Returns: 0 on success, errno otherwise - */ - -int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) -{ - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - struct address_space *mapping = jd->jd_inode->i_mapping; - struct gfs2_journal_extent *je; - u32 block, read_idx = 0, submit_idx = 0, index = 0; - int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; - int blocks_per_page = 1 << shift, sz, ret = 0; - struct bio *bio = NULL; - struct page *page; - bool done = false; - errseq_t since; - - memset(head, 0, sizeof(*head)); - if (list_empty(&jd->extent_list)) - gfs2_map_journal_extents(sdp, jd); - - since = filemap_sample_wb_err(mapping); - list_for_each_entry(je, &jd->extent_list, list) { - for (block = 0; block < je->blocks; block += blocks_per_page) { - index = (je->lblock + block) >> shift; - - page = find_or_create_page(mapping, index, GFP_NOFS); - if (!page) { - ret = -ENOMEM; - done = true; - goto out; - } - - if (bio) { - sz = bio_add_page(bio, page, PAGE_SIZE, 0); - if (sz == PAGE_SIZE) - goto page_added; - submit_idx = index; - submit_bio(bio); - bio = NULL; - } - - bio = gfs2_log_alloc_bio(sdp, - je->dblock + (index << shift), - gfs2_end_log_read); - bio->bi_opf = REQ_OP_READ; - sz = bio_add_page(bio, page, PAGE_SIZE, 0); - gfs2_assert_warn(sdp, sz == PAGE_SIZE); - -page_added: - if (submit_idx <= read_idx + BIO_MAX_PAGES) { - /* Keep at least one bio in flight */ - continue; - } - - gfs2_jhead_process_page(jd, read_idx++, head, &done); - if (done) - goto out; /* found */ - } - } - -out: - if (bio) - submit_bio(bio); - while (read_idx <= index) - gfs2_jhead_process_page(jd, read_idx++, head, &done); - - if (!ret) - ret = filemap_check_wb_err(mapping, since); - - return ret; -} - static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, u32 ld_length, u32 ld_data1) { diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 331160fc568b..711c4d89c063 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -30,10 +30,8 @@ extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp); extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, unsigned size, unsigned offset, u64 blkno); extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); -extern void gfs2_log_submit_bio(struct bio **biop, int opf); +extern void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); -extern int gfs2_find_jhead(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1179763f6370..b041cb8ae383 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -41,7 +41,6 @@ #include "dir.h" #include "meta_io.h" #include "trace_gfs2.h" -#include "lops.h" #define DO 0 #define UNDO 1 diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 7389e445a7a7..2dac43065382 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -182,6 +182,129 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, } /** + * find_good_lh - find a good log header + * @jd: the journal + * @blk: the segment to start searching from + * @lh: the log header to fill in + * @forward: if true search forward in the log, else search backward + * + * Call get_log_header() to get a log header for a segment, but if the + * segment is bad, either scan forward or backward until we find a good one. + * + * Returns: errno + */ + +static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk, + struct gfs2_log_header_host *head) +{ + unsigned int orig_blk = *blk; + int error; + + for (;;) { + error = get_log_header(jd, *blk, head); + if (error <= 0) + return error; + + if (++*blk == jd->jd_blocks) + *blk = 0; + + if (*blk == orig_blk) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + return -EIO; + } + } +} + +/** + * jhead_scan - make sure we've found the head of the log + * @jd: the journal + * @head: this is filled in with the log descriptor of the head + * + * At this point, seg and lh should be either the head of the log or just + * before. Scan forward until we find the head. + * + * Returns: errno + */ + +static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +{ + unsigned int blk = head->lh_blkno; + struct gfs2_log_header_host lh; + int error; + + for (;;) { + if (++blk == jd->jd_blocks) + blk = 0; + + error = get_log_header(jd, blk, &lh); + if (error < 0) + return error; + if (error == 1) + continue; + + if (lh.lh_sequence == head->lh_sequence) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + return -EIO; + } + if (lh.lh_sequence < head->lh_sequence) + break; + + *head = lh; + } + + return 0; +} + +/** + * gfs2_find_jhead - find the head of a log + * @jd: the journal + * @head: the log descriptor for the head of the log is returned here + * + * Do a binary search of a journal and find the valid log entry with the + * highest sequence number. (i.e. the log head) + * + * Returns: errno + */ + +int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +{ + struct gfs2_log_header_host lh_1, lh_m; + u32 blk_1, blk_2, blk_m; + int error; + + blk_1 = 0; + blk_2 = jd->jd_blocks - 1; + + for (;;) { + blk_m = (blk_1 + blk_2) / 2; + + error = find_good_lh(jd, &blk_1, &lh_1); + if (error) + return error; + + error = find_good_lh(jd, &blk_m, &lh_m); + if (error) + return error; + + if (blk_1 == blk_m || blk_m == blk_2) + break; + + if (lh_1.lh_sequence <= lh_m.lh_sequence) + blk_1 = blk_m; + else + blk_2 = blk_m; + } + + error = jhead_scan(jd, &lh_1); + if (error) + return error; + + *head = lh_1; + + return error; +} + +/** * foreach_descriptor - go through the active part of the log * @jd: the journal * @start: the first log header in the active region diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 99575ab81202..11d81248be85 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -27,6 +27,8 @@ extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); extern void gfs2_revoke_clean(struct gfs2_jdesc *jd); +extern int gfs2_find_jhead(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head); extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); extern void gfs2_recover_func(struct work_struct *work); extern int __get_log_header(struct gfs2_sbd *sdp, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 831d7cb5a49c..17a8d3b43990 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1780,9 +1780,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, goto next_iter; } if (ret == -E2BIG) { - n += rbm->bii - initial_bii; rbm->bii = 0; rbm->offset = 0; + n += (rbm->bii - initial_bii); goto res_covered_end_of_rgrp; } return ret; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index d4b11c903971..ca71163ff7cf 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -45,7 +45,6 @@ #include "util.h" #include "sys.h" #include "xattr.h" -#include "lops.h" #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 32920a10100e..a7fa037b876b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -859,6 +859,18 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, rc = migrate_huge_page_move_mapping(mapping, newpage, page); if (rc != MIGRATEPAGE_SUCCESS) return rc; + + /* + * page_private is subpool pointer in hugetlb pages. Transfer to + * new page. PagePrivate is not associated with page_private for + * hugetlb pages and can not be set here as only page_huge_active + * pages can be migrated. + */ + if (page_private(page)) { + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + } + if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); else diff --git a/fs/inode.c b/fs/inode.c index 0cd47fe0dbe5..73432e64f874 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -730,11 +730,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_REMOVED; } - /* - * Recently referenced inodes and inodes with many attached pages - * get one more pass. - */ - if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) { + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; diff --git a/fs/iomap.c b/fs/iomap.c index a3088fae567b..897c60215dd1 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -116,6 +116,12 @@ iomap_page_create(struct inode *inode, struct page *page) atomic_set(&iop->read_count, 0); atomic_set(&iop->write_count, 0); bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); + + /* + * migrate_page_move_mapping() assumes that pages with private data have + * their count elevated by 1. + */ + get_page(page); set_page_private(page, (unsigned long)iop); SetPagePrivate(page); return iop; @@ -132,6 +138,7 @@ iomap_page_release(struct page *page) WARN_ON_ONCE(atomic_read(&iop->write_count)); ClearPagePrivate(page); set_page_private(page, 0); + put_page(page); kfree(iop); } @@ -569,8 +576,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, if (page_has_private(page)) { ClearPagePrivate(page); + get_page(newpage); set_page_private(newpage, page_private(page)); set_page_private(page, 0); + put_page(page); SetPagePrivate(newpage); } @@ -1804,6 +1813,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos, start = pos; loff_t end = iocb->ki_pos + count - 1, ret = 0; unsigned int flags = IOMAP_DIRECT; + bool wait_for_completion = is_sync_kiocb(iocb); struct blk_plug plug; struct iomap_dio *dio; @@ -1823,7 +1833,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->end_io = end_io; dio->error = 0; dio->flags = 0; - dio->wait_for_completion = is_sync_kiocb(iocb); dio->submit.iter = iter; dio->submit.waiter = current; @@ -1878,7 +1887,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio_warn_stale_pagecache(iocb->ki_filp); ret = 0; - if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion && + if (iov_iter_rw(iter) == WRITE && !wait_for_completion && !inode->i_sb->s_dio_done_wq) { ret = sb_init_dio_done_wq(inode->i_sb); if (ret < 0) @@ -1894,7 +1903,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret <= 0) { /* magic error code to fall back to buffered I/O */ if (ret == -ENOTBLK) { - dio->wait_for_completion = true; + wait_for_completion = true; ret = 0; } break; @@ -1916,8 +1925,24 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (dio->flags & IOMAP_DIO_WRITE_FUA) dio->flags &= ~IOMAP_DIO_NEED_SYNC; + /* + * We are about to drop our additional submission reference, which + * might be the last reference to the dio. There are three three + * different ways we can progress here: + * + * (a) If this is the last reference we will always complete and free + * the dio ourselves. + * (b) If this is not the last reference, and we serve an asynchronous + * iocb, we must never touch the dio after the decrement, the + * I/O completion handler will complete and free it. + * (c) If this is not the last reference, but we serve a synchronous + * iocb, the I/O completion handler will wake us up on the drop + * of the final reference, and we will complete and free it here + * after we got woken by the I/O completion handler. + */ + dio->wait_for_completion = wait_for_completion; if (!atomic_dec_and_test(&dio->ref)) { - if (!dio->wait_for_completion) + if (!wait_for_completion) return -EIOCBQUEUED; for (;;) { @@ -1934,9 +1959,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, __set_current_state(TASK_RUNNING); } - ret = iomap_dio_complete(dio); - - return ret; + return iomap_dio_complete(dio); out_free_dio: kfree(dio); diff --git a/fs/namespace.c b/fs/namespace.c index c373c769e0ce..c4e83d94840c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2698,7 +2698,6 @@ static long exact_copy_from_user(void *to, const void __user * from, if (!access_ok(from, n)) return n; - current->kernel_uaccess_faults_ok++; while (n) { if (__get_user(c, f)) { memset(t, 0, n); @@ -2708,7 +2707,6 @@ static long exact_copy_from_user(void *to, const void __user * from, f++; n--; } - current->kernel_uaccess_faults_ok--; return n; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 46d691ba04bc..45b2322e092d 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,15 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { - ssize_t ret; - if (file_inode(file_in) == file_inode(file_out)) return -EINVAL; -retry: - ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); - if (ret == -EAGAIN) - goto retry; - return ret; + return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 3f23b6840547..bf34ddaa2ad7 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -44,6 +44,7 @@ #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> +#include <keys/request_key_auth-type.h> #include <linux/module.h> #include "internal.h" @@ -59,7 +60,7 @@ static struct key_type key_type_id_resolver_legacy; struct idmap_legacy_upcalldata { struct rpc_pipe_msg pipe_msg; struct idmap_msg idmap_msg; - struct key_construction *key_cons; + struct key *authkey; struct idmap *idmap; }; @@ -384,7 +385,7 @@ static const match_table_t nfs_idmap_tokens = { { Opt_find_err, NULL } }; -static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *); +static int nfs_idmap_legacy_upcall(struct key *, void *); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_release_pipe(struct inode *); @@ -549,11 +550,12 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, static void nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) { - struct key_construction *cons = idmap->idmap_upcall_data->key_cons; + struct key *authkey = idmap->idmap_upcall_data->authkey; kfree(idmap->idmap_upcall_data); idmap->idmap_upcall_data = NULL; - complete_request_key(cons, ret); + complete_request_key(authkey, ret); + key_put(authkey); } static void @@ -563,15 +565,14 @@ nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) nfs_idmap_complete_pipe_upcall_locked(idmap, ret); } -static int nfs_idmap_legacy_upcall(struct key_construction *cons, - const char *op, - void *aux) +static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) { struct idmap_legacy_upcalldata *data; + struct request_key_auth *rka = get_request_key_auth(authkey); struct rpc_pipe_msg *msg; struct idmap_msg *im; struct idmap *idmap = (struct idmap *)aux; - struct key *key = cons->key; + struct key *key = rka->target_key; int ret = -ENOKEY; if (!aux) @@ -586,7 +587,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, msg = &data->pipe_msg; im = &data->idmap_msg; data->idmap = idmap; - data->key_cons = cons; + data->authkey = key_get(authkey); ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); if (ret < 0) @@ -604,7 +605,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, out2: kfree(data); out1: - complete_request_key(cons, ret); + complete_request_key(authkey, ret); return ret; } @@ -651,9 +652,10 @@ out: static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { + struct request_key_auth *rka; struct rpc_inode *rpci = RPC_I(file_inode(filp)); struct idmap *idmap = (struct idmap *)rpci->private; - struct key_construction *cons; + struct key *authkey; struct idmap_msg im; size_t namelen_in; int ret = -ENOKEY; @@ -665,7 +667,8 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (idmap->idmap_upcall_data == NULL) goto out_noupcall; - cons = idmap->idmap_upcall_data->key_cons; + authkey = idmap->idmap_upcall_data->authkey; + rka = get_request_key_auth(authkey); if (mlen != sizeof(im)) { ret = -ENOSPC; @@ -690,9 +693,9 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ret = nfs_idmap_read_and_verify_message(&im, &idmap->idmap_upcall_data->idmap_msg, - cons->key, cons->authkey); + rka->target_key, authkey); if (ret >= 0) { - key_set_timeout(cons->key, nfs_idmap_cache_timeout); + key_set_timeout(rka->target_key, nfs_idmap_cache_timeout); ret = mlen; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 22ce3c8a2f46..0570391eaa16 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1895,6 +1895,11 @@ static int nfs_parse_devname(const char *dev_name, size_t len; char *end; + if (unlikely(!dev_name || !*dev_name)) { + dfprintk(MOUNT, "NFS: device name not specified\n"); + return -EINVAL; + } + /* Is the host name protected with square brakcets? */ if (*dev_name == '[') { end = strchr(++dev_name, ']'); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5a0bbf917a32..d09c9f878141 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -238,9 +238,9 @@ out: } /* A writeback failed: mark the page as bad, and invalidate the page cache */ -static void nfs_set_pageerror(struct page *page) +static void nfs_set_pageerror(struct address_space *mapping) { - nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); + nfs_zap_mapping(mapping->host, mapping); } /* @@ -621,11 +621,12 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, nfs_set_page_writeback(page); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); - ret = 0; + ret = req->wb_context->error; /* If there is a fatal error that covers this write, just exit */ - if (nfs_error_is_fatal_on_server(req->wb_context->error)) + if (nfs_error_is_fatal_on_server(ret)) goto out_launder; + ret = 0; if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* @@ -635,9 +636,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, nfs_context_set_write_error(req->wb_context, ret); if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - } + } else + ret = -EAGAIN; nfs_redirty_request(req); - ret = -EAGAIN; } else nfs_add_stats(page_file_mapping(page)->host, NFSIOS_WRITEPAGES, 1); @@ -993,7 +994,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) nfs_list_remove_request(req); if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes < bytes)) { - nfs_set_pageerror(req->wb_page); + nfs_set_pageerror(page_file_mapping(req->wb_page)); nfs_context_set_write_error(req->wb_context, hdr->error); goto remove_req; } @@ -1347,7 +1348,8 @@ int nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = page_file_mapping(page)->host; + struct address_space *mapping = page_file_mapping(page); + struct inode *inode = mapping->host; int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); @@ -1365,7 +1367,7 @@ int nfs_updatepage(struct file *file, struct page *page, status = nfs_writepage_setup(ctx, page, offset, count); if (status < 0) - nfs_set_pageerror(page); + nfs_set_pageerror(mapping); else __set_page_dirty_nobuffers(page); out: diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b33f9785b756..72a7681f4046 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1239,8 +1239,8 @@ static __net_init int nfsd_init_net(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; - nn->nfsd4_lease = 45; /* default lease time */ - nn->nfsd4_grace = 45; + nn->nfsd4_lease = 90; /* default lease time */ + nn->nfsd4_grace = 90; nn->somebody_reclaimed = false; nn->clverifier_counter = prandom_u32(); nn->clientid_counter = prandom_u32(); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9824e32b2f23..7dc98e14655d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -557,9 +557,11 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, loff_t cloned; cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0); + if (cloned < 0) + return nfserrno(cloned); if (count && cloned != count) - cloned = -EINVAL; - return nfserrno(cloned < 0 ? cloned : 0); + return nfserrno(-EINVAL); + return 0; } ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 105576daca4a..798f1253141a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -724,8 +724,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, return -EBADF; /* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */ - if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) - return -EINVAL; + if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) { + ret = -EINVAL; + goto fput_and_out; + } /* verify that this is indeed an inotify instance */ if (unlikely(f.file->f_op != &inotify_fops)) { diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index a5a2fe76568f..b094d3d79354 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -398,8 +398,6 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter loff_t pos = iocb->ki_pos; ssize_t rc = 0; - BUG_ON(iocb->private); - gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n"); orangefs_stats.reads++; @@ -416,8 +414,6 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite loff_t pos; ssize_t rc; - BUG_ON(iocb->private); - gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n"); inode_lock(file->f_mapping->host); diff --git a/fs/proc/base.c b/fs/proc/base.c index 633a63462573..f5ed9512d193 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1086,10 +1086,6 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) task_lock(p); if (!p->vfork_done && process_shares_mm(p, mm)) { - pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n", - task_pid_nr(p), p->comm, - p->signal->oom_score_adj, oom_adj, - task_pid_nr(task), task->comm); p->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) p->signal->oom_score_adj_min = (short)oom_adj; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8ae109429a88..e39bac94dead 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -256,7 +256,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &proc_misc_dentry_ops); + d_set_d_op(dentry, de->proc_dops); return d_splice_alias(inode, dentry); } read_unlock(&proc_subdir_lock); @@ -429,6 +429,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); + ent->proc_dops = &proc_misc_dentry_ops; + out: return ent; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5185d7f6a51e..95b14196f284 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -44,6 +44,7 @@ struct proc_dir_entry { struct completion *pde_unload_completion; const struct inode_operations *proc_iops; const struct file_operations *proc_fops; + const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; int (*single_show)(struct seq_file *, void *); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index d5e0fcb3439e..a7b12435519e 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -38,6 +38,22 @@ static struct net *get_proc_net(const struct inode *inode) return maybe_get_net(PDE_NET(PDE(inode))); } +static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + return 0; +} + +static const struct dentry_operations proc_net_dentry_ops = { + .d_revalidate = proc_net_d_revalidate, + .d_delete = always_delete_dentry, +}; + +static void pde_force_lookup(struct proc_dir_entry *pde) +{ + /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ + pde->proc_dops = &proc_net_dentry_ops; +} + static int seq_open_net(struct inode *inode, struct file *file) { unsigned int state_size = PDE(inode)->state_size; @@ -90,6 +106,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_seq_fops; p->seq_ops = ops; p->state_size = state_size; @@ -133,6 +150,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_seq_fops; p->seq_ops = ops; p->state_size = state_size; @@ -181,6 +199,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_single_fops; p->single_show = show; return proc_register(parent, p); @@ -223,6 +242,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_single_fops; p->single_show = show; p->write = write; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f0ec9edab2f3..85b0ef890b28 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -423,7 +423,7 @@ struct mem_size_stats { }; static void smaps_account(struct mem_size_stats *mss, struct page *page, - bool compound, bool young, bool dirty) + bool compound, bool young, bool dirty, bool locked) { int i, nr = compound ? 1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; @@ -450,24 +450,31 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, else mss->private_clean += size; mss->pss += (u64)size << PSS_SHIFT; + if (locked) + mss->pss_locked += (u64)size << PSS_SHIFT; return; } for (i = 0; i < nr; i++, page++) { int mapcount = page_mapcount(page); + unsigned long pss = (PAGE_SIZE << PSS_SHIFT); if (mapcount >= 2) { if (dirty || PageDirty(page)) mss->shared_dirty += PAGE_SIZE; else mss->shared_clean += PAGE_SIZE; - mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; + mss->pss += pss / mapcount; + if (locked) + mss->pss_locked += pss / mapcount; } else { if (dirty || PageDirty(page)) mss->private_dirty += PAGE_SIZE; else mss->private_clean += PAGE_SIZE; - mss->pss += PAGE_SIZE << PSS_SHIFT; + mss->pss += pss; + if (locked) + mss->pss_locked += pss; } } } @@ -490,6 +497,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; + bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; if (pte_present(*pte)) { @@ -532,7 +540,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte)); + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -541,6 +549,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; + bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page; /* FOLL_DUMP will return -EFAULT on huge zero page */ @@ -555,7 +564,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, /* pass */; else VM_BUG_ON_PAGE(1, page); - smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -737,11 +746,8 @@ static void smap_gather_stats(struct vm_area_struct *vma, } } #endif - /* mmap_sem is held in m_start */ walk_page_vma(vma, &smaps_walk); - if (vma->vm_flags & VM_LOCKED) - mss->pss_locked += mss->pss; } #define SEQ_PUT_DEC(str, val) \ diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 96f7d32cd184..898c8321b343 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -128,7 +128,6 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id, struct pstore_record *record) { struct persistent_ram_zone *prz; - bool update = (record->type == PSTORE_TYPE_DMESG); /* Give up if we never existed or have hit the end. */ if (!przs) @@ -139,7 +138,7 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id, return NULL; /* Update old/shadowed buffer. */ - if (update) + if (prz->type == PSTORE_TYPE_DMESG) persistent_ram_save_old(prz); if (!persistent_ram_old_size(prz)) @@ -711,18 +710,15 @@ static int ramoops_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct ramoops_platform_data *pdata = dev->platform_data; + struct ramoops_platform_data pdata_local; struct ramoops_context *cxt = &oops_cxt; size_t dump_mem_sz; phys_addr_t paddr; int err = -EINVAL; if (dev_of_node(dev) && !pdata) { - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) { - pr_err("cannot allocate platform data buffer\n"); - err = -ENOMEM; - goto fail_out; - } + pdata = &pdata_local; + memset(pdata, 0, sizeof(*pdata)); err = ramoops_parse_dt(pdev, pdata); if (err < 0) diff --git a/fs/read_write.c b/fs/read_write.c index ff3c5e6f87cf..30df848b7451 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -426,7 +426,7 @@ ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) ssize_t result; old_fs = get_fs(); - set_fs(get_ds()); + set_fs(KERNEL_DS); /* The cast to a user pointer is valid due to the set_fs() */ result = vfs_read(file, (void __user *)buf, count, pos); set_fs(old_fs); @@ -499,7 +499,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t return -EINVAL; old_fs = get_fs(); - set_fs(get_ds()); + set_fs(KERNEL_DS); p = (__force const char __user *)buf; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; @@ -521,7 +521,7 @@ ssize_t kernel_write(struct file *file, const void *buf, size_t count, ssize_t res; old_fs = get_fs(); - set_fs(get_ds()); + set_fs(KERNEL_DS); /* The cast to a user pointer is valid due to the set_fs() */ res = vfs_write(file, (__force const char __user *)buf, count, pos); set_fs(old_fs); diff --git a/fs/splice.c b/fs/splice.c index de2ede048473..ec8b54b5c0d2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -357,7 +357,7 @@ static ssize_t kernel_readv(struct file *file, const struct kvec *vec, ssize_t res; old_fs = get_fs(); - set_fs(get_ds()); + set_fs(KERNEL_DS); /* The cast to a user pointer is valid due to the set_fs() */ res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0); set_fs(old_fs); diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1c8eecfe52b8..6acf1bfa0bfe 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -768,18 +768,23 @@ xrep_findroot_block( if (!uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) goto out; + /* + * Read verifiers can reference b_ops, so we set the pointer + * here. If the verifier fails we'll reset the buffer state + * to what it was before we touched the buffer. + */ + bp->b_ops = fab->buf_ops; fab->buf_ops->verify_read(bp); if (bp->b_error) { + bp->b_ops = NULL; bp->b_error = 0; goto out; } /* * Some read verifiers will (re)set b_ops, so we must be - * careful not to blow away any such assignment. + * careful not to change b_ops after running the verifier. */ - if (!bp->b_ops) - bp->b_ops = fab->buf_ops; } /* diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 338b9d9984e0..d9048bcea49c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -449,6 +449,7 @@ xfs_map_blocks( } wpc->imap = imap; + xfs_trim_extent_eof(&wpc->imap, ip); trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); return 0; allocate_blocks: @@ -459,6 +460,7 @@ allocate_blocks: ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || imap.br_startoff + imap.br_blockcount <= cow_fsb); wpc->imap = imap; + xfs_trim_extent_eof(&wpc->imap, ip); trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); return 0; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index eedc5e0156ff..4f5f2ff3f70f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -776,10 +776,26 @@ _xfs_buf_read( } /* + * Set buffer ops on an unchecked buffer and validate it, if possible. + * * If the caller passed in an ops structure and the buffer doesn't have ops * assigned, set the ops and use them to verify the contents. If the contents * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no * recorded errors and is already in XBF_DONE state. + * + * Under normal operations, every in-core buffer must have buffer ops assigned + * to them when the buffer is read in from disk so that we can validate the + * metadata. + * + * However, there are two scenarios where one can encounter in-core buffers + * that don't have buffer ops. The first is during log recovery of buffers on + * a V4 filesystem, though these buffers are purged at the end of recovery. + * + * The other is online repair, which tries to match arbitrary metadata blocks + * with btree types in order to find the root. If online repair doesn't match + * the buffer with /any/ btree type, the buffer remains in memory in DONE state + * with no ops, and a subsequent read_buf call from elsewhere will not set the + * ops. This function helps us fix this situation. */ int xfs_buf_ensure_ops( @@ -1536,8 +1552,7 @@ __xfs_buf_submit( xfs_buf_ioerror(bp, -EIO); bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); - if (bp->b_flags & XBF_ASYNC) - xfs_buf_ioend(bp); + xfs_buf_ioend(bp); return -EIO; } |