diff options
author | Ingo Molnar <mingo@kernel.org> | 2019-11-25 17:43:15 +0300 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2019-11-25 17:43:15 +0300 |
commit | 83bae01182ea755280adc1c3a24032d63a614ede (patch) | |
tree | f05249057a392e750c0622bbdd3620e19aafd031 /fs | |
parent | cf25e24db61cc9df42c47485a2ec2bff4e9a3692 (diff) | |
parent | 7b8474466ed97be458c825f34a85f2c2b84c3f95 (diff) | |
download | linux-83bae01182ea755280adc1c3a24032d63a614ede.tar.xz |
Merge branch 'timers/urgent' into timers/core, to pick up fix
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/afs/dir.c | 7 | ||||
-rw-r--r-- | fs/aio.c | 10 | ||||
-rw-r--r-- | fs/autofs/expire.c | 5 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 30 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 6 | ||||
-rw-r--r-- | fs/btrfs/space-info.c | 21 | ||||
-rw-r--r-- | fs/btrfs/tree-checker.c | 8 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 1 | ||||
-rw-r--r-- | fs/ceph/caps.c | 10 | ||||
-rw-r--r-- | fs/ceph/dir.c | 15 | ||||
-rw-r--r-- | fs/ceph/file.c | 44 | ||||
-rw-r--r-- | fs/ceph/inode.c | 1 | ||||
-rw-r--r-- | fs/ceph/super.c | 11 | ||||
-rw-r--r-- | fs/cifs/smb2pdu.h | 1 | ||||
-rw-r--r-- | fs/configfs/symlink.c | 2 | ||||
-rw-r--r-- | fs/ecryptfs/inode.c | 84 | ||||
-rw-r--r-- | fs/exportfs/expfs.c | 31 | ||||
-rw-r--r-- | fs/fs-writeback.c | 9 | ||||
-rw-r--r-- | fs/io_uring.c | 32 | ||||
-rw-r--r-- | fs/namespace.c | 15 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 134 |
21 files changed, 325 insertions, 152 deletions
diff --git a/fs/afs/dir.c b/fs/afs/dir.c index cc12772d0a4d..497f979018c2 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -803,7 +803,12 @@ success: continue; if (cookie->inodes[i]) { - afs_vnode_commit_status(&fc, AFS_FS_I(cookie->inodes[i]), + struct afs_vnode *iv = AFS_FS_I(cookie->inodes[i]); + + if (test_bit(AFS_VNODE_UNSET, &iv->flags)) + continue; + + afs_vnode_commit_status(&fc, iv, scb->cb_break, NULL, scb); continue; } @@ -2179,7 +2179,7 @@ SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id, #ifdef CONFIG_COMPAT struct __compat_aio_sigset { - compat_sigset_t __user *sigmask; + compat_uptr_t sigmask; compat_size_t sigsetsize; }; @@ -2193,7 +2193,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, struct old_timespec32 __user *, timeout, const struct __compat_aio_sigset __user *, usig) { - struct __compat_aio_sigset ksig = { NULL, }; + struct __compat_aio_sigset ksig = { 0, }; struct timespec64 t; bool interrupted; int ret; @@ -2204,7 +2204,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; - ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize); + ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize); if (ret) return ret; @@ -2228,7 +2228,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, struct __kernel_timespec __user *, timeout, const struct __compat_aio_sigset __user *, usig) { - struct __compat_aio_sigset ksig = { NULL, }; + struct __compat_aio_sigset ksig = { 0, }; struct timespec64 t; bool interrupted; int ret; @@ -2239,7 +2239,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; - ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize); + ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize); if (ret) return ret; diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 2866fabf497f..91f5787dae7c 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -459,9 +459,10 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb, */ how &= ~AUTOFS_EXP_LEAVES; found = should_expire(expired, mnt, timeout, how); - if (!found || found != expired) - /* Something has changed, continue */ + if (found != expired) { // something has changed, continue + dput(found); goto next; + } if (expired != dentry) dput(dentry); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c3f386b7cc0b..015910079e73 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -474,6 +474,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) u64 start = async_chunk->start; u64 end = async_chunk->end; u64 actual_end; + u64 i_size; int ret = 0; struct page **pages = NULL; unsigned long nr_pages; @@ -488,7 +489,19 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, SZ_16K); - actual_end = min_t(u64, i_size_read(inode), end + 1); + /* + * We need to save i_size before now because it could change in between + * us evaluating the size and assigning it. This is because we lock and + * unlock the page in truncate and fallocate, and then modify the i_size + * later on. + * + * The barriers are to emulate READ_ONCE, remove that once i_size_read + * does that for us. + */ + barrier(); + i_size = i_size_read(inode); + barrier(); + actual_end = min_t(u64, i_size, end + 1); again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; @@ -9731,6 +9744,18 @@ out_fail: commit_transaction = true; } if (commit_transaction) { + /* + * We may have set commit_transaction when logging the new name + * in the destination root, in which case we left the source + * root context in the list of log contextes. So make sure we + * remove it to avoid invalid memory accesses, since the context + * was allocated in our stack frame. + */ + if (sync_log_root) { + mutex_lock(&root->log_mutex); + list_del_init(&ctx_root.list); + mutex_unlock(&root->log_mutex); + } ret = btrfs_commit_transaction(trans); } else { int ret2; @@ -9744,6 +9769,9 @@ out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); + ASSERT(list_empty(&ctx_root.list)); + ASSERT(list_empty(&ctx_dest.list)); + return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7c145a41decd..23272d9154f3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4195,9 +4195,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, u64 transid; int ret; - btrfs_warn(root->fs_info, - "START_SYNC ioctl is deprecated and will be removed in kernel 5.7"); - trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) @@ -4225,9 +4222,6 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, { u64 transid; - btrfs_warn(fs_info, - "WAIT_SYNC ioctl is deprecated and will be removed in kernel 5.7"); - if (argp) { if (copy_from_user(&transid, argp, sizeof(transid))) return -EFAULT; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 98dc092a905e..e8a4b0ebe97f 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -893,6 +893,15 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, while (ticket->bytes > 0 && ticket->error == 0) { ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); if (ret) { + /* + * Delete us from the list. After we unlock the space + * info, we don't want the async reclaim job to reserve + * space for this ticket. If that would happen, then the + * ticket's task would not known that space was reserved + * despite getting an error, resulting in a space leak + * (bytes_may_use counter of our space_info). + */ + list_del_init(&ticket->list); ticket->error = -EINTR; break; } @@ -945,12 +954,24 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); ret = ticket->error; if (ticket->bytes || ticket->error) { + /* + * Need to delete here for priority tickets. For regular tickets + * either the async reclaim job deletes the ticket from the list + * or we delete it ourselves at wait_reserve_ticket(). + */ list_del_init(&ticket->list); if (!ret) ret = -ENOSPC; } spin_unlock(&space_info->lock); ASSERT(list_empty(&ticket->list)); + /* + * Check that we can't have an error set if the reservation succeeded, + * as that would confuse tasks and lead them to error out without + * releasing reserved space (if an error happens the expectation is that + * space wasn't reserved at all). + */ + ASSERT(!(ticket->bytes == 0 && ticket->error)); return ret; } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43e488f5d063..076d5b8014fb 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -686,9 +686,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot, static int check_dev_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { - struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_dev_item *ditem; - u64 max_devid = max(BTRFS_MAX_DEVS(fs_info), BTRFS_MAX_DEVS_SYS_CHUNK); if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) { dev_item_err(leaf, slot, @@ -696,12 +694,6 @@ static int check_dev_item(struct extent_buffer *leaf, key->objectid, BTRFS_DEV_ITEMS_OBJECTID); return -EUCLEAN; } - if (key->offset > max_devid) { - dev_item_err(leaf, slot, - "invalid devid: has=%llu expect=[0, %llu]", - key->offset, max_devid); - return -EUCLEAN; - } ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); if (btrfs_device_id(leaf, ditem) != key->offset) { dev_item_err(leaf, slot, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bdfe4493e43a..e04409f85063 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4967,6 +4967,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; + devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); } else { btrfs_err(info, "invalid chunk type 0x%llx requested", type); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d3b9c9d5c1bd..f5a38910a82b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1058,6 +1058,11 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + /* remove from inode's cap rbtree, and clear auth cap */ + rb_erase(&cap->ci_node, &ci->i_caps); + if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; + /* remove from session list */ spin_lock(&session->s_cap_lock); if (session->s_cap_iterator == cap) { @@ -1091,11 +1096,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) spin_unlock(&session->s_cap_lock); - /* remove from inode list */ - rb_erase(&cap->ci_node, &ci->i_caps); - if (ci->i_auth_cap == cap) - ci->i_auth_cap = NULL; - if (removed) ceph_put_cap(mdsc, cap); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4ca0b8ff9a72..d17a789fd856 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1553,36 +1553,37 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) { int valid = 0; struct dentry *parent; - struct inode *dir; + struct inode *dir, *inode; if (flags & LOOKUP_RCU) { parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) return -ECHILD; + inode = d_inode_rcu(dentry); } else { parent = dget_parent(dentry); dir = d_inode(parent); + inode = d_inode(dentry); } dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, - dentry, d_inode(dentry), ceph_dentry(dentry)->offset); + dentry, inode, ceph_dentry(dentry)->offset); /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry, - dentry, d_inode(dentry)); + dentry, inode); valid = 1; - } else if (d_really_is_positive(dentry) && - ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) { + } else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { valid = 1; } else { valid = dentry_lease_is_valid(dentry, flags); if (valid == -ECHILD) return valid; if (valid || dir_lease_is_valid(dir, dentry)) { - if (d_really_is_positive(dentry)) - valid = ceph_is_any_caps(d_inode(dentry)); + if (inode) + valid = ceph_is_any_caps(inode); else valid = 1; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d277f71abe0b..8de633964dc3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -462,6 +462,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_security_init_secctx(dentry, mode, &as_ctx); if (err < 0) goto out_ctx; + } else if (!d_in_lookup(dentry)) { + /* If it's not being looked up, it's negative */ + return -ENOENT; } /* do the open */ @@ -750,6 +753,9 @@ static void ceph_aio_complete(struct inode *inode, if (!atomic_dec_and_test(&aio_req->pending_reqs)) return; + if (aio_req->iocb->ki_flags & IOCB_DIRECT) + inode_dio_end(inode); + ret = aio_req->error; if (!ret) ret = aio_req->total_len; @@ -1088,6 +1094,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, CEPH_CAP_FILE_RD); list_splice(&aio_req->osd_reqs, &osd_reqs); + inode_dio_begin(inode); while (!list_empty(&osd_reqs)) { req = list_first_entry(&osd_reqs, struct ceph_osd_request, @@ -1261,14 +1268,24 @@ again: dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_start_io_direct(inode); + else + ceph_start_io_read(inode); + if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); - if (ret < 0) + if (ret < 0) { + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_read(inode); return ret; + } if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_flags & IOCB_DIRECT) || @@ -1280,16 +1297,12 @@ again: if (ci->i_inline_version == CEPH_INLINE_NONE) { if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { - ceph_start_io_direct(inode); ret = ceph_direct_read_write(iocb, to, NULL, NULL); - ceph_end_io_direct(inode); if (ret >= 0 && ret < len) retry_op = CHECK_EOF; } else { - ceph_start_io_read(inode); ret = ceph_sync_read(iocb, to, &retry_op); - ceph_end_io_read(inode); } } else { retry_op = READ_INLINE; @@ -1300,11 +1313,10 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); ceph_add_rw_context(fi, &rw_ctx); - ceph_start_io_read(inode); ret = generic_file_read_iter(iocb, to); - ceph_end_io_read(inode); ceph_del_rw_context(fi, &rw_ctx); } + dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); if (pinned_page) { @@ -1312,6 +1324,12 @@ again: pinned_page = NULL; } ceph_put_cap_refs(ci, got); + + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_read(inode); + if (retry_op > HAVE_RETRIED && ret >= 0) { int statret; struct page *page = NULL; @@ -1956,10 +1974,18 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) return -EOPNOTSUPP; + /* + * Striped file layouts require that we copy partial objects, but the + * OSD copy-from operation only supports full-object copies. Limit + * this to non-striped file layouts for now. + */ if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || - (src_ci->i_layout.stripe_count != dst_ci->i_layout.stripe_count) || - (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) + (src_ci->i_layout.stripe_count != 1) || + (dst_ci->i_layout.stripe_count != 1) || + (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) { + dout("Invalid src/dst files layout\n"); return -EOPNOTSUPP; + } if (len < src_ci->i_layout.object_size) return -EOPNOTSUPP; /* no remote copy will be done */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 9f135624ae47..c07407586ce8 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1434,6 +1434,7 @@ retry_lookup: dout(" final dn %p\n", dn); } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || req->r_op == CEPH_MDS_OP_MKSNAP) && + test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { struct inode *dir = req->r_parent; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index edfd643a8205..b47f43fc2d68 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -268,6 +268,7 @@ static int parse_fsopt_token(char *c, void *private) } break; case Opt_fscache_uniq: +#ifdef CONFIG_CEPH_FSCACHE kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = kstrndup(argstr[0].from, argstr[0].to-argstr[0].from, @@ -276,7 +277,10 @@ static int parse_fsopt_token(char *c, void *private) return -ENOMEM; fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; break; - /* misc */ +#else + pr_err("fscache support is disabled\n"); + return -EINVAL; +#endif case Opt_wsize: if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE) return -EINVAL; @@ -353,10 +357,15 @@ static int parse_fsopt_token(char *c, void *private) fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; break; case Opt_fscache: +#ifdef CONFIG_CEPH_FSCACHE fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = NULL; break; +#else + pr_err("fscache support is disabled\n"); + return -EINVAL; +#endif case Opt_nofscache: fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; kfree(fsopt->fscache_uniq); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index ea735d59c36e..0abfde6d0b05 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -838,6 +838,7 @@ struct create_durable_handle_reconnect_v2 { struct create_context ccontext; __u8 Name[8]; struct durable_reconnect_context_v2 dcontext; + __u8 Pad[4]; } __packed; /* See MS-SMB2 2.2.13.2.5 */ diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index dc5dbf6a81d7..cb61467478ca 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -101,7 +101,7 @@ static int create_link(struct config_item *parent_item, } target_sd->s_links++; spin_unlock(&configfs_dirent_lock); - ret = configfs_get_target_path(item, item, body); + ret = configfs_get_target_path(parent_item, item, body); if (!ret) ret = configfs_create_link(target_sd, parent_item->ci_dentry, dentry, body); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 18426f4855f1..e23752d9a79f 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -128,13 +128,20 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, struct inode *inode) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); struct dentry *lower_dir_dentry; + struct inode *lower_dir_inode; int rc; - dget(lower_dentry); - lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL); + lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); + lower_dir_inode = d_inode(lower_dir_dentry); + inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT); + dget(lower_dentry); // don't even try to make the lower negative + if (lower_dentry->d_parent != lower_dir_dentry) + rc = -EINVAL; + else if (d_unhashed(lower_dentry)) + rc = -EINVAL; + else + rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL); if (rc) { printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); goto out_unlock; @@ -142,10 +149,11 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, fsstack_copy_attr_times(dir, lower_dir_inode); set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); inode->i_ctime = dir->i_ctime; - d_drop(dentry); out_unlock: - unlock_dir(lower_dir_dentry); dput(lower_dentry); + inode_unlock(lower_dir_inode); + if (!rc) + d_drop(dentry); return rc; } @@ -311,9 +319,9 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, struct dentry *lower_dentry) { - struct inode *inode, *lower_inode = d_inode(lower_dentry); + struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent); + struct inode *inode, *lower_inode; struct ecryptfs_dentry_info *dentry_info; - struct vfsmount *lower_mnt; int rc = 0; dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); @@ -322,16 +330,23 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, return ERR_PTR(-ENOMEM); } - lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); fsstack_copy_attr_atime(d_inode(dentry->d_parent), - d_inode(lower_dentry->d_parent)); + d_inode(path->dentry)); BUG_ON(!d_count(lower_dentry)); ecryptfs_set_dentry_private(dentry, dentry_info); - dentry_info->lower_path.mnt = lower_mnt; + dentry_info->lower_path.mnt = mntget(path->mnt); dentry_info->lower_path.dentry = lower_dentry; - if (d_really_is_negative(lower_dentry)) { + /* + * negative dentry can go positive under us here - its parent is not + * locked. That's OK and that could happen just as we return from + * ecryptfs_lookup() anyway. Just need to be careful and fetch + * ->d_inode only once - it's not stable here. + */ + lower_inode = READ_ONCE(lower_dentry->d_inode); + + if (!lower_inode) { /* We want to add because we couldn't find in lower */ d_add(dentry, NULL); return NULL; @@ -512,22 +527,30 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) { struct dentry *lower_dentry; struct dentry *lower_dir_dentry; + struct inode *lower_dir_inode; int rc; lower_dentry = ecryptfs_dentry_to_lower(dentry); - dget(dentry); - lower_dir_dentry = lock_parent(lower_dentry); - dget(lower_dentry); - rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry); - dput(lower_dentry); - if (!rc && d_really_is_positive(dentry)) + lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); + lower_dir_inode = d_inode(lower_dir_dentry); + + inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT); + dget(lower_dentry); // don't even try to make the lower negative + if (lower_dentry->d_parent != lower_dir_dentry) + rc = -EINVAL; + else if (d_unhashed(lower_dentry)) + rc = -EINVAL; + else + rc = vfs_rmdir(lower_dir_inode, lower_dentry); + if (!rc) { clear_nlink(d_inode(dentry)); - fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); - set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); - unlock_dir(lower_dir_dentry); + fsstack_copy_attr_times(dir, lower_dir_inode); + set_nlink(dir, lower_dir_inode->i_nlink); + } + dput(lower_dentry); + inode_unlock(lower_dir_inode); if (!rc) d_drop(dentry); - dput(dentry); return rc; } @@ -565,20 +588,22 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dentry; struct dentry *lower_old_dir_dentry; struct dentry *lower_new_dir_dentry; - struct dentry *trap = NULL; + struct dentry *trap; struct inode *target_inode; if (flags) return -EINVAL; + lower_old_dir_dentry = ecryptfs_dentry_to_lower(old_dentry->d_parent); + lower_new_dir_dentry = ecryptfs_dentry_to_lower(new_dentry->d_parent); + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); - dget(lower_old_dentry); - dget(lower_new_dentry); - lower_old_dir_dentry = dget_parent(lower_old_dentry); - lower_new_dir_dentry = dget_parent(lower_new_dentry); + target_inode = d_inode(new_dentry); + trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + dget(lower_new_dentry); rc = -EINVAL; if (lower_old_dentry->d_parent != lower_old_dir_dentry) goto out_lock; @@ -606,11 +631,8 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_dir != old_dir) fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry)); out_lock: - unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); - dput(lower_new_dir_dentry); - dput(lower_old_dir_dentry); dput(lower_new_dentry); - dput(lower_old_dentry); + unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); return rc; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 09bc68708d28..2dd55b172d57 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -519,26 +519,33 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, * inode is actually connected to the parent. */ err = exportfs_get_name(mnt, target_dir, nbuf, result); - if (!err) { - inode_lock(target_dir->d_inode); - nresult = lookup_one_len(nbuf, target_dir, - strlen(nbuf)); - inode_unlock(target_dir->d_inode); - if (!IS_ERR(nresult)) { - if (nresult->d_inode) { - dput(result); - result = nresult; - } else - dput(nresult); - } + if (err) { + dput(target_dir); + goto err_result; } + inode_lock(target_dir->d_inode); + nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); + if (!IS_ERR(nresult)) { + if (unlikely(nresult->d_inode != result->d_inode)) { + dput(nresult); + nresult = ERR_PTR(-ESTALE); + } + } + inode_unlock(target_dir->d_inode); /* * At this point we are done with the parent, but it's pinned * by the child dentry anyway. */ dput(target_dir); + if (IS_ERR(nresult)) { + err = PTR_ERR(nresult); + goto err_result; + } + dput(result); + result = nresult; + /* * And finally make sure the dentry is actually acceptable * to NFSD. diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8461a6322039..335607b8c5c0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -576,10 +576,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, spin_unlock(&inode->i_lock); /* - * A dying wb indicates that the memcg-blkcg mapping has changed - * and a new wb is already serving the memcg. Switch immediately. + * A dying wb indicates that either the blkcg associated with the + * memcg changed or the associated memcg is dying. In the first + * case, a replacement wb should already be available and we should + * refresh the wb immediately. In the second case, trying to + * refresh will keep failing. */ - if (unlikely(wb_dying(wbc->wb))) + if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); diff --git a/fs/io_uring.c b/fs/io_uring.c index f9a38998f2fc..2c819c3c855d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -326,6 +326,7 @@ struct io_kiocb { #define REQ_F_TIMEOUT 1024 /* timeout request */ #define REQ_F_ISREG 2048 /* regular file */ #define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ +#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ u64 user_data; u32 result; u32 sequence; @@ -453,9 +454,13 @@ static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) struct io_kiocb *req; req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list); - if (req && !__io_sequence_defer(ctx, req)) { - list_del_init(&req->list); - return req; + if (req) { + if (req->flags & REQ_F_TIMEOUT_NOSEQ) + return NULL; + if (!__io_sequence_defer(ctx, req)) { + list_del_init(&req->list); + return req; + } } return NULL; @@ -1225,7 +1230,7 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, } } - return 0; + return len; } static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, @@ -1941,18 +1946,24 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; + req->flags |= REQ_F_TIMEOUT; + /* * sqe->off holds how many events that need to occur for this - * timeout event to be satisfied. + * timeout event to be satisfied. If it isn't set, then this is + * a pure timeout request, sequence isn't used. */ count = READ_ONCE(sqe->off); - if (!count) - count = 1; + if (!count) { + req->flags |= REQ_F_TIMEOUT_NOSEQ; + spin_lock_irq(&ctx->completion_lock); + entry = ctx->timeout_list.prev; + goto add; + } req->sequence = ctx->cached_sq_head + count - 1; /* reuse it to store the count */ req->submit.sequence = count; - req->flags |= REQ_F_TIMEOUT; /* * Insertion sort, ensuring the first entry in the list is always @@ -1964,6 +1975,9 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned nxt_sq_head; long long tmp, tmp_nxt; + if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) + continue; + /* * Since cached_sq_head + count - 1 can overflow, use type long * long to store it. @@ -1990,6 +2004,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) nxt->sequence++; } req->sequence -= span; +add: list_add(&req->list, entry); spin_unlock_irq(&ctx->completion_lock); @@ -2283,6 +2298,7 @@ static bool io_op_needs_file(const struct io_uring_sqe *sqe) switch (op) { case IORING_OP_NOP: case IORING_OP_POLL_REMOVE: + case IORING_OP_TIMEOUT: return false; default: return true; diff --git a/fs/namespace.c b/fs/namespace.c index fe0e9e1410fe..2adfe7b166a3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2478,8 +2478,10 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * time64_to_tm(sb->s_time_max, 0, &tm); - pr_warn("Mounted %s file system at %s supports timestamps until %04ld (0x%llx)\n", - sb->s_type->name, mntpath, + pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n", + sb->s_type->name, + is_mounted(mnt) ? "remounted" : "mounted", + mntpath, tm.tm_year+1900, (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); @@ -2764,14 +2766,11 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, if (IS_ERR(mnt)) return PTR_ERR(mnt); - error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); - if (error < 0) { - mntput(mnt); - return error; - } - mnt_warn_timestamp_expiry(mountpoint, mnt); + error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); + if (error < 0) + mntput(mnt); return error; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 53939bf9d7d2..9876db52913a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2098,53 +2098,89 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) return 0; } -static int ocfs2_prepare_inode_for_refcount(struct inode *inode, - struct file *file, - loff_t pos, size_t count, - int *meta_level) +static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, + struct buffer_head **di_bh, + int meta_level, + int overwrite_io, + int write_sem, + int wait) { - int ret; - struct buffer_head *di_bh = NULL; - u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; - u32 clusters = - ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + int ret = 0; - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret) { - mlog_errno(ret); + if (wait) + ret = ocfs2_inode_lock(inode, NULL, meta_level); + else + ret = ocfs2_try_inode_lock(inode, + overwrite_io ? NULL : di_bh, meta_level); + if (ret < 0) goto out; + + if (wait) { + if (write_sem) + down_write(&OCFS2_I(inode)->ip_alloc_sem); + else + down_read(&OCFS2_I(inode)->ip_alloc_sem); + } else { + if (write_sem) + ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem); + else + ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem); + + if (!ret) { + ret = -EAGAIN; + goto out_unlock; + } } - *meta_level = 1; + return ret; - ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); - if (ret) - mlog_errno(ret); +out_unlock: + brelse(*di_bh); + ocfs2_inode_unlock(inode, meta_level); out: - brelse(di_bh); return ret; } +static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode, + struct buffer_head **di_bh, + int meta_level, + int write_sem) +{ + if (write_sem) + up_write(&OCFS2_I(inode)->ip_alloc_sem); + else + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(*di_bh); + *di_bh = NULL; + + if (meta_level >= 0) + ocfs2_inode_unlock(inode, meta_level); +} + static int ocfs2_prepare_inode_for_write(struct file *file, loff_t pos, size_t count, int wait) { int ret = 0, meta_level = 0, overwrite_io = 0; + int write_sem = 0; struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); struct buffer_head *di_bh = NULL; + u32 cpos; + u32 clusters; /* * We start with a read level meta lock and only jump to an ex * if we need to make modifications here. */ for(;;) { - if (wait) - ret = ocfs2_inode_lock(inode, NULL, meta_level); - else - ret = ocfs2_try_inode_lock(inode, - overwrite_io ? NULL : &di_bh, meta_level); + ret = ocfs2_inode_lock_for_extent_tree(inode, + &di_bh, + meta_level, + overwrite_io, + write_sem, + wait); if (ret < 0) { - meta_level = -1; if (ret != -EAGAIN) mlog_errno(ret); goto out; @@ -2156,15 +2192,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file, */ if (!wait && !overwrite_io) { overwrite_io = 1; - if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) { - ret = -EAGAIN; - goto out_unlock; - } ret = ocfs2_overwrite_io(inode, di_bh, pos, count); - brelse(di_bh); - di_bh = NULL; - up_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret < 0) { if (ret != -EAGAIN) mlog_errno(ret); @@ -2183,7 +2212,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * set inode->i_size at the end of a write. */ if (should_remove_suid(dentry)) { if (meta_level == 0) { - ocfs2_inode_unlock(inode, meta_level); + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem); meta_level = 1; continue; } @@ -2197,18 +2229,32 @@ static int ocfs2_prepare_inode_for_write(struct file *file, ret = ocfs2_check_range_for_refcount(inode, pos, count); if (ret == 1) { - ocfs2_inode_unlock(inode, meta_level); - meta_level = -1; - - ret = ocfs2_prepare_inode_for_refcount(inode, - file, - pos, - count, - &meta_level); + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem); + ret = ocfs2_inode_lock_for_extent_tree(inode, + &di_bh, + meta_level, + overwrite_io, + 1, + wait); + write_sem = 1; + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); + goto out; + } + + cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + clusters = + ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); } if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out_unlock; } @@ -2219,10 +2265,10 @@ out_unlock: trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, pos, count, wait); - brelse(di_bh); - - if (meta_level >= 0) - ocfs2_inode_unlock(inode, meta_level); + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem); out: return ret; |