Diffstat (limited to 'fs')

 fs/afs/dir.c           | 108
 fs/afs/dir_silly.c     |  22
 fs/afs/fsclient.c      |  27
 fs/afs/yfsclient.c     |  26
 fs/btrfs/block-group.c |   1
 fs/btrfs/file.c        |  15
 fs/btrfs/reflink.c     |   1
 fs/btrfs/relocation.c  |  23
 fs/btrfs/space-info.c  |  20
 fs/btrfs/tree-log.c    |  93
 fs/ceph/dir.c          |   4
 fs/ceph/file.c         |   4
 fs/ceph/mds_client.h   |   2
 fs/io_uring.c          | 301
 fs/nfs/pnfs.c          |   3

 15 files changed, 345 insertions(+), 305 deletions(-)
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5c794f4b051a..d1e1caa23c8b 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1032,7 +1032,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	struct dentry *parent;
 	struct inode *inode;
 	struct key *key;
-	afs_dataversion_t dir_version;
+	afs_dataversion_t dir_version, invalid_before;
 	long de_version;
 	int ret;
 
@@ -1084,8 +1084,8 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	if (de_version == (long)dir_version)
 		goto out_valid_noupdate;
 
-	dir_version = dir->invalid_before;
-	if (de_version - (long)dir_version >= 0)
+	invalid_before = dir->invalid_before;
+	if (de_version - (long)invalid_before >= 0)
 		goto out_valid;
 
 	_debug("dir modified");
@@ -1275,6 +1275,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	struct afs_fs_cursor fc;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
 	struct key *key;
+	afs_dataversion_t data_version;
 	int ret;
 
 	mode |= S_IFDIR;
@@ -1295,7 +1296,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
-		afs_dataversion_t data_version = dvnode->status.data_version + 1;
+		data_version = dvnode->status.data_version + 1;
 
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1316,10 +1317,14 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 		goto error_key;
 	}
 
-	if (ret == 0 &&
-	    test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-		afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
-				 afs_edit_dir_for_create);
+	if (ret == 0) {
+		down_write(&dvnode->validate_lock);
+		if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+		    dvnode->status.data_version == data_version)
+			afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
+					 afs_edit_dir_for_create);
+		up_write(&dvnode->validate_lock);
+	}
 
 	key_put(key);
 	kfree(scb);
@@ -1360,6 +1365,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct afs_fs_cursor fc;
 	struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL;
 	struct key *key;
+	afs_dataversion_t data_version;
 	int ret;
 
 	_enter("{%llx:%llu},{%pd}",
@@ -1391,7 +1397,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
-		afs_dataversion_t data_version = dvnode->status.data_version + 1;
+		data_version = dvnode->status.data_version + 1;
 
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1404,9 +1410,12 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 		ret = afs_end_vnode_operation(&fc);
 		if (ret == 0) {
 			afs_dir_remove_subdir(dentry);
-			if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+			down_write(&dvnode->validate_lock);
+			if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+			    dvnode->status.data_version == data_version)
 				afs_edit_dir_remove(dvnode, &dentry->d_name,
 						    afs_edit_dir_for_rmdir);
+			up_write(&dvnode->validate_lock);
 		}
 	}
 
@@ -1544,10 +1553,15 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 		ret = afs_end_vnode_operation(&fc);
 		if (ret == 0 && !(scb[1].have_status || scb[1].have_error))
 			ret = afs_dir_remove_link(dvnode, dentry, key);
-		if (ret == 0 &&
-		    test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-			afs_edit_dir_remove(dvnode, &dentry->d_name,
-					    afs_edit_dir_for_unlink);
+
+		if (ret == 0) {
+			down_write(&dvnode->validate_lock);
+			if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+			    dvnode->status.data_version == data_version)
+				afs_edit_dir_remove(dvnode, &dentry->d_name,
+						    afs_edit_dir_for_unlink);
+			up_write(&dvnode->validate_lock);
+		}
 	}
 
 	if (need_rehash && ret < 0 && ret != -ENOENT)
@@ -1573,6 +1587,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	struct afs_status_cb *scb;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
 	struct key *key;
+	afs_dataversion_t data_version;
 	int ret;
 
 	mode |= S_IFREG;
@@ -1597,7 +1612,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
-		afs_dataversion_t data_version = dvnode->status.data_version + 1;
+		data_version = dvnode->status.data_version + 1;
 
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1618,9 +1633,12 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		goto error_key;
 	}
 
-	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+	down_write(&dvnode->validate_lock);
+	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+	    dvnode->status.data_version == data_version)
 		afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
 				 afs_edit_dir_for_create);
+	up_write(&dvnode->validate_lock);
 
 	kfree(scb);
 	key_put(key);
@@ -1648,6 +1666,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
 	struct afs_vnode *vnode = AFS_FS_I(d_inode(from));
 	struct key *key;
+	afs_dataversion_t data_version;
 	int ret;
 
 	_enter("{%llx:%llu},{%llx:%llu},{%pd}",
@@ -1672,7 +1691,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
 
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
-		afs_dataversion_t data_version = dvnode->status.data_version + 1;
+		data_version = dvnode->status.data_version + 1;
 
 		if (mutex_lock_interruptible_nested(&vnode->io_lock, 1) < 0) {
 			afs_end_vnode_operation(&fc);
@@ -1702,9 +1721,12 @@ static int afs_link(struct dentry *from, struct inode *dir,
 		goto error_key;
 	}
 
-	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+	down_write(&dvnode->validate_lock);
+	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+	    dvnode->status.data_version == data_version)
 		afs_edit_dir_add(dvnode, &dentry->d_name, &vnode->fid,
 				 afs_edit_dir_for_link);
+	up_write(&dvnode->validate_lock);
 
 	key_put(key);
 	kfree(scb);
@@ -1732,6 +1754,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
 	struct afs_status_cb *scb;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
 	struct key *key;
+	afs_dataversion_t data_version;
 	int ret;
 
 	_enter("{%llx:%llu},{%pd},%s",
@@ -1759,7 +1782,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
 
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
-		afs_dataversion_t data_version = dvnode->status.data_version + 1;
+		data_version = dvnode->status.data_version + 1;
 
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1780,9 +1803,12 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
 		goto error_key;
 	}
 
-	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+	down_write(&dvnode->validate_lock);
+	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+	    dvnode->status.data_version == data_version)
 		afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
 				 afs_edit_dir_for_symlink);
+	up_write(&dvnode->validate_lock);
 
 	key_put(key);
 	kfree(scb);
@@ -1812,6 +1838,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *tmp = NULL, *rehash = NULL;
 	struct inode *new_inode;
 	struct key *key;
+	afs_dataversion_t orig_data_version;
+	afs_dataversion_t new_data_version;
 	bool new_negative = d_is_negative(new_dentry);
 	int ret;
 
@@ -1890,10 +1918,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	ret = -ERESTARTSYS;
 	if (afs_begin_vnode_operation(&fc, orig_dvnode, key, true)) {
-		afs_dataversion_t orig_data_version;
-		afs_dataversion_t new_data_version;
-		struct afs_status_cb *new_scb = &scb[1];
-
 		orig_data_version = orig_dvnode->status.data_version + 1;
 
 		if (orig_dvnode != new_dvnode) {
@@ -1904,7 +1928,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			new_data_version = new_dvnode->status.data_version + 1;
 		} else {
 			new_data_version = orig_data_version;
-			new_scb = &scb[0];
 		}
 
 		while (afs_select_fileserver(&fc)) {
@@ -1912,7 +1935,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			fc.cb_break_2 = afs_calc_vnode_cb_break(new_dvnode);
 			afs_fs_rename(&fc, old_dentry->d_name.name,
 				      new_dvnode, new_dentry->d_name.name,
-				      &scb[0], new_scb);
+				      &scb[0], &scb[1]);
 		}
 
 		afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break,
@@ -1930,18 +1953,25 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (ret == 0) {
 		if (rehash)
 			d_rehash(rehash);
-		if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags))
-			afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name,
-					    afs_edit_dir_for_rename_0);
+		down_write(&orig_dvnode->validate_lock);
+		if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+		    orig_dvnode->status.data_version == orig_data_version)
+			afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name,
+					    afs_edit_dir_for_rename_0);
+		if (orig_dvnode != new_dvnode) {
+			up_write(&orig_dvnode->validate_lock);
 
-		if (!new_negative &&
-		    test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags))
-			afs_edit_dir_remove(new_dvnode, &new_dentry->d_name,
-					    afs_edit_dir_for_rename_1);
+			down_write(&new_dvnode->validate_lock);
+		}
+		if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) &&
+		    orig_dvnode->status.data_version == new_data_version) {
+			if (!new_negative)
+				afs_edit_dir_remove(new_dvnode, &new_dentry->d_name,
+						    afs_edit_dir_for_rename_1);
 
-		if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags))
 			afs_edit_dir_add(new_dvnode, &new_dentry->d_name,
 					 &vnode->fid, afs_edit_dir_for_rename_2);
+		}
 
 		new_inode = d_inode(new_dentry);
 		if (new_inode) {
@@ -1957,14 +1987,10 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		 * Note that if we ever implement RENAME_EXCHANGE, we'll have
 		 * to update both dentries with opposing dir versions.
 		 */
-		if (new_dvnode != orig_dvnode) {
-			afs_update_dentry_version(&fc, old_dentry, &scb[1]);
-			afs_update_dentry_version(&fc, new_dentry, &scb[1]);
-		} else {
-			afs_update_dentry_version(&fc, old_dentry, &scb[0]);
-			afs_update_dentry_version(&fc, new_dentry, &scb[0]);
-		}
+		afs_update_dentry_version(&fc, old_dentry, &scb[1]);
+		afs_update_dentry_version(&fc, new_dentry, &scb[1]);
 		d_move(old_dentry, new_dentry);
+		up_write(&new_dvnode->validate_lock);
 		goto error_tmp;
 	}
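Every directory-mutation path above (mkdir, rmdir, unlink, create, link, symlink, rename) — and the silly-rename paths in dir_silly.c just below — now applies the same discipline: take validate_lock exclusively, then only patch the cached directory blob if it is still marked valid and its data version moved exactly to the value this operation produced on the server. A minimal userspace sketch of the pattern, using a pthreads rwlock to stand in for validate_lock (types and names here are illustrative, not the kernel's):

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

/* Toy directory vnode: a version counter plus a cached-contents flag. */
struct dir_vnode {
	pthread_rwlock_t validate_lock;	/* models dvnode->validate_lock */
	uint64_t	data_version;	/* models dvnode->status.data_version */
	bool		contents_valid;	/* models AFS_VNODE_DIR_VALID */
};

/* Called after the server op succeeded; expected = pre-op version + 1. */
static void maybe_edit_cached_dir(struct dir_vnode *dv, uint64_t expected,
				  void (*edit)(struct dir_vnode *))
{
	pthread_rwlock_wrlock(&dv->validate_lock);
	/*
	 * Patch the cached copy only if it is still valid AND the version
	 * is exactly what this operation produced; if a third party moved
	 * the directory on, the cache must be refetched, not patched.
	 */
	if (dv->contents_valid && dv->data_version == expected)
		edit(dv);
	pthread_rwlock_unlock(&dv->validate_lock);
}

Holding the lock across both the version check and the edit is the point: a concurrent revalidation can no longer observe a blob that matches neither the old nor the new version.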
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 361088a5edb9..d94e2b7cddff 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -21,6 +21,7 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
 {
 	struct afs_fs_cursor fc;
 	struct afs_status_cb *scb;
+	afs_dataversion_t dir_data_version;
 	int ret = -ERESTARTSYS;
 
 	_enter("%pd,%pd", old, new);
@@ -31,7 +32,7 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
 
 	trace_afs_silly_rename(vnode, false);
 	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
-		afs_dataversion_t dir_data_version = dvnode->status.data_version + 1;
+		dir_data_version = dvnode->status.data_version + 1;
 
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -54,12 +55,15 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
 			dvnode->silly_key = key_get(key);
 		}
 
-		if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+		down_write(&dvnode->validate_lock);
+		if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+		    dvnode->status.data_version == dir_data_version) {
 			afs_edit_dir_remove(dvnode, &old->d_name,
 					    afs_edit_dir_for_silly_0);
-		if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
 			afs_edit_dir_add(dvnode, &new->d_name,
 					 &vnode->fid, afs_edit_dir_for_silly_1);
+		}
+		up_write(&dvnode->validate_lock);
 	}
 
 	kfree(scb);
@@ -181,10 +185,14 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
 				clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 			}
 		}
-		if (ret == 0 &&
-		    test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-			afs_edit_dir_remove(dvnode, &dentry->d_name,
-					    afs_edit_dir_for_unlink);
+		if (ret == 0) {
+			down_write(&dvnode->validate_lock);
+			if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+			    dvnode->status.data_version == dir_data_version)
+				afs_edit_dir_remove(dvnode, &dentry->d_name,
+						    afs_edit_dir_for_unlink);
+			up_write(&dvnode->validate_lock);
+		}
 	}
 
 	kfree(scb);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 1f9c5d8e6fe5..68fc46634346 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -65,6 +65,7 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	bool inline_error = (call->operation_ID == afs_FS_InlineBulkStatus);
 	u64 data_version, size;
 	u32 type, abort_code;
+	int ret;
 
 	abort_code = ntohl(xdr->abort_code);
 
@@ -78,7 +79,7 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
 			 */
 			status->abort_code = abort_code;
 			scb->have_error = true;
-			return 0;
+			goto good;
 		}
 
 		pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version));
@@ -87,7 +88,8 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
 
 	if (abort_code != 0 && inline_error) {
 		status->abort_code = abort_code;
-		return 0;
+		scb->have_error = true;
+		goto good;
 	}
 
 	type = ntohl(xdr->type);
@@ -123,13 +125,16 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	data_version |= (u64)ntohl(xdr->data_version_hi) << 32;
 	status->data_version = data_version;
 	scb->have_status = true;
-
+good:
+	ret = 0;
+advance:
 	*_bp = (const void *)*_bp + sizeof(*xdr);
-	return 0;
+	return ret;
 
 bad:
 	xdr_dump_bad(*_bp);
-	return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+	ret = afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+	goto advance;
 }
 
 static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
@@ -981,16 +986,16 @@ static int afs_deliver_fs_rename(struct afs_call *call)
 	if (ret < 0)
 		return ret;
 
-	/* unmarshall the reply once we've received all of it */
+	/* If the two dirs are the same, we have two copies of the same status
+	 * report, so we just decode it twice.
+	 */
 	bp = call->buffer;
 	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
 	if (ret < 0)
 		return ret;
-	if (call->out_dir_scb != call->out_scb) {
-		ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
-		if (ret < 0)
-			return ret;
-	}
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
 	xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
 	_leave(" = 0 [done]");
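xdr_decode_AFSFetchStatus() above and xdr_decode_YFSFetchStatus() below get the same restructuring: every exit, including the abort and protocol-error exits, now funnels through a label that advances the buffer pointer past the fixed-size record, so one bad status in a bulk reply cannot leave the cursor misaligned for the records after it. Reduced to a hedged sketch (the record layout and validity check are invented for illustration):

#include <errno.h>
#include <stdint.h>

/* Hypothetical fixed-size wire record, stand-in for the AFS/YFS status. */
struct xdr_status { uint32_t abort_code, type; };

static int decode_status(const uint8_t **bufp, struct xdr_status *out)
{
	const struct xdr_status *xdr = (const void *)*bufp;
	int ret;

	if (xdr->abort_code != 0) {
		out->abort_code = xdr->abort_code;	/* an abort is still */
		goto good;				/* a complete record */
	}
	if (xdr->type > 3) {		/* malformed: report, but... */
		ret = -EBADMSG;
		goto advance;		/* ...still consume the record */
	}
	*out = *xdr;
good:
	ret = 0;
advance:
	*bufp += sizeof(*xdr);	/* every path advances the cursor */
	return ret;
}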
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index a26126ac7bf1..b5b45c57e1b1 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -165,15 +165,15 @@ static void xdr_dump_bad(const __be32 *bp)
 	int i;
 
 	pr_notice("YFS XDR: Bad status record\n");
-	for (i = 0; i < 5 * 4 * 4; i += 16) {
+	for (i = 0; i < 6 * 4 * 4; i += 16) {
 		memcpy(x, bp, 16);
 		bp += 4;
 		pr_notice("%03x: %08x %08x %08x %08x\n",
 			  i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3]));
 	}
 
-	memcpy(x, bp, 4);
-	pr_notice("0x50: %08x\n", ntohl(x[0]));
+	memcpy(x, bp, 8);
+	pr_notice("0x60: %08x %08x\n", ntohl(x[0]), ntohl(x[1]));
 }
 
 /*
@@ -186,13 +186,14 @@ static int xdr_decode_YFSFetchStatus(const __be32 **_bp,
 	const struct yfs_xdr_YFSFetchStatus *xdr = (const void *)*_bp;
 	struct afs_file_status *status = &scb->status;
 	u32 type;
+	int ret;
 
 	status->abort_code = ntohl(xdr->abort_code);
 	if (status->abort_code != 0) {
 		if (status->abort_code == VNOVNODE)
 			status->nlink = 0;
 		scb->have_error = true;
-		return 0;
+		goto good;
 	}
 
 	type = ntohl(xdr->type);
@@ -220,13 +221,16 @@ static int xdr_decode_YFSFetchStatus(const __be32 **_bp,
 	status->size		= xdr_to_u64(xdr->size);
 	status->data_version	= xdr_to_u64(xdr->data_version);
 	scb->have_status	= true;
-
+good:
+	ret = 0;
+advance:
 	*_bp += xdr_size(xdr);
-	return 0;
+	return ret;
 
 bad:
 	xdr_dump_bad(*_bp);
-	return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+	ret = afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+	goto advance;
 }
 
 /*
@@ -1153,11 +1157,9 @@ static int yfs_deliver_fs_rename(struct afs_call *call)
 	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
 	if (ret < 0)
 		return ret;
-	if (call->out_dir_scb != call->out_scb) {
-		ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
-		if (ret < 0)
-			return ret;
-	}
+	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
 	xdr_decode_YFSVolSync(&bp, call->out_volsync);
 
 	_leave(" = 0 [done]");
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 786849fcc319..47f66c6a7d7f 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -3370,6 +3370,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 			    space_info->bytes_reserved > 0 ||
 			    space_info->bytes_may_use > 0))
 			btrfs_dump_space_info(info, space_info, 0, 0);
+		WARN_ON(space_info->reclaim_size > 0);
 		list_del(&space_info->list);
 		btrfs_sysfs_remove_space_info(space_info);
 	}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8a144f9cb7ac..719e68ab552c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2098,6 +2098,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	atomic_inc(&root->log_batch);
 
 	/*
+	 * If the inode needs a full sync, make sure we use a full range to
+	 * avoid log tree corruption, due to hole detection racing with ordered
+	 * extent completion for adjacent ranges and races between logging and
+	 * completion of ordered extents for adjacent ranges - both races
+	 * could lead to file extent items in the log with overlapping ranges.
+	 * Do this while holding the inode lock, to avoid races with other
+	 * tasks.
+	 */
+	if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+		     &BTRFS_I(inode)->runtime_flags)) {
+		start = 0;
+		end = LLONG_MAX;
+	}
+
+	/*
 	 * Before we acquired the inode's lock, someone may have dirtied more
 	 * pages in the target range. We need to make sure that writeback for
 	 * any such pages does not start while we are logging the inode, because
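The btrfs_sync_file() hunk above defeats the ranged-fsync races by simply widening the range to the whole file whenever the inode is flagged for a full sync, and it does so while the inode lock is held, so the flag test and the logging cannot be separated by another task setting the flag. A toy model of that decision, with placeholder types:

#include <limits.h>
#include <pthread.h>
#include <stdbool.h>

struct toy_inode {
	pthread_mutex_t	lock;		/* models the VFS inode lock */
	bool		needs_full_sync;
};

static void log_range(struct toy_inode *inode, long long start, long long end)
{
	/* stand-in for logging [start, end] of the file */
	(void)inode; (void)start; (void)end;
}

static void fsync_model(struct toy_inode *inode, long long start, long long end)
{
	pthread_mutex_lock(&inode->lock);
	/* Test the flag and widen the range under the same lock. */
	if (inode->needs_full_sync) {
		start = 0;
		end = LLONG_MAX;
	}
	log_range(inode, start, end);
	pthread_mutex_unlock(&inode->lock);
}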
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index d1973141d3bb..040009d1cc31 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -264,6 +264,7 @@ copy_inline_extent:
 			    size);
 	inode_add_bytes(dst, datal);
 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
+	ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
 out:
 	if (!ret && !trans) {
 		/*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f65595602aa8..d35936c934ab 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -611,8 +611,8 @@ static int should_ignore_root(struct btrfs_root *root)
 	if (!reloc_root)
 		return 0;
 
-	if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
-	    root->fs_info->running_transaction->transid - 1)
+	if (btrfs_header_generation(reloc_root->commit_root) ==
+	    root->fs_info->running_transaction->transid)
 		return 0;
 	/*
 	 * if there is reloc tree and it was created in previous
@@ -1527,8 +1527,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 	int clear_rsv = 0;
 	int ret;
 
-	if (!rc || !rc->create_reloc_tree ||
-	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+	if (!rc)
 		return 0;
 
 	/*
@@ -1538,12 +1537,28 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 	if (reloc_root_is_dead(root))
 		return 0;
 
+	/*
+	 * This is subtle but important.  We do not do
+	 * record_root_in_transaction for reloc roots, instead we record their
+	 * corresponding fs root, and then here we update the last trans for the
+	 * reloc root.  This means that we have to do this for the entire life
+	 * of the reloc root, regardless of which stage of the relocation we are
+	 * in.
+	 */
 	if (root->reloc_root) {
 		reloc_root = root->reloc_root;
 		reloc_root->last_trans = trans->transid;
 		return 0;
 	}
 
+	/*
+	 * We are merging reloc roots, we do not need new reloc trees.  Also
+	 * reloc trees never need their own reloc tree.
+	 */
+	if (!rc->create_reloc_tree ||
+	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+		return 0;
+
 	if (!trans->reloc_reserved) {
 		rsv = trans->block_rsv;
 		trans->block_rsv = rc->block_rsv;
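The btrfs_init_reloc_root() reordering is the subtle part of the relocation fix: refreshing last_trans on an existing reloc root must come before the "should we create one at all" bailout, because an existing reloc root has to stay current for its entire life, the merge stage included. Roughly, with toy types standing in for the btrfs structures:

#include <stdbool.h>
#include <stdint.h>

struct root {
	struct root	*reloc_root;
	uint64_t	last_trans;
	bool		is_reloc_tree;
};

static int init_reloc_root(struct root *root, uint64_t transid,
			   bool create_reloc_tree)
{
	/* Refresh an existing reloc root FIRST, even while merging... */
	if (root->reloc_root) {
		root->reloc_root->last_trans = transid;
		return 0;
	}
	/* ...and only then decide whether a new one is wanted at all. */
	if (!create_reloc_tree || root->is_reloc_tree)
		return 0;
	/* ... allocate and attach a new reloc root here ... */
	return 0;
}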
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 8b0fe053a25d..ff17a4420358 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -361,6 +361,16 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+static void remove_ticket(struct btrfs_space_info *space_info,
+			  struct reserve_ticket *ticket)
+{
+	if (!list_empty(&ticket->list)) {
+		list_del_init(&ticket->list);
+		ASSERT(space_info->reclaim_size >= ticket->bytes);
+		space_info->reclaim_size -= ticket->bytes;
+	}
+}
+
 /*
  * This is for space we already have accounted in space_info->bytes_may_use, so
  * basically when we're returning space from block_rsv's.
@@ -388,9 +398,7 @@ again:
 		btrfs_space_info_update_bytes_may_use(fs_info,
 						      space_info,
 						      ticket->bytes);
-		list_del_init(&ticket->list);
-		ASSERT(space_info->reclaim_size >= ticket->bytes);
-		space_info->reclaim_size -= ticket->bytes;
+		remove_ticket(space_info, ticket);
 		ticket->bytes = 0;
 		space_info->tickets_id++;
 		wake_up(&ticket->wait);
@@ -899,7 +907,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
 			btrfs_info(fs_info, "failing ticket with %llu bytes",
 				   ticket->bytes);
 
-		list_del_init(&ticket->list);
+		remove_ticket(space_info, ticket);
 		ticket->error = -ENOSPC;
 		wake_up(&ticket->wait);
 
@@ -1063,7 +1071,7 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
 			 * despite getting an error, resulting in a space leak
 			 * (bytes_may_use counter of our space_info).
 			 */
-			list_del_init(&ticket->list);
+			remove_ticket(space_info, ticket);
 			ticket->error = -EINTR;
 			break;
 		}
@@ -1121,7 +1129,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
 		 * either the async reclaim job deletes the ticket from the list
 		 * or we delete it ourselves at wait_reserve_ticket().
 		 */
-		list_del_init(&ticket->list);
+		remove_ticket(space_info, ticket);
 		if (!ret)
 			ret = -ENOSPC;
 	}
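remove_ticket() collapses three hand-rolled copies of "unlink the ticket and shrink reclaim_size" into one helper, and the list_empty() guard makes it safe to call from every failure path without double-subtracting — which is what lets the WARN_ON added in block-group.c above hold at unmount. The invariant, modeled in userspace (reclaim_size always equals the bytes of queued tickets; BSD lists stand in for list_head):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/queue.h>

struct ticket {
	uint64_t		bytes;
	bool			queued;
	LIST_ENTRY(ticket)	link;
};

struct space_info {
	uint64_t			reclaim_size;
	LIST_HEAD(ticket_list, ticket)	tickets;
};

/* Idempotent: callable from every success and failure path. */
static void remove_ticket(struct space_info *si, struct ticket *t)
{
	if (t->queued) {
		LIST_REMOVE(t, link);
		t->queued = false;
		assert(si->reclaim_size >= t->bytes);
		si->reclaim_size -= t->bytes;
	}
}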
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 58c111474ba5..ec36a7c6ba3d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -96,8 +96,8 @@ enum {
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct btrfs_inode *inode,
 			   int inode_only,
-			   u64 start,
-			   u64 end,
+			   const loff_t start,
+			   const loff_t end,
 			   struct btrfs_log_ctx *ctx);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
@@ -4533,15 +4533,13 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
 static int btrfs_log_holes(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_inode *inode,
-			   struct btrfs_path *path,
-			   const u64 start,
-			   const u64 end)
+			   struct btrfs_path *path)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_key key;
 	const u64 ino = btrfs_ino(inode);
 	const u64 i_size = i_size_read(&inode->vfs_inode);
-	u64 prev_extent_end = start;
+	u64 prev_extent_end = 0;
 	int ret;
 
 	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
@@ -4549,21 +4547,14 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
 
 	key.objectid = ino;
 	key.type = BTRFS_EXTENT_DATA_KEY;
-	key.offset = start;
+	key.offset = 0;
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		return ret;
 
-	if (ret > 0 && path->slots[0] > 0) {
-		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
-		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
-			path->slots[0]--;
-	}
-
 	while (true) {
 		struct extent_buffer *leaf = path->nodes[0];
-		u64 extent_end;
 
 		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
 			ret = btrfs_next_leaf(root, path);
@@ -4580,18 +4571,9 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
 		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
 			break;
 
-		extent_end = btrfs_file_extent_end(path);
-		if (extent_end <= start)
-			goto next_slot;
-
 		/* We have a hole, log it. */
 		if (prev_extent_end < key.offset) {
-			u64 hole_len;
-
-			if (key.offset >= end)
-				hole_len = end - prev_extent_end;
-			else
-				hole_len = key.offset - prev_extent_end;
+			const u64 hole_len = key.offset - prev_extent_end;
 
 			/*
 			 * Release the path to avoid deadlocks with other code
@@ -4621,20 +4603,16 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
 			leaf = path->nodes[0];
 		}
 
-		prev_extent_end = min(extent_end, end);
-		if (extent_end >= end)
-			break;
-next_slot:
+		prev_extent_end = btrfs_file_extent_end(path);
 		path->slots[0]++;
 		cond_resched();
 	}
 
-	if (prev_extent_end < end && prev_extent_end < i_size) {
+	if (prev_extent_end < i_size) {
 		u64 hole_len;
 
 		btrfs_release_path(path);
-		hole_len = min(ALIGN(i_size, fs_info->sectorsize), end);
-		hole_len -= prev_extent_end;
+		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
 		ret = btrfs_insert_file_extent(trans, root->log_root, ino,
 					       prev_extent_end, 0, 0, hole_len,
 					       0, hole_len,
@@ -4971,8 +4949,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
 				   const u64 logged_isize,
 				   const bool recursive_logging,
 				   const int inode_only,
-				   const u64 start,
-				   const u64 end,
 				   struct btrfs_log_ctx *ctx,
 				   bool *need_log_inode_item)
 {
@@ -4981,21 +4957,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
 	int ins_nr = 0;
 	int ret;
 
-	/*
-	 * We must make sure we don't copy extent items that are entirely out of
-	 * the range [start, end - 1]. This is not just an optimization to avoid
-	 * copying but also needed to avoid a corruption where we end up with
-	 * file extent items in the log tree that have overlapping ranges - this
-	 * can happen if we race with ordered extent completion for ranges that
-	 * are outside our target range. For example we copy an extent item and
-	 * when we move to the next leaf, that extent was trimmed and a new one
-	 * covering a subrange of it, but with a higher key, was inserted - we
-	 * would then copy this other extent too, resulting in a log tree with
-	 * 2 extent items that represent overlapping ranges.
-	 *
-	 * We can copy the entire extents at the range boundaries however, even
-	 * if they cover an area outside the target range. That's ok.
-	 */
 	while (1) {
 		ret = btrfs_search_forward(root, min_key, path, trans->transid);
 		if (ret < 0)
@@ -5063,29 +5024,6 @@ again:
 			goto next_slot;
 		}
 
-		if (min_key->type == BTRFS_EXTENT_DATA_KEY) {
-			const u64 extent_end = btrfs_file_extent_end(path);
-
-			if (extent_end <= start) {
-				if (ins_nr > 0) {
-					ret = copy_items(trans, inode, dst_path,
-							 path, ins_start_slot,
-							 ins_nr, inode_only,
-							 logged_isize);
-					if (ret < 0)
-						return ret;
-					ins_nr = 0;
-				}
-				goto next_slot;
-			}
-			if (extent_end >= end) {
-				ins_nr++;
-				if (ins_nr == 1)
-					ins_start_slot = path->slots[0];
-				break;
-			}
-		}
-
 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
 			ins_nr++;
 			goto next_slot;
@@ -5151,8 +5089,8 @@ next_key:
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct btrfs_inode *inode,
 			   int inode_only,
-			   u64 start,
-			   u64 end,
+			   const loff_t start,
+			   const loff_t end,
 			   struct btrfs_log_ctx *ctx)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -5180,9 +5118,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	}
 
-	start = ALIGN_DOWN(start, fs_info->sectorsize);
-	end = ALIGN(end, fs_info->sectorsize);
-
 	min_key.objectid = ino;
 	min_key.type = BTRFS_INODE_ITEM_KEY;
 	min_key.offset = 0;
@@ -5298,8 +5233,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
 				      path, dst_path, logged_isize,
-				      recursive_logging, inode_only,
-				      start, end, ctx, &need_log_inode_item);
+				      recursive_logging, inode_only, ctx,
+				      &need_log_inode_item);
 	if (err)
 		goto out_unlock;
 
@@ -5312,7 +5247,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
 		btrfs_release_path(path);
 		btrfs_release_path(dst_path);
-		err = btrfs_log_holes(trans, root, inode, path, start, end);
+		err = btrfs_log_holes(trans, root, inode, path);
 		if (err)
 			goto out_unlock;
 	}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d594c2627430..4c4202c93b71 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1051,8 +1051,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
 
 	/* If op failed, mark everyone involved for errors */
 	if (result) {
-		int pathlen;
-		u64 base;
+		int pathlen = 0;
+		u64 base = 0;
 		char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
 						  &base, 0);
 
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 4a5ccbb7e808..afdfca965a7f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -527,8 +527,8 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
 
 	if (result) {
 		struct dentry *dentry = req->r_dentry;
-		int pathlen;
-		u64 base;
+		int pathlen = 0;
+		u64 base = 0;
 		char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
 						  &base, 0);
 
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4e5be79bf080..903d9edfd4bf 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -521,7 +521,7 @@ extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
 
 static inline void ceph_mdsc_free_path(char *path, int len)
 {
-	if (path)
+	if (!IS_ERR_OR_NULL(path))
 		__putname(path - (PATH_MAX - 1 - len));
 }
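The ceph changes are one fix in three parts: ceph_mdsc_build_path() can return NULL or an ERR_PTR(), so pathlen/base are now initialised in the callers and ceph_mdsc_free_path() refuses anything that is not a real pointer. A sketch of the guard, with a userspace stand-in for the kernel's error-pointer convention:

#include <stdint.h>
#include <stdlib.h>

#define MAX_ERRNO 4095
/* Kernel convention: the top 4095 addresses encode a negative errno. */
static inline int is_err_or_null(const void *p)
{
	return !p || (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

/* Models ceph_mdsc_free_path(): path points into a larger allocation. */
static void free_path(char *path, int len, int path_max)
{
	if (!is_err_or_null(path))
		free(path - (path_max - 1 - len));	/* models __putname() */
}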
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5190bfb6a665..381d50becd04 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -357,7 +357,6 @@ struct io_timeout_data {
 	struct hrtimer			timer;
 	struct timespec64		ts;
 	enum hrtimer_mode		mode;
-	u32				seq_offset;
 };
 
 struct io_accept {
@@ -385,7 +384,7 @@ struct io_timeout {
 	struct file			*file;
 	u64				addr;
 	int				flags;
-	unsigned			count;
+	u32				count;
 };
 
 struct io_rw {
@@ -508,6 +507,7 @@ enum {
 	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
 	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
 
+	REQ_F_LINK_HEAD_BIT,
 	REQ_F_LINK_NEXT_BIT,
 	REQ_F_FAIL_LINK_BIT,
 	REQ_F_INFLIGHT_BIT,
@@ -543,6 +543,8 @@ enum {
 	/* IOSQE_BUFFER_SELECT */
 	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
 
+	/* head of a link */
+	REQ_F_LINK_HEAD		= BIT(REQ_F_LINK_HEAD_BIT),
 	/* already grabbed next link */
 	REQ_F_LINK_NEXT		= BIT(REQ_F_LINK_NEXT_BIT),
 	/* fail rest of links */
@@ -955,8 +957,8 @@ static inline bool __req_need_defer(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
-				+ atomic_read(&ctx->cached_cq_overflow);
+	return req->sequence != ctx->cached_cq_tail
+				+ atomic_read(&ctx->cached_cq_overflow);
 }
 
 static inline bool req_need_defer(struct io_kiocb *req)
@@ -1437,7 +1439,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
 	if (ret != -1) {
 		io_cqring_fill_event(req, -ECANCELED);
 		io_commit_cqring(ctx);
-		req->flags &= ~REQ_F_LINK;
+		req->flags &= ~REQ_F_LINK_HEAD;
 		io_put_req(req);
 		return true;
 	}
@@ -1473,7 +1475,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 
 		list_del_init(&req->link_list);
 		if (!list_empty(&nxt->link_list))
-			nxt->flags |= REQ_F_LINK;
+			nxt->flags |= REQ_F_LINK_HEAD;
 		*nxtptr = nxt;
 		break;
 	}
@@ -1484,7 +1486,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 }
 
 /*
- * Called if REQ_F_LINK is set, and we fail the head request
+ * Called if REQ_F_LINK_HEAD is set, and we fail the head request
  */
 static void io_fail_links(struct io_kiocb *req)
 {
@@ -1517,7 +1519,7 @@ static void io_fail_links(struct io_kiocb *req)
 
 static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 {
-	if (likely(!(req->flags & REQ_F_LINK)))
+	if (likely(!(req->flags & REQ_F_LINK_HEAD)))
 		return;
 
 	/*
@@ -1669,7 +1671,7 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 
 static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
 {
-	if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
+	if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
 		return false;
 
 	if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
@@ -2562,7 +2564,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 
 	req->result = 0;
 	io_size = ret;
-	if (req->flags & REQ_F_LINK)
+	if (req->flags & REQ_F_LINK_HEAD)
 		req->result = io_size;
 
 	/*
@@ -2653,7 +2655,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 
 	req->result = 0;
 	io_size = ret;
-	if (req->flags & REQ_F_LINK)
+	if (req->flags & REQ_F_LINK_HEAD)
 		req->result = io_size;
 
 	/*
@@ -2760,7 +2762,7 @@ static bool io_splice_punt(struct file *file)
 		return false;
 	if (!io_file_supports_async(file))
 		return true;
-	return !(file->f_mode & O_NONBLOCK);
+	return !(file->f_flags & O_NONBLOCK);
 }
 
 static int io_splice(struct io_kiocb *req, bool force_nonblock)
@@ -4153,20 +4155,57 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 	return 1;
 }
 
+static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
+	__acquires(&req->ctx->completion_lock)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (!req->result && !READ_ONCE(poll->canceled)) {
+		struct poll_table_struct pt = { ._key = poll->events };
+
+		req->result = vfs_poll(req->file, &pt) & poll->events;
+	}
+
+	spin_lock_irq(&ctx->completion_lock);
+	if (!req->result && !READ_ONCE(poll->canceled)) {
+		add_wait_queue(poll->head, &poll->wait);
+		return true;
+	}
+
+	return false;
+}
+
 static void io_async_task_func(struct callback_head *cb)
 {
 	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
 	struct async_poll *apoll = req->apoll;
 	struct io_ring_ctx *ctx = req->ctx;
+	bool canceled;
 
 	trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
 
-	WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
+	if (io_poll_rewait(req, &apoll->poll)) {
+		spin_unlock_irq(&ctx->completion_lock);
+		return;
+	}
 
-	if (hash_hashed(&req->hash_node)) {
-		spin_lock_irq(&ctx->completion_lock);
+	if (hash_hashed(&req->hash_node))
 		hash_del(&req->hash_node);
-		spin_unlock_irq(&ctx->completion_lock);
+
+	canceled = READ_ONCE(apoll->poll.canceled);
+	if (canceled) {
+		io_cqring_fill_event(req, -ECANCELED);
+		io_commit_cqring(ctx);
+	}
+
+	spin_unlock_irq(&ctx->completion_lock);
+
+	if (canceled) {
+		kfree(apoll);
+		io_cqring_ev_posted(ctx);
+		req_set_fail_links(req);
+		io_put_req(req);
+		return;
 	}
 
 	/* restore ->work in case we need to retry again */
@@ -4315,11 +4354,13 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
 
 static bool io_poll_remove_one(struct io_kiocb *req)
 {
+	struct async_poll *apoll = NULL;
 	bool do_complete;
 
 	if (req->opcode == IORING_OP_POLL_ADD) {
 		do_complete = __io_poll_remove_one(req, &req->poll);
 	} else {
+		apoll = req->apoll;
 		/* non-poll requests have submit ref still */
 		do_complete = __io_poll_remove_one(req, &req->apoll->poll);
 		if (do_complete)
@@ -4328,6 +4369,14 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 
 	hash_del(&req->hash_node);
 
+	if (apoll) {
+		/*
+		 * restore ->work because we need to call io_req_work_drop_env.
+		 */
+		memcpy(&req->work, &apoll->work, sizeof(req->work));
+		kfree(apoll);
+	}
+
 	if (do_complete) {
 		io_cqring_fill_event(req, -ECANCELED);
 		io_commit_cqring(req->ctx);
@@ -4342,7 +4391,7 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
 {
 	struct hlist_node *tmp;
 	struct io_kiocb *req;
-	int i;
+	int posted = 0, i;
 
 	spin_lock_irq(&ctx->completion_lock);
 	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
@@ -4350,11 +4399,12 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
 
 		list = &ctx->cancel_hash[i];
 		hlist_for_each_entry_safe(req, tmp, list, hash_node)
-			io_poll_remove_one(req);
+			posted += io_poll_remove_one(req);
 	}
 	spin_unlock_irq(&ctx->completion_lock);
 
-	io_cqring_ev_posted(ctx);
+	if (posted)
+		io_cqring_ev_posted(ctx);
 }
 
 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
@@ -4423,18 +4473,11 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_poll_iocb *poll = &req->poll;
 
-	if (!req->result && !READ_ONCE(poll->canceled)) {
-		struct poll_table_struct pt = { ._key = poll->events };
-
-		req->result = vfs_poll(req->file, &pt) & poll->events;
-	}
-
-	spin_lock_irq(&ctx->completion_lock);
-	if (!req->result && !READ_ONCE(poll->canceled)) {
-		add_wait_queue(poll->head, &poll->wait);
+	if (io_poll_rewait(req, poll)) {
 		spin_unlock_irq(&ctx->completion_lock);
 		return;
 	}
+
 	hash_del(&req->hash_node);
 	io_poll_complete(req, req->result, 0);
 	req->flags |= REQ_F_COMP_LOCKED;
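io_poll_rewait() factors the re-check out of both poll task handlers above: re-poll the file without the lock, then decide under completion_lock, and if there is still nothing to report, re-arm the waitqueue entry while the lock is held. The acquire-and-decide shape, modeled with a pthread mutex standing in for completion_lock (names and callbacks are mine, not the kernel's):

#include <pthread.h>
#include <stdbool.h>

struct poll_req {
	pthread_mutex_t	*completion_lock;
	int		result;
	bool		canceled;
};

/*
 * Returns true with the lock held and the wait re-armed (caller just
 * unlocks and returns); returns false with the lock held and an event
 * ready (caller completes it, then unlocks).
 */
static bool poll_rewait(struct poll_req *req,
			int (*repoll)(struct poll_req *),
			void (*rearm)(struct poll_req *))
{
	if (!req->result && !req->canceled)
		req->result = repoll(req);	/* cheap lockless re-check */

	pthread_mutex_lock(req->completion_lock);
	if (!req->result && !req->canceled) {
		rearm(req);			/* nothing yet: wait again */
		return true;
	}
	return false;
}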
@@ -4665,11 +4708,12 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 static int io_timeout(struct io_kiocb *req)
 {
-	unsigned count;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_timeout_data *data;
 	struct list_head *entry;
 	unsigned span = 0;
+	u32 count = req->timeout.count;
+	u32 seq = req->sequence;
 
 	data = &req->io->timeout;
 
@@ -4678,7 +4722,6 @@ static int io_timeout(struct io_kiocb *req)
 	 * timeout event to be satisfied. If it isn't set, then this is
 	 * a pure timeout request, sequence isn't used.
 	 */
-	count = req->timeout.count;
 	if (!count) {
 		req->flags |= REQ_F_TIMEOUT_NOSEQ;
 		spin_lock_irq(&ctx->completion_lock);
@@ -4686,8 +4729,7 @@ static int io_timeout(struct io_kiocb *req)
 		goto add;
 	}
 
-	req->sequence = ctx->cached_sq_head + count - 1;
-	data->seq_offset = count;
+	req->sequence = seq + count;
 
 	/*
 	 * Insertion sort, ensuring the first entry in the list is always
@@ -4696,26 +4738,26 @@ static int io_timeout(struct io_kiocb *req)
 	spin_lock_irq(&ctx->completion_lock);
 	list_for_each_prev(entry, &ctx->timeout_list) {
 		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
-		unsigned nxt_sq_head;
+		unsigned nxt_seq;
 		long long tmp, tmp_nxt;
-		u32 nxt_offset = nxt->io->timeout.seq_offset;
+		u32 nxt_offset = nxt->timeout.count;
 
 		if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
 			continue;
 
 		/*
-		 * Since cached_sq_head + count - 1 can overflow, use type long
+		 * Since seq + count can overflow, use type long
 		 * long to store it.
 		 */
-		tmp = (long long)ctx->cached_sq_head + count - 1;
-		nxt_sq_head = nxt->sequence - nxt_offset + 1;
-		tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
+		tmp = (long long)seq + count;
+		nxt_seq = nxt->sequence - nxt_offset;
+		tmp_nxt = (long long)nxt_seq + nxt_offset;
 
 		/*
 		 * cached_sq_head may overflow, and it will never overflow twice
 		 * once there is some timeout req still be valid.
 		 */
-		if (ctx->cached_sq_head < nxt_sq_head)
+		if (seq < nxt_seq)
 			tmp += UINT_MAX;
 
 		if (tmp > tmp_nxt)
@@ -5476,7 +5518,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt;
 
-	if (!(req->flags & REQ_F_LINK))
+	if (!(req->flags & REQ_F_LINK_HEAD))
 		return NULL;
 	/* for polled retry, if flag is set, we already went through here */
 	if (req->flags & REQ_F_POLLED)
@@ -5604,54 +5646,11 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 		io_queue_sqe(req, NULL);
 }
 
-#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
-				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
-				IOSQE_BUFFER_SELECT)
-
-static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			  struct io_submit_state *state, struct io_kiocb **link)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	unsigned int sqe_flags;
-	int ret, id, fd;
-
-	sqe_flags = READ_ONCE(sqe->flags);
-
-	/* enforce forwards compatibility on users */
-	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
-		ret = -EINVAL;
-		goto err_req;
-	}
-
-	if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
-	    !io_op_defs[req->opcode].buffer_select) {
-		ret = -EOPNOTSUPP;
-		goto err_req;
-	}
-
-	id = READ_ONCE(sqe->personality);
-	if (id) {
-		req->work.creds = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!req->work.creds)) {
-			ret = -EINVAL;
-			goto err_req;
-		}
-		get_cred(req->work.creds);
-	}
-
-	/* same numerical values with corresponding REQ_F_*, safe to copy */
-	req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
-					IOSQE_ASYNC | IOSQE_FIXED_FILE |
-					IOSQE_BUFFER_SELECT);
-
-	fd = READ_ONCE(sqe->fd);
-	ret = io_req_set_file(state, req, fd, sqe_flags);
-	if (unlikely(ret)) {
-err_req:
-		io_cqring_add_event(req, ret);
-		io_double_put_req(req);
-		return false;
-	}
+	int ret;
 
 	/*
 	 * If we already have a head request, queue this one for async
@@ -5670,42 +5669,39 @@ err_req:
 		 * next after the link request. The last one is done via
 		 * drain_next flag to persist the effect across calls.
 		 */
-		if (sqe_flags & IOSQE_IO_DRAIN) {
+		if (req->flags & REQ_F_IO_DRAIN) {
 			head->flags |= REQ_F_IO_DRAIN;
 			ctx->drain_next = 1;
 		}
-		if (io_alloc_async_ctx(req)) {
-			ret = -EAGAIN;
-			goto err_req;
-		}
+		if (io_alloc_async_ctx(req))
+			return -EAGAIN;
 
 		ret = io_req_defer_prep(req, sqe);
 		if (ret) {
 			/* fail even hard links since we don't submit */
 			head->flags |= REQ_F_FAIL_LINK;
-			goto err_req;
+			return ret;
 		}
 		trace_io_uring_link(ctx, req, head);
 		list_add_tail(&req->link_list, &head->link_list);
 
 		/* last request of a link, enqueue the link */
-		if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
+		if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
 			io_queue_link_head(head);
 			*link = NULL;
 		}
 	} else {
 		if (unlikely(ctx->drain_next)) {
 			req->flags |= REQ_F_IO_DRAIN;
-			req->ctx->drain_next = 0;
+			ctx->drain_next = 0;
 		}
-		if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
-			req->flags |= REQ_F_LINK;
+		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+			req->flags |= REQ_F_LINK_HEAD;
 			INIT_LIST_HEAD(&req->link_list);
 
-			if (io_alloc_async_ctx(req)) {
-				ret = -EAGAIN;
-				goto err_req;
-			}
+			if (io_alloc_async_ctx(req))
+				return -EAGAIN;
+
 			ret = io_req_defer_prep(req, sqe);
 			if (ret)
 				req->flags |= REQ_F_FAIL_LINK;
@@ -5715,7 +5711,7 @@ err_req:
 		}
 	}
 
-	return true;
+	return 0;
 }
 
 /*
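The io_timeout() hunk above keeps the insertion sort correct across u32 wrap-around: both the new request's target sequence (seq + count) and each queued neighbour's are widened to long long, and a full 2^32 period is added to the side that has wrapped relative to the other. A self-contained version of that comparison (it mirrors the hunk's tmp/tmp_nxt logic; the function name is mine):

#include <limits.h>
#include <stdint.h>

/*
 * Compare two timeout targets whose 32-bit sequences may have wrapped.
 * Positive result: (seq_a + count_a) fires after (seq_b + count_b).
 */
static long long timeout_cmp(uint32_t seq_a, uint32_t count_a,
			     uint32_t seq_b, uint32_t count_b)
{
	long long tmp = (long long)seq_a + count_a;
	long long tmp_nxt = (long long)seq_b + count_b;

	if (seq_a < seq_b)	/* a wrapped past b: one full period later */
		tmp += UINT_MAX;

	return tmp - tmp_nxt;
}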
@@ -5789,15 +5785,23 @@ static inline void io_consume_sqe(struct io_ring_ctx *ctx)
 	ctx->cached_sq_head++;
 }
 
-static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			const struct io_uring_sqe *sqe)
+#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
+				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+				IOSQE_BUFFER_SELECT)
+
+static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+		       const struct io_uring_sqe *sqe,
+		       struct io_submit_state *state, bool async)
 {
+	unsigned int sqe_flags;
+	int id, fd;
+
 	/*
 	 * All io need record the previous position, if LINK vs DRAIN,
 	 * it can be used to mark the position of the first IO in the
 	 * link list.
 	 */
-	req->sequence = ctx->cached_sq_head;
+	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
 	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;
@@ -5808,17 +5812,50 @@ static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	refcount_set(&req->refs, 2);
 	req->task = NULL;
 	req->result = 0;
+	req->needs_fixed_file = async;
 	INIT_IO_WORK(&req->work, io_wq_submit_work);
+
+	if (unlikely(req->opcode >= IORING_OP_LAST))
+		return -EINVAL;
+
+	if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+			return -EFAULT;
+		use_mm(ctx->sqo_mm);
+	}
+
+	sqe_flags = READ_ONCE(sqe->flags);
+	/* enforce forwards compatibility on users */
+	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
+		return -EINVAL;
+
+	if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+	    !io_op_defs[req->opcode].buffer_select)
+		return -EOPNOTSUPP;
+
+	id = READ_ONCE(sqe->personality);
+	if (id) {
+		req->work.creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->work.creds))
+			return -EINVAL;
+		get_cred(req->work.creds);
+	}
+
+	/* same numerical values with corresponding REQ_F_*, safe to copy */
+	req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
+					IOSQE_ASYNC | IOSQE_FIXED_FILE |
+					IOSQE_BUFFER_SELECT | IOSQE_IO_LINK);
+
+	fd = READ_ONCE(sqe->fd);
+	return io_req_set_file(state, req, fd, sqe_flags);
 }
 
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
-			  struct file *ring_file, int ring_fd,
-			  struct mm_struct **mm, bool async)
+			  struct file *ring_file, int ring_fd, bool async)
 {
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
 	int i, submitted = 0;
-	bool mm_fault = false;
 
 	/* if we have a backlog and couldn't flush it all, return BUSY */
 	if (test_bit(0, &ctx->sq_check_overflow)) {
@@ -5858,34 +5895,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			break;
 		}
 
-		io_init_req(ctx, req, sqe);
+		err = io_init_req(ctx, req, sqe, statep, async);
 		io_consume_sqe(ctx);
 		/* will complete beyond this point, count as submitted */
 		submitted++;
 
-		if (unlikely(req->opcode >= IORING_OP_LAST)) {
-			err = -EINVAL;
+		if (unlikely(err)) {
 fail_req:
 			io_cqring_add_event(req, err);
 			io_double_put_req(req);
 			break;
 		}
 
-		if (io_op_defs[req->opcode].needs_mm && !*mm) {
-			mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
-			if (unlikely(mm_fault)) {
-				err = -EFAULT;
-				goto fail_req;
-			}
-			use_mm(ctx->sqo_mm);
-			*mm = ctx->sqo_mm;
-		}
-
-		req->needs_fixed_file = async;
 		trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
 						true, async);
-		if (!io_submit_sqe(req, sqe, statep, &link))
-			break;
+		err = io_submit_sqe(req, sqe, statep, &link);
+		if (err)
+			goto fail_req;
 	}
 
 	if (unlikely(submitted != nr)) {
@@ -5904,10 +5930,19 @@ fail_req:
 	return submitted;
 }
 
+static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+	struct mm_struct *mm = current->mm;
+
+	if (mm) {
+		unuse_mm(mm);
+		mmput(mm);
+	}
+}
+
 static int io_sq_thread(void *data)
 {
 	struct io_ring_ctx *ctx = data;
-	struct mm_struct *cur_mm = NULL;
 	const struct cred *old_cred;
 	mm_segment_t old_fs;
 	DEFINE_WAIT(wait);
@@ -5948,11 +5983,7 @@ static int io_sq_thread(void *data)
 			 * adding ourselves to the waitqueue, as the unuse/drop
 			 * may sleep.
 			 */
-			if (cur_mm) {
-				unuse_mm(cur_mm);
-				mmput(cur_mm);
-				cur_mm = NULL;
-			}
+			io_sq_thread_drop_mm(ctx);
 
 			/*
 			 * We're polling. If we're within the defined idle
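io_sq_thread_drop_mm() works because io_init_req() now calls use_mm() itself, making ctx->sqo_mm visible as current->mm; teardown can then consult current->mm instead of threading a cur_mm pointer through io_submit_sqes(). The borrow/release pairing, modeled in userspace with a thread-local stand-in for current->mm (all names here are illustrative):

#include <stddef.h>

struct mm { int users; };
static _Thread_local struct mm *current_mm;	/* models current->mm */

static void use_mm(struct mm *mm)   { mm->users++; current_mm = mm; }
static void unuse_mm(struct mm *mm) { (void)mm; current_mm = NULL; }
static void mmput(struct mm *mm)    { mm->users--; }

/* Drop whatever mm this thread borrowed, if any; safe to call twice. */
static void sq_thread_drop_mm(void)
{
	struct mm *mm = current_mm;

	if (mm) {
		unuse_mm(mm);
		mmput(mm);
	}
}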
@@ -6016,7 +6047,7 @@ static int io_sq_thread(void *data)
 		}
 
 		mutex_lock(&ctx->uring_lock);
-		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+		ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
 		mutex_unlock(&ctx->uring_lock);
 		timeout = jiffies + ctx->sq_thread_idle;
 	}
@@ -6025,10 +6056,7 @@ static int io_sq_thread(void *data)
 		task_work_run();
 
 	set_fs(old_fs);
-	if (cur_mm) {
-		unuse_mm(cur_mm);
-		mmput(cur_mm);
-	}
+	io_sq_thread_drop_mm(ctx);
 	revert_creds(old_cred);
 
 	kthread_parkme();
@@ -7509,13 +7537,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			wake_up(&ctx->sqo_wait);
 		submitted = to_submit;
 	} else if (to_submit) {
-		struct mm_struct *cur_mm;
-
 		mutex_lock(&ctx->uring_lock);
-		/* already have mm, so io_submit_sqes() won't try to grab it */
-		cur_mm = ctx->sqo_mm;
-		submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
-					   &cur_mm, false);
+		submitted = io_submit_sqes(ctx, to_submit, f.file, fd, false);
 		mutex_unlock(&ctx->uring_lock);
 
 		if (submitted != to_submit)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f2dc35c22964..b8d78f393365 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2023,6 +2023,7 @@ lookup_again:
 			goto lookup_again;
 		}
 
+		spin_unlock(&ino->i_lock);
 		first = true;
 		status = nfs4_select_rw_stateid(ctx->state,
 					iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
@@ -2032,12 +2033,12 @@ lookup_again:
 			trace_pnfs_update_layout(ino, pos, count,
 					iomode, lo, lseg,
 					PNFS_UPDATE_LAYOUT_INVALID_OPEN);
-			spin_unlock(&ino->i_lock);
 			nfs4_schedule_stateid_recovery(server, ctx->state);
 			pnfs_clear_first_layoutget(lo);
 			pnfs_put_layout_hdr(lo);
 			goto lookup_again;
 		}
+		spin_lock(&ino->i_lock);
 	} else {
 		nfs4_stateid_copy(&stateid, &lo->plh_stateid);
 	}
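The pnfs.c fix is the classic "don't sleep under a spinlock" reshuffle: i_lock is now dropped before nfs4_select_rw_stateid(), which can block, and retaken on the success path, instead of being released only on the error path. The general shape in a userspace sketch, with a mutex standing in for the spinlock and a stub for the sleeping call:

#include <pthread.h>

static pthread_mutex_t i_lock = PTHREAD_MUTEX_INITIALIZER; /* models i_lock */

static int select_stateid_sleeping(void)
{
	return 0;	/* stub for the call that may block */
}

static int update_layout(void)
{
	int status;

	pthread_mutex_lock(&i_lock);
	/* ... inspect layout state under the lock ... */

	pthread_mutex_unlock(&i_lock);	/* drop BEFORE the blocking call */
	status = select_stateid_sleeping();
	if (status != 0)
		return status;		/* error path stays unlocked */

	pthread_mutex_lock(&i_lock);	/* success: retake and revalidate */
	/* ... the layout may have changed while unlocked; recheck it ... */
	pthread_mutex_unlock(&i_lock);
	return 0;
}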