diff options
Diffstat (limited to 'fs')
73 files changed, 1508 insertions, 750 deletions
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 4150280509ff..7503899c0a1b 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -136,6 +136,9 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr ASSERTCMP(d_inode(dentry), ==, NULL); + if (flags & LOOKUP_CREATE) + return ERR_PTR(-EOPNOTSUPP); + if (dentry->d_name.len >= AFSNAMEMAX) { _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index f532d6d3bd28..79bc5f1338ed 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -126,7 +126,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (src_as->cell) ctx->cell = afs_get_cell(src_as->cell); - if (size > PAGE_SIZE - 1) + if (size < 2 || size > PAGE_SIZE - 1) return -EINVAL; page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); @@ -140,7 +140,9 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) } buf = kmap(page); - ret = vfs_parse_fs_string(fc, "source", buf, size); + ret = -EINVAL; + if (buf[size - 1] == '.') + ret = vfs_parse_fs_string(fc, "source", buf, size - 1); kunmap(page); put_page(page); if (ret < 0) diff --git a/fs/afs/proc.c b/fs/afs/proc.c index fba2ec3a3a9c..468e1713bce1 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -213,13 +213,14 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) /* Display header on line 1 */ if (v == &cell->proc_volumes) { - seq_puts(m, "USE VID TY\n"); + seq_puts(m, "USE VID TY NAME\n"); return 0; } - seq_printf(m, "%3d %08llx %s\n", + seq_printf(m, "%3d %08llx %s %s\n", atomic_read(&vol->usage), vol->vid, - afs_vol_types[vol->type]); + afs_vol_types[vol->type], + vol->name); return 0; } diff --git a/fs/afs/server.c b/fs/afs/server.c index 1686bf188ccd..b7f3cb2130ca 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -32,18 +32,11 @@ static void afs_dec_servers_outstanding(struct afs_net *net) struct afs_server *afs_find_server(struct afs_net *net, const struct sockaddr_rxrpc *srx) { - const struct sockaddr_in6 *a = &srx->transport.sin6, *b; const struct afs_addr_list *alist; struct afs_server *server = NULL; unsigned int i; - bool ipv6 = true; int seq = 0, diff; - if (srx->transport.sin6.sin6_addr.s6_addr32[0] == 0 || - srx->transport.sin6.sin6_addr.s6_addr32[1] == 0 || - srx->transport.sin6.sin6_addr.s6_addr32[2] == htonl(0xffff)) - ipv6 = false; - rcu_read_lock(); do { @@ -52,7 +45,8 @@ struct afs_server *afs_find_server(struct afs_net *net, server = NULL; read_seqbegin_or_lock(&net->fs_addr_lock, &seq); - if (ipv6) { + if (srx->transport.family == AF_INET6) { + const struct sockaddr_in6 *a = &srx->transport.sin6, *b; hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { alist = rcu_dereference(server->addresses); for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { @@ -68,15 +62,16 @@ struct afs_server *afs_find_server(struct afs_net *net, } } } else { + const struct sockaddr_in *a = &srx->transport.sin, *b; hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) { alist = rcu_dereference(server->addresses); for (i = 0; i < alist->nr_ipv4; i++) { - b = &alist->addrs[i].transport.sin6; - diff = ((u16 __force)a->sin6_port - - (u16 __force)b->sin6_port); + b = &alist->addrs[i].transport.sin; + diff = ((u16 __force)a->sin_port - + (u16 __force)b->sin_port); if (diff == 0) - diff = ((u32 __force)a->sin6_addr.s6_addr32[3] - - (u32 __force)b->sin6_addr.s6_addr32[3]); + diff = ((u32 __force)a->sin_addr.s_addr - + (u32 __force)b->sin_addr.s_addr); if (diff == 0) goto found; } diff --git a/fs/afs/super.c b/fs/afs/super.c index 488641b1a418..7f8a9b3137bf 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -404,6 +404,7 @@ static int afs_test_super(struct super_block *sb, struct fs_context *fc) return (as->net_ns == fc->net_ns && as->volume && as->volume->vid == ctx->volume->vid && + as->cell == ctx->cell && !as->dyn_root); } @@ -448,7 +449,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) /* allocate the root inode and dentry */ if (as->dyn_root) { inode = afs_iget_pseudo_dir(sb, true); - sb->s_flags |= SB_RDONLY; } else { sprintf(sb->s_id, "%llu", as->volume->vid); afs_activate_volume(as->volume); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 75b6d10c9845..575636f6491e 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -7,6 +7,7 @@ config BTRFS_FS select LIBCRC32C select CRYPTO_XXHASH select CRYPTO_SHA256 + select CRYPTO_BLAKE2B select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5b6e86aaf2e1..24658b5a5787 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -379,7 +379,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); tm = rb_entry(node, struct tree_mod_elem, node); - if (tm->seq > min_seq) + if (tm->seq >= min_seq) continue; rb_erase(node, tm_root); kfree(tm); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b2e8fd8a8e59..54efb21c2727 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2787,7 +2787,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( /* file-item.c */ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); + struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 153f71a5bba9..274318e9114e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1869,8 +1869,8 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes, 1); if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info, head->bytenr, - head->num_bytes); + ret = btrfs_del_csums(trans, fs_info->csum_root, + head->bytenr, head->num_bytes); } } @@ -3175,7 +3175,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (is_data) { - ret = btrfs_del_csums(trans, info, bytenr, num_bytes); + ret = btrfs_del_csums(trans, info->csum_root, bytenr, + num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3799,6 +3800,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 flags, int delalloc) { int ret = 0; + int cache_block_group_error = 0; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group *block_group = NULL; struct find_free_extent_ctl ffe_ctl = {0}; @@ -3958,7 +3960,20 @@ have_block_group: if (unlikely(!ffe_ctl.cached)) { ffe_ctl.have_caching_bg = true; ret = btrfs_cache_block_group(block_group, 0); - BUG_ON(ret < 0); + + /* + * If we get ENOMEM here or something else we want to + * try other block groups, because it may not be fatal. + * However if we can't find anything else we need to + * save our return here so that we return the actual + * error that caused problems, not ENOSPC. + */ + if (ret < 0) { + if (!cache_block_group_error) + cache_block_group_error = ret; + ret = 0; + goto loop; + } ret = 0; } @@ -4045,7 +4060,7 @@ loop: if (ret > 0) goto search; - if (ret == -ENOSPC) { + if (ret == -ENOSPC && !cache_block_group_error) { /* * Use ffe_ctl->total_free_space as fallback if we can't find * any contiguous hole. @@ -4056,6 +4071,8 @@ loop: space_info->max_extent_size = ffe_ctl.max_extent_size; spin_unlock(&space_info->lock); ins->offset = ffe_ctl.max_extent_size; + } else if (ret == -ENOSPC) { + ret = cache_block_group_error; } return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eb8bd0258360..2f4802f405a2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5074,12 +5074,14 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return eb; eb = alloc_dummy_extent_buffer(fs_info, start); if (!eb) - return NULL; + return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: ret = radix_tree_preload(GFP_NOFS); - if (ret) + if (ret) { + exists = ERR_PTR(ret); goto free_eb; + } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, start >> PAGE_SHIFT, eb); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 3270a40b0777..b1bfdc5c1387 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -590,9 +590,9 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, * range of bytes. */ int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 len) + struct btrfs_root *root, u64 bytenr, u64 len) { - struct btrfs_root *root = fs_info->csum_root; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path; struct btrfs_key key; u64 end_byte = bytenr + len; @@ -602,6 +602,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int blocksize_bits = fs_info->sb->s_blocksize_bits; + ASSERT(root == fs_info->csum_root || + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0cb43b682789..8d47c76b7bd1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2599,8 +2599,8 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info) { - u64 clone_len = drop_end - cur_offset; + if (clone_info && drop_end > clone_info->file_offset) { + u64 clone_len = drop_end - clone_info->file_offset; ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, clone_len); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 56032c518b26..e3c76645cad7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5728,7 +5728,6 @@ static void inode_tree_add(struct inode *inode) static void inode_tree_del(struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; int empty = 0; @@ -5741,7 +5740,6 @@ static void inode_tree_del(struct inode *inode) spin_unlock(&root->inode_lock); if (empty && btrfs_root_refs(&root->root_item) == 0) { - synchronize_srcu(&fs_info->subvol_srcu); spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree); spin_unlock(&root->inode_lock); @@ -9556,9 +9554,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, btrfs_init_log_ctx(&ctx_dest, new_inode); /* close the race window with snapshot create/destroy ioctl */ - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) - down_read(&fs_info->subvol_sem); - if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + if (old_ino == BTRFS_FIRST_FREE_OBJECTID || + new_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&fs_info->subvol_sem); /* @@ -9792,9 +9789,8 @@ out_fail: ret = ret ? ret : ret2; } out_notrans: - if (new_ino == BTRFS_FIRST_FREE_OBJECTID) - up_read(&fs_info->subvol_sem); - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + if (new_ino == BTRFS_FIRST_FREE_OBJECTID || + old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); ASSERT(list_empty(&ctx_root.list)); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a1ee0b775e65..18e328ce4b54 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -704,11 +704,17 @@ static noinline int create_subvol(struct inode *dir, btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, btrfs_ino(BTRFS_I(dir)), index, name, namelen); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); @@ -3720,24 +3726,18 @@ process_slot: ret = 0; if (last_dest_end < destoff + len) { - struct btrfs_clone_extent_info clone_info = { 0 }; /* - * We have an implicit hole (NO_HOLES feature is enabled) that - * fully or partially overlaps our cloning range at its end. + * We have an implicit hole that fully or partially overlaps our + * cloning range at its end. This means that we either have the + * NO_HOLES feature enabled or the implicit hole happened due to + * mixing buffered and direct IO writes against this file. */ btrfs_release_path(path); path->leave_spinning = 0; - /* - * We are dealing with a hole and our clone_info already has a - * disk_offset of 0, we only need to fill the data length and - * file offset. - */ - clone_info.data_len = destoff + len - last_dest_end; - clone_info.file_offset = last_dest_end; ret = btrfs_punch_hole_range(inode, path, last_dest_end, destoff + len - 1, - &clone_info, &trans); + NULL, &trans); if (ret) goto out; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 93aeb2e539a4..d4282e12f2a6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3232,12 +3232,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { btrfs_warn(fs_info, - "qgroup rescan init failed, qgroup is not enabled"); + "qgroup rescan init failed, qgroup rescan is not queued"); ret = -EINVAL; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { btrfs_warn(fs_info, - "qgroup rescan init failed, qgroup rescan is not queued"); + "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d897a8e5e430..c58245797f30 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4552,6 +4552,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root = read_fs_root(fs_info, reloc_root->root_key.offset); if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); + list_add_tail(&reloc_root->root_list, &reloc_roots); goto out_free; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ae2db5eb1549..091e5bc8c7ea 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7084,12 +7084,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) spin_unlock(&send_root->root_item_lock); /* - * This is done when we lookup the root, it should already be complete - * by the time we get here. - */ - WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); - - /* * Userspace tools do the checks and warn the user if it's * not RO. */ diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 1a846bf6e197..914eea5ba6a7 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -452,9 +452,9 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, root->fs_info->tree_root = root; root->node = alloc_test_extent_buffer(root->fs_info, nodesize); - if (!root->node) { + if (IS_ERR(root->node)) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); - ret = -ENOMEM; + ret = PTR_ERR(root->node); goto out; } btrfs_set_header_level(root->node, 0); diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 09aaca1efd62..ac035a6fa003 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -484,9 +484,9 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) * *cough*backref walking code*cough* */ root->node = alloc_test_extent_buffer(root->fs_info, nodesize); - if (!root->node) { + if (IS_ERR(root->node)) { test_err("couldn't allocate dummy buffer"); - ret = -ENOMEM; + ret = PTR_ERR(root->node); goto out; } btrfs_set_header_level(root->node, 0); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 493d4d9e0f79..97f3520b8d98 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -227,7 +227,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, */ if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) { file_extent_err(leaf, slot, - "invalid item size, have %u expect [%lu, %u)", + "invalid item size, have %u expect [%zu, %u)", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START, SZ_4K); return -EUCLEAN; @@ -332,7 +332,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, } static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, - int slot) + int slot, struct btrfs_key *prev_key) { struct btrfs_fs_info *fs_info = leaf->fs_info; u32 sectorsize = fs_info->sectorsize; @@ -356,6 +356,20 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, btrfs_item_size_nr(leaf, slot), csumsize); return -EUCLEAN; } + if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) { + u64 prev_csum_end; + u32 prev_item_size; + + prev_item_size = btrfs_item_size_nr(leaf, slot - 1); + prev_csum_end = (prev_item_size / csumsize) * sectorsize; + prev_csum_end += prev_key->offset; + if (prev_csum_end > key->offset) { + generic_err(leaf, slot - 1, +"csum end range (%llu) goes beyond the start range (%llu) of the next csum item", + prev_csum_end, key->offset); + return -EUCLEAN; + } + } return 0; } @@ -1355,7 +1369,7 @@ static int check_leaf_item(struct extent_buffer *leaf, ret = check_extent_data_item(leaf, key, slot, prev_key); break; case BTRFS_EXTENT_CSUM_KEY: - ret = check_csum_item(leaf, key, slot); + ret = check_csum_item(leaf, key, slot, prev_key); break; case BTRFS_DIR_ITEM_KEY: case BTRFS_DIR_INDEX_KEY: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6f757361db53..d3f115909ff0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -808,7 +808,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_del_csums(trans, fs_info, + ret = btrfs_del_csums(trans, + fs_info->csum_root, sums->bytenr, sums->len); if (!ret) @@ -3909,6 +3910,28 @@ static int log_inode_item(struct btrfs_trans_handle *trans, return 0; } +static int log_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *log_root, + struct btrfs_ordered_sum *sums) +{ + int ret; + + /* + * Due to extent cloning, we might have logged a csum item that covers a + * subrange of a cloned extent, and later we can end up logging a csum + * item for a larger subrange of the same extent or the entire range. + * This would leave csum items in the log tree that cover the same range + * and break the searches for checksums in the log tree, resulting in + * some checksums missing in the fs/subvolume tree. So just delete (or + * trim and adjust) any existing csum items in the log for this range. + */ + ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); + if (ret) + return ret; + + return btrfs_csum_file_blocks(trans, log_root, sums); +} + static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *dst_path, @@ -4054,7 +4077,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_csum_file_blocks(trans, log, sums); + ret = log_csums(trans, log, sums); list_del(&sums->list); kfree(sums); } @@ -4274,7 +4297,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_csum_file_blocks(trans, log_root, sums); + ret = log_csums(trans, log_root, sums); list_del(&sums->list); kfree(sums); } @@ -6294,9 +6317,28 @@ again: wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); + + /* + * We didn't find the subvol, likely because it was + * deleted. This is ok, simply skip this log and go to + * the next one. + * + * We need to exclude the root because we can't have + * other log replays overwriting this log as we'll read + * it back in a few more times. This will keep our + * block from being modified, and we'll just bail for + * each subsequent pass. + */ + if (ret == -ENOENT) + ret = btrfs_pin_extent_for_log_replay(fs_info, + log->node->start, + log->node->len); free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); + + if (!ret) + goto next; btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root for tree log recovery."); goto error; @@ -6328,7 +6370,6 @@ again: &root->highest_objectid); } - key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); free_extent_buffer(log->commit_root); @@ -6336,9 +6377,10 @@ again: if (ret) goto error; - +next: if (found_key.offset == 0) break; + key.offset = found_key.offset - 1; } btrfs_release_path(path); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 91caab63bdf5..76b84f2397b1 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -324,6 +324,8 @@ again_search_slot: } if (ret < 0 && ret != -ENOENT) goto out; + key.offset++; + goto again_search_slot; } item_size -= sizeof(subid_le); offset += sizeof(subid_le); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d8e5560db285..a6d3f08bfff3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -61,7 +61,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID1C3] = { .sub_stripes = 1, .dev_stripes = 1, - .devs_max = 0, + .devs_max = 3, .devs_min = 3, .tolerated_failures = 2, .devs_increment = 3, @@ -73,7 +73,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID1C4] = { .sub_stripes = 1, .dev_stripes = 1, - .devs_max = 0, + .devs_max = 4, .devs_min = 4, .tolerated_failures = 3, .devs_increment = 4, diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f5a38910a82b..9d09bb53c1ab 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1011,18 +1011,13 @@ static int __ceph_is_single_caps(struct ceph_inode_info *ci) return rb_first(&ci->i_caps) == rb_last(&ci->i_caps); } -static int __ceph_is_any_caps(struct ceph_inode_info *ci) -{ - return !RB_EMPTY_ROOT(&ci->i_caps); -} - int ceph_is_any_caps(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); int ret; spin_lock(&ci->i_ceph_lock); - ret = __ceph_is_any_caps(ci); + ret = __ceph_is_any_real_caps(ci); spin_unlock(&ci->i_ceph_lock); return ret; @@ -1099,15 +1094,16 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) if (removed) ceph_put_cap(mdsc, cap); - /* when reconnect denied, we remove session caps forcibly, - * i_wr_ref can be non-zero. If there are ongoing write, - * keep i_snap_realm. - */ - if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm) - drop_inode_snap_realm(ci); + if (!__ceph_is_any_real_caps(ci)) { + /* when reconnect denied, we remove session caps forcibly, + * i_wr_ref can be non-zero. If there are ongoing write, + * keep i_snap_realm. + */ + if (ci->i_wr_ref == 0 && ci->i_snap_realm) + drop_inode_snap_realm(ci); - if (!__ceph_is_any_real_caps(ci)) __cap_delay_cancel(mdsc, ci); + } } struct cap_msg_args { @@ -2764,7 +2760,19 @@ int ceph_get_caps(struct file *filp, int need, int want, if (ret == -EAGAIN) continue; if (!ret) { + struct ceph_mds_client *mdsc = fsc->mdsc; + struct cap_wait cw; DEFINE_WAIT_FUNC(wait, woken_wake_function); + + cw.ino = inode->i_ino; + cw.tgid = current->tgid; + cw.need = need; + cw.want = want; + + spin_lock(&mdsc->caps_list_lock); + list_add(&cw.list, &mdsc->cap_wait_list); + spin_unlock(&mdsc->caps_list_lock); + add_wait_queue(&ci->i_cap_wq, &wait); flags |= NON_BLOCKING; @@ -2778,6 +2786,11 @@ int ceph_get_caps(struct file *filp, int need, int want, } remove_wait_queue(&ci->i_cap_wq, &wait); + + spin_lock(&mdsc->caps_list_lock); + list_del(&cw.list); + spin_unlock(&mdsc->caps_list_lock); + if (ret == -EAGAIN) continue; } @@ -2928,7 +2941,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) ci->i_head_snapc = NULL; } /* see comment in __ceph_remove_cap() */ - if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) + if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) drop_inode_snap_realm(ci); } spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index facb387c2735..c281f32b54f7 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -139,6 +139,7 @@ static int caps_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; int total, avail, used, reserved, min, i; + struct cap_wait *cw; ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); seq_printf(s, "total\t\t%d\n" @@ -166,6 +167,18 @@ static int caps_show(struct seq_file *s, void *p) } mutex_unlock(&mdsc->mutex); + seq_printf(s, "\n\nWaiters:\n--------\n"); + seq_printf(s, "tgid ino need want\n"); + seq_printf(s, "-----------------------------------------------------\n"); + + spin_lock(&mdsc->caps_list_lock); + list_for_each_entry(cw, &mdsc->cap_wait_list, list) { + seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino, + ceph_cap_string(cw->need), + ceph_cap_string(cw->want)); + } + spin_unlock(&mdsc->caps_list_lock); + return 0; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 068b029cf073..374db1bd57d1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2015,7 +2015,7 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) if (!nr) return; val = atomic_add_return(nr, &mdsc->cap_reclaim_pending); - if (!(val % CEPH_CAPS_PER_RELEASE)) { + if ((val % CEPH_CAPS_PER_RELEASE) < nr) { atomic_set(&mdsc->cap_reclaim_pending, 0); ceph_queue_cap_reclaim_work(mdsc); } @@ -2032,12 +2032,13 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; size_t size = sizeof(struct ceph_mds_reply_dir_entry); - int order, num_entries; + unsigned int num_entries; + int order; spin_lock(&ci->i_ceph_lock); num_entries = ci->i_files + ci->i_subdirs; spin_unlock(&ci->i_ceph_lock); - num_entries = max(num_entries, 1); + num_entries = max(num_entries, 1U); num_entries = min(num_entries, opt->max_readdir); order = get_order(size * num_entries); @@ -4168,6 +4169,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; INIT_LIST_HEAD(&mdsc->cap_delay_list); + INIT_LIST_HEAD(&mdsc->cap_wait_list); spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 5cd131b41d84..14c7e8c49970 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -340,6 +340,14 @@ struct ceph_quotarealm_inode { struct inode *inode; }; +struct cap_wait { + struct list_head list; + unsigned long ino; + pid_t tgid; + int need; + int want; +}; + /* * mds client state */ @@ -416,6 +424,7 @@ struct ceph_mds_client { spinlock_t caps_list_lock; struct list_head caps_list; /* unused (reserved or unreserved) */ + struct list_head cap_wait_list; int caps_total_count; /* total caps allocated */ int caps_use_count; /* in use */ int caps_use_max; /* max used caps */ diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index aeec1d6e3769..471bac335fae 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -158,6 +158,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) void *pexport_targets = NULL; struct ceph_timespec laggy_since; struct ceph_mds_info *info; + bool laggy; ceph_decode_need(p, end, sizeof(u64) + 1, bad); global_id = ceph_decode_64(p); @@ -190,6 +191,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) if (err) goto corrupt; ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; @@ -207,10 +209,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) *p = info_end; } - dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n", i+1, n, global_id, mds, inc, ceph_pr_addr(&addr), - ceph_mds_state_name(state)); + ceph_mds_state_name(state), + laggy ? "(laggy)" : ""); if (mds < 0 || state <= 0) continue; @@ -230,8 +233,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->global_id = global_id; info->state = state; info->addr = addr; - info->laggy = (laggy_since.tv_sec != 0 || - laggy_since.tv_nsec != 0); + info->laggy = laggy; info->num_export_targets = num_export_targets; if (num_export_targets) { info->export_targets = kcalloc(num_export_targets, @@ -355,6 +357,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_damaged = false; } bad_ext: + dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", + !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9c9a7c68eea3..29a795f975df 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -172,10 +172,10 @@ static const struct fs_parameter_enum ceph_mount_param_enums[] = { static const struct fs_parameter_spec ceph_mount_param_specs[] = { fsparam_flag_no ("acl", Opt_acl), fsparam_flag_no ("asyncreaddir", Opt_asyncreaddir), - fsparam_u32 ("caps_max", Opt_caps_max), + fsparam_s32 ("caps_max", Opt_caps_max), fsparam_u32 ("caps_wanted_delay_max", Opt_caps_wanted_delay_max), fsparam_u32 ("caps_wanted_delay_min", Opt_caps_wanted_delay_min), - fsparam_s32 ("write_congestion_kb", Opt_congestion_kb), + fsparam_u32 ("write_congestion_kb", Opt_congestion_kb), fsparam_flag_no ("copyfrom", Opt_copyfrom), fsparam_flag_no ("dcache", Opt_dcache), fsparam_flag_no ("dirstat", Opt_dirstat), @@ -187,8 +187,8 @@ static const struct fs_parameter_spec ceph_mount_param_specs[] = { fsparam_flag_no ("quotadf", Opt_quotadf), fsparam_u32 ("rasize", Opt_rasize), fsparam_flag_no ("rbytes", Opt_rbytes), - fsparam_s32 ("readdir_max_bytes", Opt_readdir_max_bytes), - fsparam_s32 ("readdir_max_entries", Opt_readdir_max_entries), + fsparam_u32 ("readdir_max_bytes", Opt_readdir_max_bytes), + fsparam_u32 ("readdir_max_entries", Opt_readdir_max_entries), fsparam_enum ("recover_session", Opt_recover_session), fsparam_flag_no ("require_active_mds", Opt_require_active_mds), fsparam_u32 ("rsize", Opt_rsize), @@ -328,7 +328,9 @@ static int ceph_parse_mount_param(struct fs_context *fc, fsopt->caps_wanted_delay_max = result.uint_32; break; case Opt_caps_max: - fsopt->caps_max = result.uint_32; + if (result.int_32 < 0) + goto out_of_range; + fsopt->caps_max = result.int_32; break; case Opt_readdir_max_entries: if (result.uint_32 < 1) @@ -547,25 +549,25 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_show_option(m, "recover_session", "clean"); if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) - seq_printf(m, ",wsize=%d", fsopt->wsize); + seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) - seq_printf(m, ",rsize=%d", fsopt->rsize); + seq_printf(m, ",rsize=%u", fsopt->rsize); if (fsopt->rasize != CEPH_RASIZE_DEFAULT) - seq_printf(m, ",rasize=%d", fsopt->rasize); + seq_printf(m, ",rasize=%u", fsopt->rasize); if (fsopt->congestion_kb != default_congestion_kb()) - seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + seq_printf(m, ",write_congestion_kb=%u", fsopt->congestion_kb); if (fsopt->caps_max) seq_printf(m, ",caps_max=%d", fsopt->caps_max); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) - seq_printf(m, ",caps_wanted_delay_min=%d", + seq_printf(m, ",caps_wanted_delay_min=%u", fsopt->caps_wanted_delay_min); if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) - seq_printf(m, ",caps_wanted_delay_max=%d", + seq_printf(m, ",caps_wanted_delay_max=%u", fsopt->caps_wanted_delay_max); if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) - seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); + seq_printf(m, ",readdir_max_entries=%u", fsopt->max_readdir); if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) - seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); + seq_printf(m, ",readdir_max_bytes=%u", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_show_option(m, "snapdirname", fsopt->snapdir_name); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f0f9cb7447ac..3bf1a01cd536 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -73,16 +73,16 @@ #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ struct ceph_mount_options { - int flags; + unsigned int flags; - int wsize; /* max write size */ - int rsize; /* max read size */ - int rasize; /* max readahead */ - int congestion_kb; /* max writeback in flight */ - int caps_wanted_delay_min, caps_wanted_delay_max; + unsigned int wsize; /* max write size */ + unsigned int rsize; /* max read size */ + unsigned int rasize; /* max readahead */ + unsigned int congestion_kb; /* max writeback in flight */ + unsigned int caps_wanted_delay_min, caps_wanted_delay_max; int caps_max; - int max_readdir; /* max readdir result (entires) */ - int max_readdir_bytes; /* max readdir result (bytes) */ + unsigned int max_readdir; /* max readdir result (entries) */ + unsigned int max_readdir_bytes; /* max readdir result (bytes) */ /* * everything above this point can be memcmp'd; everything below diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index fd0262ce5ad5..ce9bac756c2a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1061,7 +1061,7 @@ cap_unix(struct cifs_ses *ses) struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ bool file_all_info_is_valid:1; - + bool has_lease:1; struct kref refcount; struct cifs_fid *fid; struct mutex fid_mutex; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4f554f019a98..cc86a67225d1 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -42,6 +42,7 @@ #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" +#include "smb2proto.h" #include "fscache.h" #include "smbdirect.h" #ifdef CONFIG_CIFS_DFS_UPCALL @@ -112,6 +113,8 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) mutex_lock(&tcon->crfid.fid_mutex); tcon->crfid.is_valid = false; + /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ + close_shroot_lease_locked(&tcon->crfid); memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); mutex_unlock(&tcon->crfid.fid_mutex); diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 18c7a33adceb..5ef5e97a6d13 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -95,6 +95,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, goto finished; } + memset(&oparms, 0, sizeof(struct cifs_open_parms)); oparms.tcon = tcon; oparms.desired_access = desired_access; oparms.disposition = create_disposition; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index a5c96bc522cb..6250370c1170 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -616,6 +616,7 @@ smb2_close_cached_fid(struct kref *ref) cfid->fid->volatile_fid); cfid->is_valid = false; cfid->file_all_info_is_valid = false; + cfid->has_lease = false; } } @@ -626,13 +627,28 @@ void close_shroot(struct cached_fid *cfid) mutex_unlock(&cfid->fid_mutex); } +void close_shroot_lease_locked(struct cached_fid *cfid) +{ + if (cfid->has_lease) { + cfid->has_lease = false; + kref_put(&cfid->refcount, smb2_close_cached_fid); + } +} + +void close_shroot_lease(struct cached_fid *cfid) +{ + mutex_lock(&cfid->fid_mutex); + close_shroot_lease_locked(cfid); + mutex_unlock(&cfid->fid_mutex); +} + void smb2_cached_lease_break(struct work_struct *work) { struct cached_fid *cfid = container_of(work, struct cached_fid, lease_break); - close_shroot(cfid); + close_shroot_lease(cfid); } /* @@ -773,6 +789,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) /* BB TBD check to see if oplock level check can be removed below */ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { kref_get(&tcon->crfid.refcount); + tcon->crfid.has_lease = true; smb2_parse_contexts(server, o_rsp, &oparms.fid->epoch, oparms.fid->lease_key, &oplock, NULL); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0ab6b1200288..9434f6dd8df3 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1847,7 +1847,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) return 0; - close_shroot(&tcon->crfid); + close_shroot_lease(&tcon->crfid); rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req, &total_len); diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index a18272c987fe..27d29f2eb6c8 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -70,6 +70,8 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid); extern void close_shroot(struct cached_fid *cfid); +extern void close_shroot_lease(struct cached_fid *cfid); +extern void close_shroot_lease_locked(struct cached_fid *cfid); extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src); extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 040df1f5e1c8..40cca351273f 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -151,7 +151,7 @@ static struct key *search_fscrypt_keyring(struct key *keyring, } #define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \ - (CONST_STRLEN("fscrypt-") + FIELD_SIZEOF(struct super_block, s_id)) + (CONST_STRLEN("fscrypt-") + sizeof_field(struct super_block, s_id)) #define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1) diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index a13a78725c57..b766c3ee5fa8 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -649,6 +649,8 @@ ssize_t erofs_listxattr(struct dentry *dentry, struct listxattr_iter it; ret = init_inode_xattrs(d_inode(dentry)); + if (ret == -ENOATTR) + return 0; if (ret) return ret; diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index d4d4fdfac1a6..1ee04e76bbe0 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -133,10 +133,13 @@ static void debug_print_tree(struct ext4_sb_info *sbi) { struct rb_node *node; struct ext4_system_zone *entry; + struct ext4_system_blocks *system_blks; int first = 1; printk(KERN_INFO "System zones: "); - node = rb_first(&sbi->system_blks->root); + rcu_read_lock(); + system_blks = rcu_dereference(sbi->system_blks); + node = rb_first(&system_blks->root); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", @@ -144,6 +147,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi) first = 0; node = rb_next(node); } + rcu_read_unlock(); printk(KERN_CONT "\n"); } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 9fdd2b269d61..9f00fc0bf21d 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -72,6 +72,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); + const int next_offset = ((char *) de - buf) + rlen; if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; @@ -79,8 +80,11 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (unlikely(((char *) de - buf) + rlen > size)) + else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; + else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) && + next_offset != size)) + error_msg = "directory entry too close to block end"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index dc333e8e51e8..8ca4a23129aa 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -921,8 +921,8 @@ repeat_in_this_group: if (!handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, - handle_type, nblocks, - 0, 0); + handle_type, nblocks, 0, + ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c index 92a9da1774aa..bbce1c328d85 100644 --- a/fs/ext4/inode-test.c +++ b/fs/ext4/inode-test.c @@ -25,7 +25,7 @@ * For constructing the negative timestamp lower bound value. * binary: 10000000 00000000 00000000 00000000 */ -#define LOWER_MSB_1 (-0x80000000L) +#define LOWER_MSB_1 (-(UPPER_MSB_0) - 1L) /* avoid overflow */ /* * For constructing the negative timestamp upper bound value. * binary: 11111111 11111111 11111111 11111111 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 28f28de0c1b6..629a25d999f0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5692,7 +5692,7 @@ int ext4_expand_extra_isize(struct inode *inode, error = ext4_journal_get_write_access(handle, iloc->bh); if (error) { brelse(iloc->bh); - goto out_stop; + goto out_unlock; } error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, @@ -5702,8 +5702,8 @@ int ext4_expand_extra_isize(struct inode *inode, if (!error) error = rc; +out_unlock: ext4_write_unlock_xattr(inode, &no_expand); -out_stop: ext4_journal_stop(handle); return error; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a856997d87b5..1cb42d940784 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2164,7 +2164,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct super_block *sb; +#ifdef CONFIG_UNICODE struct ext4_sb_info *sbi; +#endif struct ext4_filename fname; int retval; int dx_fallback=0; @@ -2176,12 +2178,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; - sbi = EXT4_SB(sb); blocksize = sb->s_blocksize; if (!dentry->d_name.len) return -EINVAL; #ifdef CONFIG_UNICODE + sbi = EXT4_SB(sb); if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) && sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name)) return -EINVAL; @@ -2822,7 +2824,7 @@ bool ext4_empty_dir(struct inode *inode) { unsigned int offset; struct buffer_head *bh; - struct ext4_dir_entry_2 *de, *de1; + struct ext4_dir_entry_2 *de; struct super_block *sb; if (ext4_has_inline_data(inode)) { @@ -2847,19 +2849,25 @@ bool ext4_empty_dir(struct inode *inode) return true; de = (struct ext4_dir_entry_2 *) bh->b_data; - de1 = ext4_next_entry(de, sb->s_blocksize); - if (le32_to_cpu(de->inode) != inode->i_ino || - le32_to_cpu(de1->inode) == 0 || - strcmp(".", de->name) || strcmp("..", de1->name)) { - ext4_warning_inode(inode, "directory missing '.' and/or '..'"); + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, + 0) || + le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { + ext4_warning_inode(inode, "directory missing '.'"); + brelse(bh); + return true; + } + offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); + de = ext4_next_entry(de, sb->s_blocksize); + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, + offset) || + le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { + ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); return true; } - offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + - ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); - de = ext4_next_entry(de1, sb->s_blocksize); + offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); while (offset < inode->i_size) { - if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + if (!(offset & (sb->s_blocksize - 1))) { unsigned int lblock; brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); @@ -2870,12 +2878,11 @@ bool ext4_empty_dir(struct inode *inode) } if (IS_ERR(bh)) return true; - de = (struct ext4_dir_entry_2 *) bh->b_data; } + de = (struct ext4_dir_entry_2 *) (bh->b_data + + (offset & (sb->s_blocksize - 1))); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset)) { - de = (struct ext4_dir_entry_2 *)(bh->b_data + - sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; continue; } @@ -2884,7 +2891,6 @@ bool ext4_empty_dir(struct inode *inode) return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); - de = ext4_next_entry(de, sb->s_blocksize); } brelse(bh); return true; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1d82b56d9b11..2937a8873fe1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1900,6 +1900,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_commit_interval = HZ * arg; } else if (token == Opt_debug_want_extra_isize) { + if ((arg & 1) || + (arg < 4) || + (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) { + ext4_msg(sb, KERN_ERR, + "Invalid want_extra_isize %d", arg); + return -1; + } sbi->s_want_extra_isize = arg; } else if (token == Opt_max_batch_time) { sbi->s_max_batch_time = arg; @@ -3554,40 +3561,6 @@ int ext4_calculate_overhead(struct super_block *sb) return 0; } -static void ext4_clamp_want_extra_isize(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - unsigned def_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; - - if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) { - sbi->s_want_extra_isize = 0; - return; - } - if (sbi->s_want_extra_isize < 4) { - sbi->s_want_extra_isize = def_extra_isize; - if (ext4_has_feature_extra_isize(sb)) { - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_want_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_want_extra_isize); - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_min_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_min_extra_isize); - } - } - /* Check if enough inode space is available */ - if ((sbi->s_want_extra_isize > sbi->s_inode_size) || - (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > - sbi->s_inode_size)) { - sbi->s_want_extra_isize = def_extra_isize; - ext4_msg(sb, KERN_INFO, - "required extra inode space not available"); - } -} - static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; @@ -3795,6 +3768,68 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { + sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { + ext4_msg(sb, KERN_ERR, "invalid first ino: %u", + sbi->s_first_ino); + goto failed_mount; + } + if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || + (!is_power_of_2(sbi->s_inode_size)) || + (sbi->s_inode_size > blocksize)) { + ext4_msg(sb, KERN_ERR, + "unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + /* + * i_atime_extra is the last extra field available for + * [acm]times in struct ext4_inode. Checking for that + * field should suffice to ensure we have extra space + * for all three. + */ + if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { + sb->s_time_gran = 1; + sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; + } else { + sb->s_time_gran = NSEC_PER_SEC; + sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; + } + sb->s_time_min = EXT4_TIMESTAMP_MIN; + } + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (ext4_has_feature_extra_isize(sb)) { + unsigned v, max = (sbi->s_inode_size - + EXT4_GOOD_OLD_INODE_SIZE); + + v = le16_to_cpu(es->s_want_extra_isize); + if (v > max) { + ext4_msg(sb, KERN_ERR, + "bad s_want_extra_isize: %d", v); + goto failed_mount; + } + if (sbi->s_want_extra_isize < v) + sbi->s_want_extra_isize = v; + + v = le16_to_cpu(es->s_min_extra_isize); + if (v > max) { + ext4_msg(sb, KERN_ERR, + "bad s_min_extra_isize: %d", v); + goto failed_mount; + } + if (sbi->s_want_extra_isize < v) + sbi->s_want_extra_isize = v; + } + } + if (sbi->s_es->s_mount_opts[0]) { char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, sizeof(sbi->s_es->s_mount_opts), @@ -4033,42 +4068,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); - if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { - sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; - sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; - } else { - sbi->s_inode_size = le16_to_cpu(es->s_inode_size); - sbi->s_first_ino = le32_to_cpu(es->s_first_ino); - if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { - ext4_msg(sb, KERN_ERR, "invalid first ino: %u", - sbi->s_first_ino); - goto failed_mount; - } - if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || - (!is_power_of_2(sbi->s_inode_size)) || - (sbi->s_inode_size > blocksize)) { - ext4_msg(sb, KERN_ERR, - "unsupported inode size: %d", - sbi->s_inode_size); - goto failed_mount; - } - /* - * i_atime_extra is the last extra field available for [acm]times in - * struct ext4_inode. Checking for that field should suffice to ensure - * we have extra space for all three. - */ - if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + - sizeof(((struct ext4_inode *)0)->i_atime_extra)) { - sb->s_time_gran = 1; - sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; - } else { - sb->s_time_gran = NSEC_PER_SEC; - sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; - } - - sb->s_time_min = EXT4_TIMESTAMP_MIN; - } - sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || @@ -4517,8 +4516,6 @@ no_journal: } else if (ret) goto failed_mount4a; - ext4_clamp_want_extra_isize(sb); - ext4_set_resv_clusters(sb); err = ext4_setup_system_zone(sb); @@ -5306,8 +5303,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - ext4_clamp_want_extra_isize(sb); - if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " diff --git a/fs/file.c b/fs/file.c index 3da91a112bab..2f4fcf985079 100644 --- a/fs/file.c +++ b/fs/file.c @@ -960,7 +960,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) return ksys_dup3(oldfd, newfd, 0); } -int ksys_dup(unsigned int fildes) +SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); @@ -975,11 +975,6 @@ int ksys_dup(unsigned int fildes) return ret; } -SYSCALL_DEFINE1(dup, unsigned int, fildes) -{ - return ksys_dup(fildes); -} - int f_dupfd(unsigned int from, struct file *file, unsigned flags) { int err; diff --git a/fs/io-wq.c b/fs/io-wq.c index 74b40506c5d9..11e80b7252a8 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -49,7 +49,6 @@ struct io_worker { struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; - wait_queue_head_t wait; struct io_wqe *wqe; struct io_wq_work *cur_work; @@ -258,7 +257,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe) worker = hlist_nulls_entry(n, struct io_worker, nulls_node); if (io_worker_get(worker)) { - wake_up(&worker->wait); + wake_up_process(worker->task); io_worker_release(worker); return true; } @@ -492,28 +491,46 @@ next: } while (1); } +static inline void io_worker_spin_for_work(struct io_wqe *wqe) +{ + int i = 0; + + while (++i < 1000) { + if (io_wqe_run_queue(wqe)) + break; + if (need_resched()) + break; + cpu_relax(); + } +} + static int io_wqe_worker(void *data) { struct io_worker *worker = data; struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; - DEFINE_WAIT(wait); + bool did_work; io_worker_start(wqe, worker); + did_work = false; while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - prepare_to_wait(&worker->wait, &wait, TASK_INTERRUPTIBLE); - + set_current_state(TASK_INTERRUPTIBLE); +loop: + if (did_work) + io_worker_spin_for_work(wqe); spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); - continue; + did_work = true; + goto loop; } + did_work = false; /* drops the lock on success, retry */ if (__io_worker_idle(wqe, worker)) { __release(&wqe->lock); - continue; + goto loop; } spin_unlock_irq(&wqe->lock); if (signal_pending(current)) @@ -526,8 +543,6 @@ static int io_wqe_worker(void *data) break; } - finish_wait(&worker->wait, &wait); - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { spin_lock_irq(&wqe->lock); if (!wq_list_empty(&wqe->work_list)) @@ -589,7 +604,6 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) refcount_set(&worker->ref, 1); worker->nulls_node.pprev = NULL; - init_waitqueue_head(&worker->wait); worker->wqe = wqe; spin_lock_init(&worker->lock); @@ -934,7 +948,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, /* * Now check if a free (going busy) or busy worker has the work * currently running. If we find it there, we'll return CANCEL_RUNNING - * as an indication that we attempte to signal cancellation. The + * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. */ rcu_read_lock(); diff --git a/fs/io-wq.h b/fs/io-wq.h index 7c333a28e2a7..3f5e356de980 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -35,7 +35,8 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { if (!list->first) { - list->first = list->last = node; + list->last = node; + WRITE_ONCE(list->first, node); } else { list->last->next = node; list->last = node; @@ -47,7 +48,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, struct io_wq_work_node *prev) { if (node == list->first) - list->first = node->next; + WRITE_ONCE(list->first, node->next); if (node == list->last) list->last = prev; if (prev) @@ -58,7 +59,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, #define wq_list_for_each(pos, prv, head) \ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) -#define wq_list_empty(list) ((list)->first == NULL) +#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) #define INIT_WQ_LIST(list) do { \ (list)->first = NULL; \ (list)->last = NULL; \ @@ -119,6 +120,10 @@ static inline void io_wq_worker_sleeping(struct task_struct *tsk) static inline void io_wq_worker_running(struct task_struct *tsk) { } -#endif /* CONFIG_IO_WQ */ +#endif -#endif /* INTERNAL_IO_WQ_H */ +static inline bool io_wq_current_is_worker(void) +{ + return in_task() && (current->flags & PF_IO_WORKER); +} +#endif diff --git a/fs/io_uring.c b/fs/io_uring.c index 405be10da73d..6f084e3cf835 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -289,11 +289,14 @@ struct io_ring_ctx { */ struct io_poll_iocb { struct file *file; - struct wait_queue_head *head; + union { + struct wait_queue_head *head; + u64 addr; + }; __poll_t events; bool done; bool canceled; - struct wait_queue_entry *wait; + struct wait_queue_entry wait; }; struct io_timeout_data { @@ -304,6 +307,31 @@ struct io_timeout_data { u32 seq_offset; }; +struct io_accept { + struct file *file; + struct sockaddr __user *addr; + int __user *addr_len; + int flags; +}; + +struct io_sync { + struct file *file; + loff_t len; + loff_t off; + int flags; +}; + +struct io_cancel { + struct file *file; + u64 addr; +}; + +struct io_timeout { + struct file *file; + u64 addr; + int flags; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -343,6 +371,10 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; + struct io_accept accept; + struct io_sync sync; + struct io_cancel cancel; + struct io_timeout timeout; }; const struct io_uring_sqe *sqe; @@ -352,6 +384,7 @@ struct io_kiocb { bool has_user; bool in_async; bool needs_fixed_file; + u8 opcode; struct io_ring_ctx *ctx; union { @@ -377,6 +410,8 @@ struct io_kiocb { #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ +#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ +#define REQ_F_PREPPED 131072 /* request already opcode prepared */ u64 user_data; u32 result; u32 sequence; @@ -563,12 +598,10 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) +static inline bool io_req_needs_user(struct io_kiocb *req) { - u8 opcode = READ_ONCE(sqe->opcode); - - return !(opcode == IORING_OP_READ_FIXED || - opcode == IORING_OP_WRITE_FIXED); + return !(req->opcode == IORING_OP_READ_FIXED || + req->opcode == IORING_OP_WRITE_FIXED); } static inline bool io_prep_async_work(struct io_kiocb *req, @@ -577,10 +610,12 @@ static inline bool io_prep_async_work(struct io_kiocb *req, bool do_hashed = false; if (req->sqe) { - switch (req->sqe->opcode) { + switch (req->opcode) { case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: - do_hashed = true; + /* only regular files should be hashed for writes */ + if (req->flags & REQ_F_ISREG) + do_hashed = true; /* fall-through */ case IORING_OP_READV: case IORING_OP_READ_FIXED: @@ -598,7 +633,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req, req->work.flags |= IO_WQ_WORK_UNBOUND; break; } - if (io_sqe_needs_user(req->sqe)) + if (io_req_needs_user(req)) req->work.flags |= IO_WQ_WORK_NEEDS_USER; } @@ -969,7 +1004,7 @@ static void io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->sqe->opcode == IORING_OP_LINK_TIMEOUT) { + link->opcode == IORING_OP_LINK_TIMEOUT) { io_link_cancel_timeout(link); } else { io_cqring_fill_event(link, -ECANCELED); @@ -1175,7 +1210,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, } /* - * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a + * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a * non-spinning poll check - we'll still enter the driver poll loop, but only * as a non-spinning completion check. */ @@ -1292,6 +1327,12 @@ static void kiocb_end_write(struct io_kiocb *req) file_end_write(req->file); } +static inline void req_set_fail_links(struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; +} + static void io_complete_rw_common(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); @@ -1299,8 +1340,8 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if ((req->flags & REQ_F_LINK) && res != req->result) - req->flags |= REQ_F_FAIL_LINK; + if (res != req->result) + req_set_fail_links(req); io_cqring_add_event(req, res); } @@ -1330,8 +1371,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if ((req->flags & REQ_F_LINK) && res != req->result) - req->flags |= REQ_F_FAIL_LINK; + if (res != req->result) + req_set_fail_links(req); req->result = res; if (res != -EAGAIN) req->flags |= REQ_F_IOPOLL_COMPLETED; @@ -1422,7 +1463,7 @@ static bool io_file_supports_async(struct file *file) { umode_t mode = file_inode(file)->i_mode; - if (S_ISBLK(mode) || S_ISCHR(mode)) + if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode)) return true; if (S_ISREG(mode) && file->f_op != &io_uring_fops) return true; @@ -1606,7 +1647,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, * for that purpose and instead let the caller pass in the read/write * flag. */ - opcode = READ_ONCE(sqe->opcode); + opcode = req->opcode; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; return io_import_fixed(req->ctx, rw, sqe, iter); @@ -1692,7 +1733,7 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; } -static void io_req_map_io(struct io_kiocb *req, ssize_t io_size, +static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { @@ -1706,19 +1747,39 @@ static void io_req_map_io(struct io_kiocb *req, ssize_t io_size, } } -static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size, - struct iovec *iovec, struct iovec *fast_iov, - struct iov_iter *iter) +static int io_alloc_async_ctx(struct io_kiocb *req) { req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); if (req->io) { - io_req_map_io(req, io_size, iovec, fast_iov, iter); memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); req->sqe = &req->io->sqe; return 0; } - return -ENOMEM; + return 1; +} + +static void io_rw_async(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct iovec *iov = NULL; + + if (req->io->rw.iov != req->io->rw.fast_iov) + iov = req->io->rw.iov; + io_wq_submit_work(workptr); + kfree(iov); +} + +static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter) +{ + if (!req->io && io_alloc_async_ctx(req)) + return -ENOMEM; + + io_req_map_rw(req, io_size, iovec, fast_iov, iter); + req->work.func = io_rw_async; + return 0; } static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, @@ -1756,6 +1817,10 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, return ret; } + /* Ensure we clear previously set non-block flag */ + if (!force_nonblock) + req->rw.ki_flags &= ~IOCB_NOWAIT; + file = req->file; io_size = ret; if (req->flags & REQ_F_LINK) @@ -1797,7 +1862,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, kiocb_done(kiocb, ret2, nxt, req->in_async); } else { copy_iov: - ret = io_setup_async_io(req, io_size, iovec, + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); if (ret) goto out_free; @@ -1805,7 +1870,8 @@ copy_iov: } } out_free: - kfree(iovec); + if (!io_wq_current_is_worker()) + kfree(iovec); return ret; } @@ -1844,6 +1910,10 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, return ret; } + /* Ensure we clear previously set non-block flag */ + if (!force_nonblock) + req->rw.ki_flags &= ~IOCB_NOWAIT; + file = kiocb->ki_filp; io_size = ret; if (req->flags & REQ_F_LINK) @@ -1858,7 +1928,9 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, goto copy_iov; } - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) + /* file path doesn't support NOWAIT for non-direct_IO */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && + (req->flags & REQ_F_ISREG)) goto copy_iov; iov_count = iov_iter_count(&iter); @@ -1889,7 +1961,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, kiocb_done(kiocb, ret2, nxt, req->in_async); } else { copy_iov: - ret = io_setup_async_io(req, io_size, iovec, + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); if (ret) goto out_free; @@ -1897,7 +1969,8 @@ copy_iov: } } out_free: - kfree(iovec); + if (!io_wq_current_is_worker()) + kfree(iovec); return ret; } @@ -1916,10 +1989,13 @@ static int io_nop(struct io_kiocb *req) return 0; } -static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_prep_fsync(struct io_kiocb *req) { + const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; + if (req->flags & REQ_F_PREPPED) + return 0; if (!req->file) return -EBADF; @@ -1928,46 +2004,80 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; + req->sync.flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) + return -EINVAL; + + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->len); + req->flags |= REQ_F_PREPPED; return 0; } -static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static bool io_req_cancelled(struct io_kiocb *req) { - loff_t sqe_off = READ_ONCE(sqe->off); - loff_t sqe_len = READ_ONCE(sqe->len); - loff_t end = sqe_off + sqe_len; - unsigned fsync_flags; + if (req->work.flags & IO_WQ_WORK_CANCEL) { + req_set_fail_links(req); + io_cqring_add_event(req, -ECANCELED); + io_put_req(req); + return true; + } + + return false; +} + +static void io_fsync_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + loff_t end = req->sync.off + req->sync.len; + struct io_kiocb *nxt = NULL; int ret; - fsync_flags = READ_ONCE(sqe->fsync_flags); - if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC)) - return -EINVAL; + if (io_req_cancelled(req)) + return; + + ret = vfs_fsync_range(req->rw.ki_filp, req->sync.off, + end > 0 ? end : LLONG_MAX, + req->sync.flags & IORING_FSYNC_DATASYNC); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + *workptr = &nxt->work; +} + +static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; + int ret; - ret = io_prep_fsync(req, sqe); + ret = io_prep_fsync(req); if (ret) return ret; /* fsync always requires a blocking context */ - if (force_nonblock) + if (force_nonblock) { + io_put_req(req); + req->work.func = io_fsync_finish; return -EAGAIN; + } - ret = vfs_fsync_range(req->rw.ki_filp, sqe_off, - end > 0 ? end : LLONG_MAX, - fsync_flags & IORING_FSYNC_DATASYNC); - - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + work = old_work = &req->work; + io_fsync_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); return 0; } -static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_prep_sfr(struct io_kiocb *req) { + const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - int ret = 0; + if (req->flags & REQ_F_PREPPED) + return 0; if (!req->file) return -EBADF; @@ -1976,39 +2086,68 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; - return ret; + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->len); + req->sync.flags = READ_ONCE(sqe->sync_range_flags); + req->flags |= REQ_F_PREPPED; + return 0; } -static int io_sync_file_range(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, +static void io_sync_file_range_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + int ret; + + if (io_req_cancelled(req)) + return; + + ret = sync_file_range(req->rw.ki_filp, req->sync.off, req->sync.len, + req->sync.flags); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + *workptr = &nxt->work; +} + +static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { - loff_t sqe_off; - loff_t sqe_len; - unsigned flags; + struct io_wq_work *work, *old_work; int ret; - ret = io_prep_sfr(req, sqe); + ret = io_prep_sfr(req); if (ret) return ret; /* sync_file_range always requires a blocking context */ - if (force_nonblock) + if (force_nonblock) { + io_put_req(req); + req->work.func = io_sync_file_range_finish; return -EAGAIN; + } - sqe_off = READ_ONCE(sqe->off); - sqe_len = READ_ONCE(sqe->len); - flags = READ_ONCE(sqe->sync_range_flags); + work = old_work = &req->work; + io_sync_file_range_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + return 0; +} - ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); +#if defined(CONFIG_NET) +static void io_sendrecv_async(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct iovec *iov = NULL; - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); - return 0; + if (req->io->rw.iov != req->io->rw.fast_iov) + iov = req->io->msg.iov; + io_wq_submit_work(workptr); + kfree(iov); } +#endif static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) { @@ -2019,16 +2158,19 @@ static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) flags = READ_ONCE(sqe->msg_flags); msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + io->msg.iov = io->msg.fast_iov; return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); #else return 0; #endif } -static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { #if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; + struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -2037,9 +2179,8 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, sock = sock_from_file(req->file, &ret); if (sock) { - struct io_async_ctx io, *copy; + struct io_async_ctx io; struct sockaddr_storage addr; - struct msghdr *kmsg; unsigned flags; flags = READ_ONCE(sqe->msg_flags); @@ -2049,38 +2190,40 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, flags |= MSG_DONTWAIT; if (req->io) { - kmsg = &req->io->msg.msg; - kmsg->msg_name = &addr; + kmsg = &req->io->msg; + kmsg->msg.msg_name = &addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; } else { - kmsg = &io.msg.msg; - kmsg->msg_name = &addr; - io.msg.iov = io.msg.fast_iov; + kmsg = &io.msg; + kmsg->msg.msg_name = &addr; ret = io_sendmsg_prep(req, &io); if (ret) goto out; } - ret = __sys_sendmsg_sock(sock, kmsg, flags); + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (force_nonblock && ret == -EAGAIN) { - copy = kmalloc(sizeof(*copy), GFP_KERNEL); - if (!copy) { - ret = -ENOMEM; - goto out; - } - memcpy(©->msg, &io.msg, sizeof(copy->msg)); - req->io = copy; - memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); - req->sqe = &req->io->sqe; - return ret; + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) + return -ENOMEM; + memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); + req->work.func = io_sendrecv_async; + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; } out: + if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); io_cqring_add_event(req, ret); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, nxt); return 0; #else @@ -2097,6 +2240,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) flags = READ_ONCE(sqe->msg_flags); msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + io->msg.iov = io->msg.fast_iov; return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, &io->msg.iov); #else @@ -2104,10 +2248,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) #endif } -static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { #if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; + struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -2117,9 +2263,8 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, sock = sock_from_file(req->file, &ret); if (sock) { struct user_msghdr __user *msg; - struct io_async_ctx io, *copy; + struct io_async_ctx io; struct sockaddr_storage addr; - struct msghdr *kmsg; unsigned flags; flags = READ_ONCE(sqe->msg_flags); @@ -2131,38 +2276,40 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, msg = (struct user_msghdr __user *) (unsigned long) READ_ONCE(sqe->addr); if (req->io) { - kmsg = &req->io->msg.msg; - kmsg->msg_name = &addr; + kmsg = &req->io->msg; + kmsg->msg.msg_name = &addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; } else { - kmsg = &io.msg.msg; - kmsg->msg_name = &addr; - io.msg.iov = io.msg.fast_iov; + kmsg = &io.msg; + kmsg->msg.msg_name = &addr; ret = io_recvmsg_prep(req, &io); if (ret) goto out; } - ret = __sys_recvmsg_sock(sock, kmsg, msg, io.msg.uaddr, flags); + ret = __sys_recvmsg_sock(sock, &kmsg->msg, msg, kmsg->uaddr, flags); if (force_nonblock && ret == -EAGAIN) { - copy = kmalloc(sizeof(*copy), GFP_KERNEL); - if (!copy) { - ret = -ENOMEM; - goto out; - } - memcpy(copy, &io, sizeof(*copy)); - req->io = copy; - memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); - req->sqe = &req->io->sqe; - return ret; + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) + return -ENOMEM; + memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); + req->work.func = io_sendrecv_async; + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; } out: + if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); io_cqring_add_event(req, ret); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, nxt); return 0; #else @@ -2170,37 +2317,84 @@ out: #endif } -static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_accept_prep(struct io_kiocb *req) { #if defined(CONFIG_NET) - struct sockaddr __user *addr; - int __user *addr_len; - unsigned file_flags; - int flags, ret; + const struct io_uring_sqe *sqe = req->sqe; + struct io_accept *accept = &req->accept; + + if (req->flags & REQ_F_PREPPED) + return 0; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); - addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); - flags = READ_ONCE(sqe->accept_flags); - file_flags = force_nonblock ? O_NONBLOCK : 0; + accept->addr = (struct sockaddr __user *) + (unsigned long) READ_ONCE(sqe->addr); + accept->addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); + accept->flags = READ_ONCE(sqe->accept_flags); + req->flags |= REQ_F_PREPPED; + return 0; +#else + return -EOPNOTSUPP; +#endif +} - ret = __sys_accept4_file(req->file, file_flags, addr, addr_len, flags); - if (ret == -EAGAIN && force_nonblock) { - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; +#if defined(CONFIG_NET) +static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_accept *accept = &req->accept; + unsigned file_flags; + int ret; + + file_flags = force_nonblock ? O_NONBLOCK : 0; + ret = __sys_accept4_file(req->file, file_flags, accept->addr, + accept->addr_len, accept->flags); + if (ret == -EAGAIN && force_nonblock) return -EAGAIN; - } if (ret == -ERESTARTSYS) ret = -EINTR; - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; +} + +static void io_accept_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + if (io_req_cancelled(req)) + return; + __io_accept(req, &nxt, false); + if (nxt) + *workptr = &nxt->work; +} +#endif + +static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + int ret; + + ret = io_accept_prep(req); + if (ret) + return ret; + + ret = __io_accept(req, nxt, force_nonblock); + if (ret == -EAGAIN && force_nonblock) { + req->work.func = io_accept_finish; + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + io_put_req(req); + return -EAGAIN; + } + return 0; #else return -EOPNOTSUPP; #endif @@ -2221,10 +2415,11 @@ static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) #endif } -static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { #if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; struct io_async_ctx __io, *io; unsigned file_flags; int addr_len, ret; @@ -2249,22 +2444,20 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = __sys_connect_file(req->file, &io->connect.address, addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - io = kmalloc(sizeof(*io), GFP_KERNEL); - if (!io) { + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) { ret = -ENOMEM; goto out; } - memcpy(&io->connect, &__io.connect, sizeof(io->connect)); - req->io = io; - memcpy(&io->sqe, req->sqe, sizeof(*req->sqe)); - req->sqe = &io->sqe; + memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect)); return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; out: - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; @@ -2279,8 +2472,8 @@ static void io_poll_remove_one(struct io_kiocb *req) spin_lock(&poll->head->lock); WRITE_ONCE(poll->canceled, true); - if (!list_empty(&poll->wait->entry)) { - list_del_init(&poll->wait->entry); + if (!list_empty(&poll->wait.entry)) { + list_del_init(&poll->wait.entry); io_queue_async_work(req); } spin_unlock(&poll->head->lock); @@ -2320,28 +2513,45 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) return -ENOENT; } +static int io_poll_remove_prep(struct io_kiocb *req) +{ + const struct io_uring_sqe *sqe = req->sqe; + + if (req->flags & REQ_F_PREPPED) + return 0; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->poll_events) + return -EINVAL; + + req->poll.addr = READ_ONCE(sqe->addr); + req->flags |= REQ_F_PREPPED; + return 0; +} + /* * Find a running poll command that matches one specified in sqe->addr, * and remove it if found. */ -static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_poll_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + u64 addr; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || - sqe->poll_events) - return -EINVAL; + ret = io_poll_remove_prep(req); + if (ret) + return ret; + addr = req->poll.addr; spin_lock_irq(&ctx->completion_lock); - ret = io_poll_cancel(ctx, READ_ONCE(sqe->addr)); + ret = io_poll_cancel(ctx, addr); spin_unlock_irq(&ctx->completion_lock); io_cqring_add_event(req, ret); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req(req); return 0; } @@ -2351,7 +2561,6 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) struct io_ring_ctx *ctx = req->ctx; req->poll.done = true; - kfree(req->poll.wait); if (error) io_cqring_fill_event(req, error); else @@ -2389,7 +2598,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) */ spin_lock_irq(&ctx->completion_lock); if (!mask && ret != -ECANCELED) { - add_wait_queue(poll->head, poll->wait); + add_wait_queue(poll->head, &poll->wait); spin_unlock_irq(&ctx->completion_lock); return; } @@ -2399,8 +2608,8 @@ static void io_poll_complete_work(struct io_wq_work **workptr) io_cqring_ev_posted(ctx); - if (ret < 0 && req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, &nxt); if (nxt) *workptr = &nxt->work; @@ -2419,7 +2628,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && !(mask & poll->events)) return 0; - list_del_init(&poll->wait->entry); + list_del_init(&poll->wait.entry); /* * Run completion inline if we can. We're using trylock here because @@ -2460,7 +2669,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, pt->error = 0; pt->req->poll.head = head; - add_wait_queue(head, pt->req->poll.wait); + add_wait_queue(head, &pt->req->poll.wait); } static void io_poll_req_insert(struct io_kiocb *req) @@ -2472,16 +2681,14 @@ static void io_poll_req_insert(struct io_kiocb *req) hlist_add_head(&req->hash_node, list); } -static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt) +static int io_poll_add_prep(struct io_kiocb *req) { + const struct io_uring_sqe *sqe = req->sqe; struct io_poll_iocb *poll = &req->poll; - struct io_ring_ctx *ctx = req->ctx; - struct io_poll_table ipt; - bool cancel = false; - __poll_t mask; u16 events; + if (req->flags & REQ_F_PREPPED) + return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) @@ -2489,14 +2696,26 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (!poll->file) return -EBADF; - poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL); - if (!poll->wait) - return -ENOMEM; - - req->io = NULL; - INIT_IO_WORK(&req->work, io_poll_complete_work); + req->flags |= REQ_F_PREPPED; events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + return 0; +} + +static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) +{ + struct io_poll_iocb *poll = &req->poll; + struct io_ring_ctx *ctx = req->ctx; + struct io_poll_table ipt; + bool cancel = false; + __poll_t mask; + int ret; + + ret = io_poll_add_prep(req); + if (ret) + return ret; + + INIT_IO_WORK(&req->work, io_poll_complete_work); INIT_HLIST_NODE(&req->hash_node); poll->head = NULL; @@ -2509,9 +2728,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ - INIT_LIST_HEAD(&poll->wait->entry); - init_waitqueue_func_entry(poll->wait, io_poll_wake); - poll->wait->private = poll; + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, io_poll_wake); + poll->wait.private = poll; INIT_LIST_HEAD(&req->list); @@ -2520,14 +2739,14 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, spin_lock_irq(&ctx->completion_lock); if (likely(poll->head)) { spin_lock(&poll->head->lock); - if (unlikely(list_empty(&poll->wait->entry))) { + if (unlikely(list_empty(&poll->wait.entry))) { if (ipt.error) cancel = true; ipt.error = 0; mask = 0; } if (mask || ipt.error) - list_del_init(&poll->wait->entry); + list_del_init(&poll->wait.entry); else if (cancel) WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */ @@ -2567,7 +2786,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) /* * Adjust the reqs sequence before the current one because it - * will consume a slot in the cq_ring and the the cq_tail + * will consume a slot in the cq_ring and the cq_tail * pointer will be increased, otherwise other timeout reqs may * return in advance without waiting for enough wait_nr. */ @@ -2582,8 +2801,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_put_req(req); return HRTIMER_NORESTART; } @@ -2608,40 +2826,53 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -1) return -EALREADY; - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_cqring_fill_event(req, -ECANCELED); io_put_req(req); return 0; } +static int io_timeout_remove_prep(struct io_kiocb *req) +{ + const struct io_uring_sqe *sqe = req->sqe; + + if (req->flags & REQ_F_PREPPED) + return 0; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) + return -EINVAL; + + req->timeout.addr = READ_ONCE(sqe->addr); + req->timeout.flags = READ_ONCE(sqe->timeout_flags); + if (req->timeout.flags) + return -EINVAL; + + req->flags |= REQ_F_PREPPED; + return 0; +} + /* * Remove or update an existing timeout command */ -static int io_timeout_remove(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_timeout_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned flags; int ret; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) - return -EINVAL; - flags = READ_ONCE(sqe->timeout_flags); - if (flags) - return -EINVAL; + ret = io_timeout_remove_prep(req); + if (ret) + return ret; spin_lock_irq(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, READ_ONCE(sqe->addr)); + ret = io_timeout_cancel(ctx, req->timeout.addr); io_cqring_fill_event(req, ret); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - if (ret < 0 && req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req(req); return 0; } @@ -2676,31 +2907,25 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, data->mode = HRTIMER_MODE_REL; hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); - req->io = io; return 0; } -static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_timeout(struct io_kiocb *req) { + const struct io_uring_sqe *sqe = req->sqe; unsigned count; struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data; - struct io_async_ctx *io; struct list_head *entry; unsigned span = 0; + int ret; - io = req->io; - if (!io) { - int ret; - - io = kmalloc(sizeof(*io), GFP_KERNEL); - if (!io) + if (!req->io) { + if (io_alloc_async_ctx(req)) return -ENOMEM; - ret = io_timeout_prep(req, io, false); - if (ret) { - kfree(io); + ret = io_timeout_prep(req, req->io, false); + if (ret) return ret; - } } data = &req->io->timeout; @@ -2822,43 +3047,84 @@ done: spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, nxt); } -static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt) +static int io_async_cancel_prep(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + const struct io_uring_sqe *sqe = req->sqe; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + if (req->flags & REQ_F_PREPPED) + return 0; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) return -EINVAL; - io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0); + req->flags |= REQ_F_PREPPED; + req->cancel.addr = READ_ONCE(sqe->addr); + return 0; +} + +static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + ret = io_async_cancel_prep(req); + if (ret) + return ret; + + io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); return 0; } -static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io) +static int io_req_defer_prep(struct io_kiocb *req) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct io_async_ctx *io = req->io; struct iov_iter iter; - ssize_t ret; - - memcpy(&io->sqe, req->sqe, sizeof(io->sqe)); - req->sqe = &io->sqe; + ssize_t ret = 0; - switch (io->sqe.opcode) { + switch (req->opcode) { + case IORING_OP_NOP: + break; case IORING_OP_READV: case IORING_OP_READ_FIXED: + /* ensure prep does right import */ + req->io = NULL; ret = io_read_prep(req, &iovec, &iter, true); + req->io = io; + if (ret < 0) + break; + io_req_map_rw(req, ret, iovec, inline_vecs, &iter); + ret = 0; break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + /* ensure prep does right import */ + req->io = NULL; ret = io_write_prep(req, &iovec, &iter, true); + req->io = io; + if (ret < 0) + break; + io_req_map_rw(req, ret, iovec, inline_vecs, &iter); + ret = 0; + break; + case IORING_OP_POLL_ADD: + ret = io_poll_add_prep(req); + break; + case IORING_OP_POLL_REMOVE: + ret = io_poll_remove_prep(req); + break; + case IORING_OP_FSYNC: + ret = io_prep_fsync(req); + break; + case IORING_OP_SYNC_FILE_RANGE: + ret = io_prep_sfr(req); break; case IORING_OP_SENDMSG: ret = io_sendmsg_prep(req, io); @@ -2870,41 +3136,45 @@ static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io) ret = io_connect_prep(req, io); break; case IORING_OP_TIMEOUT: - return io_timeout_prep(req, io, false); + ret = io_timeout_prep(req, io, false); + break; + case IORING_OP_TIMEOUT_REMOVE: + ret = io_timeout_remove_prep(req); + break; + case IORING_OP_ASYNC_CANCEL: + ret = io_async_cancel_prep(req); + break; case IORING_OP_LINK_TIMEOUT: - return io_timeout_prep(req, io, true); + ret = io_timeout_prep(req, io, true); + break; + case IORING_OP_ACCEPT: + ret = io_accept_prep(req); + break; default: - req->io = io; - return 0; + printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", + req->opcode); + ret = -EINVAL; + break; } - if (ret < 0) - return ret; - - req->io = io; - io_req_map_io(req, ret, iovec, inline_vecs, &iter); - return 0; + return ret; } static int io_req_defer(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_async_ctx *io; int ret; /* Still need defer if there is pending req in defer list. */ if (!req_need_defer(req) && list_empty(&ctx->defer_list)) return 0; - io = kmalloc(sizeof(*io), GFP_KERNEL); - if (!io) + if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, io); - if (ret < 0) { - kfree(io); + ret = io_req_defer_prep(req); + if (ret < 0) return ret; - } spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { @@ -2922,11 +3192,10 @@ __attribute__((nonnull)) static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { - int ret, opcode; struct io_ring_ctx *ctx = req->ctx; + int ret; - opcode = READ_ONCE(req->sqe->opcode); - switch (opcode) { + switch (req->opcode) { case IORING_OP_NOP: ret = io_nop(req); break; @@ -2947,37 +3216,37 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_FSYNC: - ret = io_fsync(req, req->sqe, nxt, force_nonblock); + ret = io_fsync(req, nxt, force_nonblock); break; case IORING_OP_POLL_ADD: - ret = io_poll_add(req, req->sqe, nxt); + ret = io_poll_add(req, nxt); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, req->sqe); + ret = io_poll_remove(req); break; case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock); + ret = io_sync_file_range(req, nxt, force_nonblock); break; case IORING_OP_SENDMSG: - ret = io_sendmsg(req, req->sqe, nxt, force_nonblock); + ret = io_sendmsg(req, nxt, force_nonblock); break; case IORING_OP_RECVMSG: - ret = io_recvmsg(req, req->sqe, nxt, force_nonblock); + ret = io_recvmsg(req, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: - ret = io_timeout(req, req->sqe); + ret = io_timeout(req); break; case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req, req->sqe); + ret = io_timeout_remove(req); break; case IORING_OP_ACCEPT: - ret = io_accept(req, req->sqe, nxt, force_nonblock); + ret = io_accept(req, nxt, force_nonblock); break; case IORING_OP_CONNECT: - ret = io_connect(req, req->sqe, nxt, force_nonblock); + ret = io_connect(req, nxt, force_nonblock); break; case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel(req, req->sqe, nxt); + ret = io_async_cancel(req, nxt); break; default: ret = -EINVAL; @@ -2991,12 +3260,7 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, if (req->result == -EAGAIN) return -EAGAIN; - /* workqueue context doesn't hold uring_lock, grab it now */ - if (req->in_async) - mutex_lock(&ctx->uring_lock); io_iopoll_req_issued(req); - if (req->in_async) - mutex_unlock(&ctx->uring_lock); } return 0; @@ -3018,9 +3282,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) struct io_kiocb *nxt = NULL; int ret = 0; - /* Ensure we clear previously set non-block flag */ - req->rw.ki_flags &= ~IOCB_NOWAIT; - if (work->flags & IO_WQ_WORK_CANCEL) ret = -ECANCELED; @@ -3044,8 +3305,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_put_req(req); if (ret) { - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req(req); } @@ -3064,20 +3324,25 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } } -static bool io_op_needs_file(const struct io_uring_sqe *sqe) +static bool io_req_op_valid(int op) { - int op = READ_ONCE(sqe->opcode); + return op >= IORING_OP_NOP && op < IORING_OP_LAST; +} - switch (op) { +static int io_req_needs_file(struct io_kiocb *req) +{ + switch (req->opcode) { case IORING_OP_NOP: case IORING_OP_POLL_REMOVE: case IORING_OP_TIMEOUT: case IORING_OP_TIMEOUT_REMOVE: case IORING_OP_ASYNC_CANCEL: case IORING_OP_LINK_TIMEOUT: - return false; + return 0; default: - return true; + if (io_req_op_valid(req->opcode)) + return 1; + return -EINVAL; } } @@ -3094,7 +3359,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; unsigned flags; - int fd; + int fd, ret; flags = READ_ONCE(req->sqe->flags); fd = READ_ONCE(req->sqe->fd); @@ -3102,8 +3367,9 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - if (!io_op_needs_file(req->sqe)) - return 0; + ret = io_req_needs_file(req); + if (ret <= 0) + return ret; if (flags & IOSQE_FIXED_FILE) { if (unlikely(!ctx->file_table || @@ -3179,8 +3445,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { - if (prev->flags & REQ_F_LINK) - prev->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(prev); io_async_find_and_cancel(ctx, req, prev->user_data, NULL, -ETIME); io_put_req(prev); @@ -3222,7 +3487,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, link_list); - if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT) + if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) return NULL; req->flags |= REQ_F_LINK_TIMEOUT; @@ -3231,13 +3496,14 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) static void __io_queue_sqe(struct io_kiocb *req) { - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout; struct io_kiocb *nxt = NULL; int ret; +again: + linked_timeout = io_prep_linked_timeout(req); + ret = io_issue_sqe(req, &nxt, true); - if (nxt) - io_queue_async_work(nxt); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -3256,7 +3522,7 @@ static void __io_queue_sqe(struct io_kiocb *req) * submit reference when the iocb is actually submitted. */ io_queue_async_work(req); - return; + goto done_req; } err: @@ -3273,10 +3539,15 @@ err: /* and drop final reference, if we failed */ if (ret) { io_cqring_add_event(req, ret); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_put_req(req); } +done_req: + if (nxt) { + req = nxt; + nxt = NULL; + goto again; + } } static void io_queue_sqe(struct io_kiocb *req) @@ -3293,8 +3564,7 @@ static void io_queue_sqe(struct io_kiocb *req) if (ret) { if (ret != -EIOCBQUEUED) { io_cqring_add_event(req, ret); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_double_put_req(req); } } else @@ -3310,8 +3580,8 @@ static inline void io_queue_link_head(struct io_kiocb *req) io_queue_sqe(req); } - -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ + IOSQE_IO_HARDLINK) static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, struct io_kiocb **link) @@ -3319,8 +3589,6 @@ static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, struct io_ring_ctx *ctx = req->ctx; int ret; - req->user_data = req->sqe->user_data; - /* enforce forwards compatibility on users */ if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; @@ -3344,27 +3612,30 @@ err_req: */ if (*link) { struct io_kiocb *prev = *link; - struct io_async_ctx *io; if (req->sqe->flags & IOSQE_IO_DRAIN) (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - io = kmalloc(sizeof(*io), GFP_KERNEL); - if (!io) { + if (req->sqe->flags & IOSQE_IO_HARDLINK) + req->flags |= REQ_F_HARDLINK; + + if (io_alloc_async_ctx(req)) { ret = -EAGAIN; goto err_req; } - ret = io_req_defer_prep(req, io); + ret = io_req_defer_prep(req); if (ret) { - kfree(io); + /* fail even hard links since we don't submit */ prev->flags |= REQ_F_FAIL_LINK; goto err_req; } trace_io_uring_link(ctx, req, prev); list_add_tail(&req->link_list, &prev->link_list); - } else if (req->sqe->flags & IOSQE_IO_LINK) { + } else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; + if (req->sqe->flags & IOSQE_IO_HARDLINK) + req->flags |= REQ_F_HARDLINK; INIT_LIST_HEAD(&req->link_list); *link = req; @@ -3414,7 +3685,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) } /* - * Fetch an sqe, if one is available. Note that s->sqe will point to memory + * Fetch an sqe, if one is available. Note that req->sqe will point to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always * being a good citizen. If members of the sqe are validated and then later @@ -3449,6 +3720,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) */ req->sequence = ctx->cached_sq_head; req->sqe = &ctx->sq_sqes[head]; + req->opcode = READ_ONCE(req->sqe->opcode); + req->user_data = READ_ONCE(req->sqe->user_data); ctx->cached_sq_head++; return true; } @@ -3494,7 +3767,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } - if (io_sqe_needs_user(req->sqe) && !*mm) { + if (io_req_needs_user(req) && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -3510,15 +3783,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, req->has_user = *mm != NULL; req->in_async = async; req->needs_fixed_file = async; - trace_io_uring_submit_sqe(ctx, req->sqe->user_data, - true, async); + trace_io_uring_submit_sqe(ctx, req->user_data, true, async); if (!io_submit_sqe(req, statep, &link)) break; /* * If previous wasn't linked and we have a linked command, * that's the end of the chain. Submit the previous link. */ - if (!(sqe_flags & IOSQE_IO_LINK) && link) { + if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) { io_queue_link_head(link); link = NULL; } @@ -3647,7 +3919,9 @@ static int io_sq_thread(void *data) } to_submit = min(to_submit, ctx->sq_entries); + mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); + mutex_unlock(&ctx->uring_lock); if (ret > 0) inflight += ret; } @@ -3676,7 +3950,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush) struct io_ring_ctx *ctx = iowq->ctx; /* - * Wake up if we have enough events, or if a timeout occured since we + * Wake up if we have enough events, or if a timeout occurred since we * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ @@ -4866,6 +5140,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, submitted = io_submit_sqes(ctx, to_submit, f.file, fd, &cur_mm, false); mutex_unlock(&ctx->uring_lock); + + if (submitted != to_submit) + goto out; } if (flags & IORING_ENTER_GETEVENTS) { unsigned nr_events = 0; @@ -4879,6 +5156,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } } +out: percpu_ref_put(&ctx->refs); out_fput: fdput(f); diff --git a/fs/namespace.c b/fs/namespace.c index 2fd0c8bcb8c1..be601d3a8008 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3325,8 +3325,8 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name) } EXPORT_SYMBOL(mount_subtree); -int ksys_mount(const char __user *dev_name, const char __user *dir_name, - const char __user *type, unsigned long flags, void __user *data) +SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, + char __user *, type, unsigned long, flags, void __user *, data) { int ret; char *kernel_type; @@ -3359,12 +3359,6 @@ out_type: return ret; } -SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, - char __user *, type, unsigned long, flags, void __user *, data) -{ - return ksys_mount(dev_name, dir_name, type, flags, data); -} - /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index b801c6353100..6220642fe113 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -227,13 +227,17 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) { struct ovl_fh *fh; - int fh_type, fh_len, dwords; - void *buf; + int fh_type, dwords; int buflen = MAX_HANDLE_SZ; uuid_t *uuid = &real->d_sb->s_uuid; + int err; - buf = kmalloc(buflen, GFP_KERNEL); - if (!buf) + /* Make sure the real fid stays 32bit aligned */ + BUILD_BUG_ON(OVL_FH_FID_OFFSET % 4); + BUILD_BUG_ON(MAX_HANDLE_SZ + OVL_FH_FID_OFFSET > 255); + + fh = kzalloc(buflen + OVL_FH_FID_OFFSET, GFP_KERNEL); + if (!fh) return ERR_PTR(-ENOMEM); /* @@ -242,27 +246,19 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) * the price or reconnecting the dentry. */ dwords = buflen >> 2; - fh_type = exportfs_encode_fh(real, buf, &dwords, 0); + fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0); buflen = (dwords << 2); - fh = ERR_PTR(-EIO); + err = -EIO; if (WARN_ON(fh_type < 0) || WARN_ON(buflen > MAX_HANDLE_SZ) || WARN_ON(fh_type == FILEID_INVALID)) - goto out; + goto out_err; - BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255); - fh_len = offsetof(struct ovl_fh, fid) + buflen; - fh = kmalloc(fh_len, GFP_KERNEL); - if (!fh) { - fh = ERR_PTR(-ENOMEM); - goto out; - } - - fh->version = OVL_FH_VERSION; - fh->magic = OVL_FH_MAGIC; - fh->type = fh_type; - fh->flags = OVL_FH_FLAG_CPU_ENDIAN; + fh->fb.version = OVL_FH_VERSION; + fh->fb.magic = OVL_FH_MAGIC; + fh->fb.type = fh_type; + fh->fb.flags = OVL_FH_FLAG_CPU_ENDIAN; /* * When we will want to decode an overlay dentry from this handle * and all layers are on the same fs, if we get a disconncted real @@ -270,14 +266,15 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) * it to upperdentry or to lowerstack is by checking this flag. */ if (is_upper) - fh->flags |= OVL_FH_FLAG_PATH_UPPER; - fh->len = fh_len; - fh->uuid = *uuid; - memcpy(fh->fid, buf, buflen); + fh->fb.flags |= OVL_FH_FLAG_PATH_UPPER; + fh->fb.len = sizeof(fh->fb) + buflen; + fh->fb.uuid = *uuid; -out: - kfree(buf); return fh; + +out_err: + kfree(fh); + return ERR_PTR(err); } int ovl_set_origin(struct dentry *dentry, struct dentry *lower, @@ -300,8 +297,8 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower, /* * Do not fail when upper doesn't support xattrs. */ - err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh, - fh ? fh->len : 0, 0); + err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh->buf, + fh ? fh->fb.len : 0, 0); kfree(fh); return err; @@ -317,7 +314,7 @@ static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index) if (IS_ERR(fh)) return PTR_ERR(fh); - err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh, fh->len, 0); + err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh->buf, fh->fb.len, 0); kfree(fh); return err; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 702aa63f6774..29abdb1d3b5c 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1170,7 +1170,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (newdentry == trap) goto out_dput; - if (WARN_ON(olddentry->d_inode == newdentry->d_inode)) + if (olddentry->d_inode == newdentry->d_inode) goto out_dput; err = 0; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 73c9775215b3..70e55588aedc 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -211,10 +211,11 @@ static int ovl_check_encode_origin(struct dentry *dentry) return 1; } -static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) +static int ovl_dentry_to_fid(struct dentry *dentry, u32 *fid, int buflen) { struct ovl_fh *fh = NULL; int err, enc_lower; + int len; /* * Check if we should encode a lower or upper file handle and maybe @@ -231,11 +232,12 @@ static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) return PTR_ERR(fh); err = -EOVERFLOW; - if (fh->len > buflen) + len = OVL_FH_LEN(fh); + if (len > buflen) goto fail; - memcpy(buf, (char *)fh, fh->len); - err = fh->len; + memcpy(fid, fh, len); + err = len; out: kfree(fh); @@ -243,31 +245,16 @@ out: fail: pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n", - dentry, err, buflen, fh ? (int)fh->len : 0, - fh ? fh->type : 0); + dentry, err, buflen, fh ? (int)fh->fb.len : 0, + fh ? fh->fb.type : 0); goto out; } -static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len) -{ - int res, len = *max_len << 2; - - res = ovl_d_to_fh(dentry, (char *)fid, len); - if (res <= 0) - return FILEID_INVALID; - - len = res; - - /* Round up to dwords */ - *max_len = (len + 3) >> 2; - return OVL_FILEID; -} - static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { struct dentry *dentry; - int type; + int bytes = *max_len << 2; /* TODO: encode connectable file handles */ if (parent) @@ -277,10 +264,14 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, if (WARN_ON(!dentry)) return FILEID_INVALID; - type = ovl_dentry_to_fh(dentry, fid, max_len); - + bytes = ovl_dentry_to_fid(dentry, fid, bytes); dput(dentry); - return type; + if (bytes <= 0) + return FILEID_INVALID; + + *max_len = bytes >> 2; + + return OVL_FILEID_V1; } /* @@ -777,24 +768,45 @@ out_err: goto out; } +static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type) +{ + struct ovl_fh *fh; + + /* If on-wire inner fid is aligned - nothing to do */ + if (fh_type == OVL_FILEID_V1) + return (struct ovl_fh *)fid; + + if (fh_type != OVL_FILEID_V0) + return ERR_PTR(-EINVAL); + + fh = kzalloc(buflen, GFP_KERNEL); + if (!fh) + return ERR_PTR(-ENOMEM); + + /* Copy unaligned inner fh into aligned buffer */ + memcpy(&fh->fb, fid, buflen - OVL_FH_WIRE_OFFSET); + return fh; +} + static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct dentry *dentry = NULL; - struct ovl_fh *fh = (struct ovl_fh *) fid; + struct ovl_fh *fh = NULL; int len = fh_len << 2; unsigned int flags = 0; int err; - err = -EINVAL; - if (fh_type != OVL_FILEID) + fh = ovl_fid_to_fh(fid, len, fh_type); + err = PTR_ERR(fh); + if (IS_ERR(fh)) goto out_err; err = ovl_check_fh_len(fh, len); if (err) goto out_err; - flags = fh->flags; + flags = fh->fb.flags; dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ? ovl_upper_fh_to_d(sb, fh) : ovl_lower_fh_to_d(sb, fh); @@ -802,12 +814,18 @@ static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, if (IS_ERR(dentry) && err != -ESTALE) goto out_err; +out: + /* We may have needed to re-align OVL_FILEID_V0 */ + if (!IS_ERR_OR_NULL(fh) && fh != (void *)fid) + kfree(fh); + return dentry; out_err: pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", - len, fh_type, flags, err); - return ERR_PTR(err); + fh_len, fh_type, flags, err); + dentry = ERR_PTR(err); + goto out; } static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index bc14781886bf..b045cf1826fc 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -200,8 +200,14 @@ int ovl_getattr(const struct path *path, struct kstat *stat, if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || (!ovl_verify_lower(dentry->d_sb) && (is_dir || lowerstat.nlink == 1))) { - stat->ino = lowerstat.ino; lower_layer = ovl_layer_lower(dentry); + /* + * Cannot use origin st_dev;st_ino because + * origin inode content may differ from overlay + * inode content. + */ + if (samefs || lower_layer->fsid) + stat->ino = lowerstat.ino; } /* diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index c269d6033525..76ff66339173 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -84,21 +84,21 @@ static int ovl_acceptable(void *ctx, struct dentry *dentry) * Return -ENODATA for "origin unknown". * Return <0 for an invalid file handle. */ -int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) +int ovl_check_fb_len(struct ovl_fb *fb, int fb_len) { - if (fh_len < sizeof(struct ovl_fh) || fh_len < fh->len) + if (fb_len < sizeof(struct ovl_fb) || fb_len < fb->len) return -EINVAL; - if (fh->magic != OVL_FH_MAGIC) + if (fb->magic != OVL_FH_MAGIC) return -EINVAL; /* Treat larger version and unknown flags as "origin unknown" */ - if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL) + if (fb->version > OVL_FH_VERSION || fb->flags & ~OVL_FH_FLAG_ALL) return -ENODATA; /* Treat endianness mismatch as "origin unknown" */ - if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) && - (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) + if (!(fb->flags & OVL_FH_FLAG_ANY_ENDIAN) && + (fb->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) return -ENODATA; return 0; @@ -119,15 +119,15 @@ static struct ovl_fh *ovl_get_fh(struct dentry *dentry, const char *name) if (res == 0) return NULL; - fh = kzalloc(res, GFP_KERNEL); + fh = kzalloc(res + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); - res = vfs_getxattr(dentry, name, fh, res); + res = vfs_getxattr(dentry, name, fh->buf, res); if (res < 0) goto fail; - err = ovl_check_fh_len(fh, res); + err = ovl_check_fb_len(&fh->fb, res); if (err < 0) { if (err == -ENODATA) goto out; @@ -158,12 +158,12 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, * Make sure that the stored uuid matches the uuid of the lower * layer where file handle will be decoded. */ - if (!uuid_equal(&fh->uuid, &mnt->mnt_sb->s_uuid)) + if (!uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid)) return NULL; - bytes = (fh->len - offsetof(struct ovl_fh, fid)); - real = exportfs_decode_fh(mnt, (struct fid *)fh->fid, - bytes >> 2, (int)fh->type, + bytes = (fh->fb.len - offsetof(struct ovl_fb, fid)); + real = exportfs_decode_fh(mnt, (struct fid *)fh->fb.fid, + bytes >> 2, (int)fh->fb.type, connected ? ovl_acceptable : NULL, mnt); if (IS_ERR(real)) { /* @@ -173,7 +173,7 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, * index entries correctly. */ if (real == ERR_PTR(-ESTALE) && - !(fh->flags & OVL_FH_FLAG_PATH_UPPER)) + !(fh->fb.flags & OVL_FH_FLAG_PATH_UPPER)) real = NULL; return real; } @@ -323,6 +323,14 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, int i; for (i = 0; i < ofs->numlower; i++) { + /* + * If lower fs uuid is not unique among lower fs we cannot match + * fh->uuid to layer. + */ + if (ofs->lower_layers[i].fsid && + ofs->lower_layers[i].fs->bad_uuid) + continue; + origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt, connected); if (origin) @@ -400,7 +408,7 @@ static int ovl_verify_fh(struct dentry *dentry, const char *name, if (IS_ERR(ofh)) return PTR_ERR(ofh); - if (fh->len != ofh->len || memcmp(fh, ofh, fh->len)) + if (fh->fb.len != ofh->fb.len || memcmp(&fh->fb, &ofh->fb, fh->fb.len)) err = -ESTALE; kfree(ofh); @@ -431,7 +439,7 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name, err = ovl_verify_fh(dentry, name, fh); if (set && err == -ENODATA) - err = ovl_do_setxattr(dentry, name, fh, fh->len, 0); + err = ovl_do_setxattr(dentry, name, fh->buf, fh->fb.len, 0); if (err) goto fail; @@ -505,20 +513,20 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) goto fail; err = -EINVAL; - if (index->d_name.len < sizeof(struct ovl_fh)*2) + if (index->d_name.len < sizeof(struct ovl_fb)*2) goto fail; err = -ENOMEM; len = index->d_name.len / 2; - fh = kzalloc(len, GFP_KERNEL); + fh = kzalloc(len + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) goto fail; err = -EINVAL; - if (hex2bin((u8 *)fh, index->d_name.name, len)) + if (hex2bin(fh->buf, index->d_name.name, len)) goto fail; - err = ovl_check_fh_len(fh, len); + err = ovl_check_fb_len(&fh->fb, len); if (err) goto fail; @@ -597,11 +605,11 @@ static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) { char *n, *s; - n = kcalloc(fh->len, 2, GFP_KERNEL); + n = kcalloc(fh->fb.len, 2, GFP_KERNEL); if (!n) return -ENOMEM; - s = bin2hex(n, fh, fh->len); + s = bin2hex(n, fh->buf, fh->fb.len); *name = (struct qstr) QSTR_INIT(n, s - n); return 0; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6934bcf030f0..f283b1d69a9e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -71,20 +71,36 @@ enum ovl_entry_flag { #error Endianness not defined #endif -/* The type returned by overlay exportfs ops when encoding an ovl_fh handle */ -#define OVL_FILEID 0xfb +/* The type used to be returned by overlay exportfs for misaligned fid */ +#define OVL_FILEID_V0 0xfb +/* The type returned by overlay exportfs for 32bit aligned fid */ +#define OVL_FILEID_V1 0xf8 -/* On-disk and in-memeory format for redirect by file handle */ -struct ovl_fh { +/* On-disk format for "origin" file handle */ +struct ovl_fb { u8 version; /* 0 */ u8 magic; /* 0xfb */ u8 len; /* size of this header + size of fid */ u8 flags; /* OVL_FH_FLAG_* */ u8 type; /* fid_type of fid */ uuid_t uuid; /* uuid of filesystem */ - u8 fid[0]; /* file identifier */ + u32 fid[0]; /* file identifier should be 32bit aligned in-memory */ } __packed; +/* In-memory and on-wire format for overlay file handle */ +struct ovl_fh { + u8 padding[3]; /* make sure fb.fid is 32bit aligned */ + union { + struct ovl_fb fb; + u8 buf[0]; + }; +} __packed; + +#define OVL_FH_WIRE_OFFSET offsetof(struct ovl_fh, fb) +#define OVL_FH_LEN(fh) (OVL_FH_WIRE_OFFSET + (fh)->fb.len) +#define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \ + offsetof(struct ovl_fb, fid)) + static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) { int err = vfs_rmdir(dir, dentry); @@ -302,7 +318,13 @@ static inline void ovl_inode_unlock(struct inode *inode) /* namei.c */ -int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); +int ovl_check_fb_len(struct ovl_fb *fb, int fb_len); + +static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) +{ + return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET); +} + struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, bool connected); int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index a8279280e88d..28348c44ea5b 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -22,6 +22,8 @@ struct ovl_config { struct ovl_sb { struct super_block *sb; dev_t pseudo_dev; + /* Unusable (conflicting) uuid */ + bool bad_uuid; }; struct ovl_layer { diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index afbcb116a7f1..7621ff176d15 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1255,7 +1255,7 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) { unsigned int i; - if (!ofs->config.nfs_export && !(ofs->config.index && ofs->upper_mnt)) + if (!ofs->config.nfs_export && !ofs->upper_mnt) return true; for (i = 0; i < ofs->numlowerfs; i++) { @@ -1263,9 +1263,13 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) * We use uuid to associate an overlay lower file handle with a * lower layer, so we can accept lower fs with null uuid as long * as all lower layers with null uuid are on the same fs. + * if we detect multiple lower fs with the same uuid, we + * disable lower file handle decoding on all of them. */ - if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) + if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) { + ofs->lower_fs[i].bad_uuid = true; return false; + } } return true; } @@ -1277,6 +1281,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) unsigned int i; dev_t dev; int err; + bool bad_uuid = false; /* fsid 0 is reserved for upper fs even with non upper overlay */ if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb) @@ -1288,11 +1293,15 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) } if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) { - ofs->config.index = false; - ofs->config.nfs_export = false; - pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", - uuid_is_null(&sb->s_uuid) ? "null" : "conflicting", - path->dentry); + bad_uuid = true; + if (ofs->config.index || ofs->config.nfs_export) { + ofs->config.index = false; + ofs->config.nfs_export = false; + pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", + uuid_is_null(&sb->s_uuid) ? "null" : + "conflicting", + path->dentry); + } } err = get_anon_bdev(&dev); @@ -1303,6 +1312,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) ofs->lower_fs[ofs->numlowerfs].sb = sb; ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev; + ofs->lower_fs[ofs->numlowerfs].bad_uuid = bad_uuid; ofs->numlowerfs++; return ofs->numlowerfs; diff --git a/fs/pipe.c b/fs/pipe.c index 87109e761fa5..57502c3c0fba 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -364,17 +364,39 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) ret = -EAGAIN; break; } - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } __pipe_unlock(pipe); - if (was_full) { + + /* + * We only get here if we didn't actually read anything. + * + * However, we could have seen (and removed) a zero-sized + * pipe buffer, and might have made space in the buffers + * that way. + * + * You can't make zero-sized pipe buffers by doing an empty + * write (not even in packet mode), but they can happen if + * the writer gets an EFAULT when trying to fill a buffer + * that already got allocated and inserted in the buffer + * array. + * + * So we still need to wake up any pending writers in the + * _very_ unlikely case that the pipe was full, but we got + * no data. + */ + if (unlikely(was_full)) { wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - wait_event_interruptible(pipe->wait, pipe_readable(pipe)); + + /* + * But because we didn't read anything, at this point we can + * just return directly with -ERESTARTSYS if we're interrupted, + * since we've done any required wakeups and there's no need + * to mark anything accessed. And we've dropped the lock. + */ + if (wait_event_interruptible(pipe->wait, pipe_readable(pipe)) < 0) + return -ERESTARTSYS; + __pipe_lock(pipe); was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); } @@ -559,7 +581,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) } wait_event_interruptible(pipe->wait, pipe_writable(pipe)); __pipe_lock(pipe); - was_empty = pipe_empty(head, pipe->tail); + was_empty = pipe_empty(pipe->head, pipe->tail); } out: __pipe_unlock(pipe); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 37bdbec5b402..fd931d3e77be 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -134,7 +134,7 @@ static int show_stat(struct seq_file *p, void *v) softirq += cpustat[CPUTIME_SOFTIRQ]; steal += cpustat[CPUTIME_STEAL]; guest += cpustat[CPUTIME_GUEST]; - guest_nice += cpustat[CPUTIME_USER]; + guest_nice += cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); @@ -175,7 +175,7 @@ static int show_stat(struct seq_file *p, void *v) softirq = cpustat[CPUTIME_SOFTIRQ]; steal = cpustat[CPUTIME_STEAL]; guest = cpustat[CPUTIME_GUEST]; - guest_nice = cpustat[CPUTIME_USER]; + guest_nice = cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); diff --git a/fs/verity/enable.c b/fs/verity/enable.c index eabc6ac19906..b79e3fd19d11 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -315,7 +315,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) if (arg.block_size != PAGE_SIZE) return -EINVAL; - if (arg.salt_size > FIELD_SIZEOF(struct fsverity_descriptor, salt)) + if (arg.salt_size > sizeof_field(struct fsverity_descriptor, salt)) return -EMSGSIZE; if (arg.sig_size > FS_VERITY_MAX_SIGNATURE_SIZE) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index c284e10af491..fc93fd88ec89 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2248,24 +2248,32 @@ xfs_alloc_longest_free_extent( return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } +/* + * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, + * return the largest possible minimum length. + */ unsigned int xfs_alloc_min_freelist( struct xfs_mount *mp, struct xfs_perag *pag) { + /* AG btrees have at least 1 level. */ + static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; + const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; unsigned int min_free; + ASSERT(mp->m_ag_maxlevels > 0); + /* space needed by-bno freespace btree */ - min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1, + min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, mp->m_ag_maxlevels); /* space needed by-size freespace btree */ - min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, + min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, mp->m_ag_maxlevels); /* space needed reverse mapping used space btree */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - min_free += min_t(unsigned int, - pag->pagf_levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); + min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, + mp->m_rmap_maxlevels); return min_free; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a9ad1f991ba3..4c2e046fbfad 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4561,7 +4561,7 @@ xfs_bmapi_convert_delalloc( struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmalloca bma = { NULL }; - u16 flags = 0; + uint16_t flags = 0; struct xfs_trans *tp; int error; @@ -5972,8 +5972,7 @@ xfs_bmap_insert_extents( goto del_cursor; } - if (XFS_IS_CORRUPT(mp, - stop_fsb >= got.br_startoff + got.br_blockcount)) { + if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) { error = -EFSCORRUPTED; goto del_cursor; } diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 0aa87cbde49e..dd6fcaaea318 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -724,3 +724,24 @@ xfs_dir2_namecheck( /* There shouldn't be any slashes or nulls here */ return !memchr(name, '/', length) && !memchr(name, 0, length); } + +xfs_dahash_t +xfs_dir2_hashname( + struct xfs_mount *mp, + struct xfs_name *name) +{ + if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb))) + return xfs_ascii_ci_hashname(name); + return xfs_da_hashname(name->name, name->len); +} + +enum xfs_dacmp +xfs_dir2_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb))) + return xfs_ascii_ci_compname(args, name, len); + return xfs_da_compname(args, name, len); +} diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index c031c53d0f0d..01ee0b926572 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -175,6 +175,12 @@ extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); +int xfs_dir2_sf_entsize(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, int len); +void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino); +void xfs_dir2_sf_put_ftype(struct xfs_mount *mp, + struct xfs_dir2_sf_entry *sfep, uint8_t ftype); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, @@ -194,25 +200,8 @@ xfs_dir2_data_entsize( return round_up(len, XFS_DIR2_DATA_ALIGN); } -static inline xfs_dahash_t -xfs_dir2_hashname( - struct xfs_mount *mp, - struct xfs_name *name) -{ - if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb))) - return xfs_ascii_ci_hashname(name); - return xfs_da_hashname(name->name, name->len); -} - -static inline enum xfs_dacmp -xfs_dir2_compname( - struct xfs_da_args *args, - const unsigned char *name, - int len) -{ - if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb))) - return xfs_ascii_ci_compname(args, name, len); - return xfs_da_compname(args, name, len); -} +xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name); +enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args, + const unsigned char *name, int len); #endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 8b94d33d232f..7b7f6fb2ea3b 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -37,7 +37,7 @@ static void xfs_dir2_sf_check(xfs_da_args_t *args); static void xfs_dir2_sf_toino4(xfs_da_args_t *args); static void xfs_dir2_sf_toino8(xfs_da_args_t *args); -static int +int xfs_dir2_sf_entsize( struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, @@ -84,7 +84,7 @@ xfs_dir2_sf_get_ino( return get_unaligned_be64(from) & XFS_MAXINUMBER; } -static void +void xfs_dir2_sf_put_ino( struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, @@ -145,7 +145,7 @@ xfs_dir2_sf_get_ftype( return XFS_DIR3_FT_UNKNOWN; } -static void +void xfs_dir2_sf_put_ftype( struct xfs_mount *mp, struct xfs_dir2_sf_entry *sfep, diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 988cde7744e6..5b759af4d165 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2909,3 +2909,67 @@ xfs_ialloc_setup_geometry( else igeo->ialloc_align = 0; } + +/* Compute the location of the root directory inode that is laid out by mkfs. */ +xfs_ino_t +xfs_ialloc_calc_rootino( + struct xfs_mount *mp, + int sunit) +{ + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t first_bno; + + /* + * Pre-calculate the geometry of AG 0. We know what it looks like + * because libxfs knows how to create allocation groups now. + * + * first_bno is the first block in which mkfs could possibly have + * allocated the root directory inode, once we factor in the metadata + * that mkfs formats before it. Namely, the four AG headers... + */ + first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize); + + /* ...the two free space btree roots... */ + first_bno += 2; + + /* ...the inode btree root... */ + first_bno += 1; + + /* ...the initial AGFL... */ + first_bno += xfs_alloc_min_freelist(mp, NULL); + + /* ...the free inode btree root... */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + first_bno++; + + /* ...the reverse mapping btree root... */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + first_bno++; + + /* ...the reference count btree... */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + first_bno++; + + /* + * ...and the log, if it is allocated in the first allocation group. + * + * This can happen with filesystems that only have a single + * allocation group, or very odd geometries created by old mkfs + * versions on very small filesystems. + */ + if (mp->m_sb.sb_logstart && + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) + first_bno += mp->m_sb.sb_logblocks; + + /* + * Now round first_bno up to whatever allocation alignment is given + * by the filesystem or was passed in. + */ + if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0) + first_bno = roundup(first_bno, sunit); + else if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt > 1) + first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt); + + return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno)); +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 323592d563d5..72b3468b97b1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -152,5 +152,6 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_setup_geometry(struct xfs_mount *mp); +xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index c55cd9a3dec9..7a9c04920505 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -197,6 +197,24 @@ xfs_calc_inode_chunk_res( } /* + * Per-extent log reservation for the btree changes involved in freeing or + * allocating a realtime extent. We have to be able to log as many rtbitmap + * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents, + * as well as the realtime summary block. + */ +static unsigned int +xfs_rtalloc_log_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + unsigned int rtbmp_bytes; + + rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY; + return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; +} + +/* * Various log reservation values. * * These are based on the size of the file system block because that is what @@ -218,13 +236,21 @@ xfs_calc_inode_chunk_res( /* * In a write transaction we can allocate a maximum of 2 - * extents. This gives: + * extents. This gives (t1): * the inode getting the new extents: inode size * the inode's bmap btree: max depth * block size * the agfs of the ags from which the extents are allocated: 2 * sector * the superblock free block counter: sector size * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - * And the bmap_finish transaction can free bmap blocks in a join: + * Or, if we're writing to a realtime file (t2): + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 1 block + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join (t3): * the agfs of the ags containing the blocks: 2 * sector size * the agfls of the ags containing the blocks: 2 * sector size * the super block free block counter: sector size @@ -234,40 +260,72 @@ STATIC uint xfs_calc_write_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 1) + + unsigned int t1, t2, t3; + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + t1 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + + if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), - XFS_FSB_TO_B(mp, 1)) + + blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), - XFS_FSB_TO_B(mp, 1)))); + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz); + } else { + t2 = 0; + } + + t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); } /* - * In truncating a file we free up to two extents at once. We can modify: + * In truncating a file we free up to two extents at once. We can modify (t1): * the inode being truncated: inode size * the inode's bmap btree: (max depth + 1) * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: + * And the bmap_finish transaction can free the blocks and bmap blocks (t2): * the agf for each of the ags: 4 * sector size * the agfl for each of the ags: 4 * sector size * the super block to reflect the freed blocks: sector size * worst case split in allocation btrees per extent assuming 4 extents: * 4 exts * 2 trees * (2 * max depth - 1) * block size + * Or, if it's a realtime file (t3): + * the agf for each of the ags: 2 * sector size + * the agfl for each of the ags: 2 * sector size + * the super block to reflect the freed blocks: sector size + * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 2 exts * 1 block + * worst case split in allocation btrees per extent assuming 2 extents: + * 2 exts * 2 trees * (2 * max depth - 1) * block size */ STATIC uint xfs_calc_itruncate_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), - XFS_FSB_TO_B(mp, 1)))); + unsigned int t1, t2, t3; + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + t1 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); + + t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz); + + if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + } else { + t3 = 0; + } + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); } /* diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2efd78a9719e..e62fb5216341 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -992,6 +992,7 @@ xfs_prepare_shift( struct xfs_inode *ip, loff_t offset) { + struct xfs_mount *mp = ip->i_mount; int error; /* @@ -1005,6 +1006,17 @@ xfs_prepare_shift( } /* + * Shift operations must stabilize the start block offset boundary along + * with the full range of the operation. If we don't, a COW writeback + * completion could race with an insert, front merge with the start + * extent (after split) during the shift and corrupt the file. Start + * with the block just prior to the start to stabilize the boundary. + */ + offset = round_down(offset, 1 << mp->m_sb.sb_blocklog); + if (offset) + offset -= (1 << mp->m_sb.sb_blocklog); + + /* * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3458a1264a3f..3984779e5911 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -956,7 +956,7 @@ xfs_buf_item_relse( struct xfs_buf_log_item *bip = bp->b_log_item; trace_xfs_buf_item_relse(bp, _RET_IP_); - ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); bp->b_log_item = NULL; if (list_empty(&bp->b_li_list)) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fca65109cf24..56efe140c923 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -31,7 +31,7 @@ #include "xfs_reflink.h" #include "xfs_extent_busy.h" #include "xfs_health.h" - +#include "xfs_trace.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -360,66 +360,119 @@ release_buf: } /* - * Update alignment values based on mount options and sb values + * If the sunit/swidth change would move the precomputed root inode value, we + * must reject the ondisk change because repair will stumble over that. + * However, we allow the mount to proceed because we never rejected this + * combination before. Returns true to update the sb, false otherwise. + */ +static inline int +xfs_check_new_dalign( + struct xfs_mount *mp, + int new_dalign, + bool *update_sb) +{ + struct xfs_sb *sbp = &mp->m_sb; + xfs_ino_t calc_ino; + + calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign); + trace_xfs_check_new_dalign(mp, new_dalign, calc_ino); + + if (sbp->sb_rootino == calc_ino) { + *update_sb = true; + return 0; + } + + xfs_warn(mp, +"Cannot change stripe alignment; would require moving root inode."); + + /* + * XXX: Next time we add a new incompat feature, this should start + * returning -EINVAL to fail the mount. Until then, spit out a warning + * that we're ignoring the administrator's instructions. + */ + xfs_warn(mp, "Skipping superblock stripe alignment update."); + *update_sb = false; + return 0; +} + +/* + * If we were provided with new sunit/swidth values as mount options, make sure + * that they pass basic alignment and superblock feature checks, and convert + * them into the same units (FSB) that everything else expects. This step + * /must/ be done before computing the inode geometry. */ STATIC int -xfs_update_alignment(xfs_mount_t *mp) +xfs_validate_new_dalign( + struct xfs_mount *mp) { - xfs_sb_t *sbp = &(mp->m_sb); + if (mp->m_dalign == 0) + return 0; - if (mp->m_dalign) { + /* + * If stripe unit and stripe width are not multiples + * of the fs blocksize turn off alignment. + */ + if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || + (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. blocksize(%d)", + mp->m_sb.sb_blocksize); + return -EINVAL; + } else { /* - * If stripe unit and stripe width are not multiples - * of the fs blocksize turn off alignment. + * Convert the stripe unit and width to FSBs. */ - if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || - (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { xfs_warn(mp, - "alignment check failed: sunit/swidth vs. blocksize(%d)", - sbp->sb_blocksize); + "alignment check failed: sunit/swidth vs. agsize(%d)", + mp->m_sb.sb_agblocks); return -EINVAL; - } else { - /* - * Convert the stripe unit and width to FSBs. - */ - mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); - if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { - xfs_warn(mp, - "alignment check failed: sunit/swidth vs. agsize(%d)", - sbp->sb_agblocks); - return -EINVAL; - } else if (mp->m_dalign) { - mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); - } else { - xfs_warn(mp, - "alignment check failed: sunit(%d) less than bsize(%d)", - mp->m_dalign, sbp->sb_blocksize); - return -EINVAL; - } - } - - /* - * Update superblock with new values - * and log changes - */ - if (xfs_sb_version_hasdalign(sbp)) { - if (sbp->sb_unit != mp->m_dalign) { - sbp->sb_unit = mp->m_dalign; - mp->m_update_sb = true; - } - if (sbp->sb_width != mp->m_swidth) { - sbp->sb_width = mp->m_swidth; - mp->m_update_sb = true; - } + } else if (mp->m_dalign) { + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); } else { xfs_warn(mp, - "cannot change alignment: superblock does not support data alignment"); + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, mp->m_sb.sb_blocksize); return -EINVAL; } + } + + if (!xfs_sb_version_hasdalign(&mp->m_sb)) { + xfs_warn(mp, +"cannot change alignment: superblock does not support data alignment"); + return -EINVAL; + } + + return 0; +} + +/* Update alignment values based on mount options and sb values. */ +STATIC int +xfs_update_alignment( + struct xfs_mount *mp) +{ + struct xfs_sb *sbp = &mp->m_sb; + + if (mp->m_dalign) { + bool update_sb; + int error; + + if (sbp->sb_unit == mp->m_dalign && + sbp->sb_width == mp->m_swidth) + return 0; + + error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb); + if (error || !update_sb) + return error; + + sbp->sb_unit = mp->m_dalign; + sbp->sb_width = mp->m_swidth; + mp->m_update_sb = true; } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && xfs_sb_version_hasdalign(&mp->m_sb)) { - mp->m_dalign = sbp->sb_unit; - mp->m_swidth = sbp->sb_width; + mp->m_dalign = sbp->sb_unit; + mp->m_swidth = sbp->sb_width; } return 0; @@ -648,12 +701,12 @@ xfs_mountfs( } /* - * Check if sb_agblocks is aligned at stripe boundary - * If sb_agblocks is NOT aligned turn off m_dalign since - * allocator alignment is within an ag, therefore ag has - * to be aligned at stripe boundary. + * If we were given new sunit/swidth options, do some basic validation + * checks and convert the incore dalign and swidth values to the + * same units (FSB) that everything else uses. This /must/ happen + * before computing the inode geometry. */ - error = xfs_update_alignment(mp); + error = xfs_validate_new_dalign(mp); if (error) goto out; @@ -664,6 +717,17 @@ xfs_mountfs( xfs_rmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); + /* + * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks + * is NOT aligned turn off m_dalign since allocator alignment is within + * an ag, therefore ag has to be aligned at stripe boundary. Note that + * we must compute the free space and rmap btree geometry before doing + * this. + */ + error = xfs_update_alignment(mp); + if (error) + goto out; + /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c13bb3655e48..a86be7f807ee 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3573,6 +3573,27 @@ DEFINE_KMEM_EVENT(kmem_alloc_large); DEFINE_KMEM_EVENT(kmem_realloc); DEFINE_KMEM_EVENT(kmem_zone_alloc); +TRACE_EVENT(xfs_check_new_dalign, + TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), + TP_ARGS(mp, new_dalign, calc_rootino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, new_dalign) + __field(xfs_ino_t, sb_rootino) + __field(xfs_ino_t, calc_rootino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->new_dalign = new_dalign; + __entry->sb_rootino = mp->m_sb.sb_rootino; + __entry->calc_rootino = calc_rootino; + ), + TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->new_dalign, __entry->sb_rootino, + __entry->calc_rootino) +) + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH |