diff options
Diffstat (limited to 'fs')
52 files changed, 602 insertions, 687 deletions
diff --git a/fs/afs/write.c b/fs/afs/write.c index 6bcf1475511b..4763132ca57e 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -616,8 +616,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, _debug("write discard %x @%llx [%llx]", len, start, i_size); /* The dirty region was entirely beyond the EOF. */ - fscache_clear_page_bits(afs_vnode_cache(vnode), - mapping, start, len, caching); + fscache_clear_page_bits(mapping, start, len, caching); afs_pages_written_back(vnode, start, len); ret = 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6556e13ed95f..63c7ebb0da89 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1117,11 +1117,11 @@ out_free_interp: * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). */ - alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); - if (interpreter || alignment > ELF_MIN_ALIGN) { + if (interpreter) { load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); if (alignment) load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED_NOREPLACE; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c22d287e020b..0dd6de994199 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2503,12 +2503,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } - /* - * New block group is likely to be used soon. Try to activate it now. - * Failure is OK for now. - */ - btrfs_zone_activate(cache); - ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ @@ -2946,7 +2940,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_path *path = NULL; LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; int loops = 0; spin_lock(&cur_trans->dirty_bgs_lock); @@ -3012,7 +3005,6 @@ again: cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; /* @@ -3113,7 +3105,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) int should_put; struct btrfs_path *path; struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; path = btrfs_alloc_path(); if (!path) @@ -3171,7 +3162,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; list_add_tail(&cache->io_list, io); } else { @@ -3455,7 +3445,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); } -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) +static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) { struct btrfs_block_group *bg; int ret; @@ -3542,7 +3532,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) out: btrfs_trans_release_chunk_metadata(trans); - return ret; + if (ret) + return ERR_PTR(ret); + + btrfs_get_block_group(bg); + return bg; } /* @@ -3657,10 +3651,17 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *space_info; + struct btrfs_block_group *ret_bg; bool wait_for_alloc = false; bool should_alloc = false; + bool from_extent_allocation = false; int ret = 0; + if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) { + from_extent_allocation = true; + force = CHUNK_ALLOC_FORCE; + } + /* Don't re-enter if we're already allocating a chunk */ if (trans->allocating_chunk) return -ENOSPC; @@ -3750,9 +3751,22 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, force_metadata_allocation(fs_info); } - ret = do_chunk_alloc(trans, flags); + ret_bg = do_chunk_alloc(trans, flags); trans->allocating_chunk = false; + if (IS_ERR(ret_bg)) { + ret = PTR_ERR(ret_bg); + } else if (from_extent_allocation) { + /* + * New block group is likely to be used soon. Try to activate + * it now. Failure is OK for now. + */ + btrfs_zone_activate(ret_bg); + } + + if (!ret) + btrfs_put_block_group(ret_bg); + spin_lock(&space_info->lock); if (ret < 0) { if (ret == -ENOSPC) diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 93aabc68bb6a..e8308f2ad07d 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -35,11 +35,15 @@ enum btrfs_discard_state { * the FS with empty chunks * * CHUNK_ALLOC_FORCE means it must try to allocate one + * + * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from + * find_free_extent() that also activaes the zone */ enum btrfs_chunk_alloc_enum { CHUNK_ALLOC_NO_FORCE, CHUNK_ALLOC_LIMITED, CHUNK_ALLOC_FORCE, + CHUNK_ALLOC_FORCE_FOR_EXTENT, }; struct btrfs_caching_control { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index be476f094300..19bf36d8ffea 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -537,6 +537,9 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb->orig_bio = NULL; cb->nr_pages = nr_pages; + if (blkcg_css) + kthread_associate_blkcg(blkcg_css); + while (cur_disk_bytenr < disk_start + compressed_len) { u64 offset = cur_disk_bytenr - disk_start; unsigned int index = offset >> PAGE_SHIFT; @@ -555,6 +558,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, bio = NULL; goto finish_cb; } + if (blkcg_css) + bio->bi_opf |= REQ_CGROUP_PUNT; } /* * We should never reach next_stripe_start start as we will @@ -612,6 +617,9 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, return 0; finish_cb: + if (blkcg_css) + kthread_associate_blkcg(NULL); + if (bio) { bio->bi_status = ret; bio_endio(bio); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b30309f187cf..126f244cdf88 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1850,9 +1850,10 @@ again: ret = btrfs_insert_fs_root(fs_info, root); if (ret) { - btrfs_put_root(root); - if (ret == -EEXIST) + if (ret == -EEXIST) { + btrfs_put_root(root); goto again; + } goto fail; } return root; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f477035a2ac2..6aa92f84f465 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4082,7 +4082,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, } ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, - CHUNK_ALLOC_FORCE); + CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ if (ret == -ENOSPC) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0399cf8e3c32..151e9da5da2d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -118,7 +118,7 @@ struct btrfs_bio_ctrl { */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - unsigned int bytes_changed; + u64 bytes_changed; /* Changed ranges */ struct ulist range_changed; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9f455c96c974..380054c94e4b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2957,8 +2957,9 @@ out: return ret; } -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; @@ -2990,6 +2991,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } + ret = file_modified(file); + if (ret) + goto out_only_mutex; + lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); lockend = round_down(offset + len, btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; @@ -3430,7 +3435,7 @@ static long btrfs_fallocate(struct file *file, int mode, return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) - return btrfs_punch_hole(inode, offset, len); + return btrfs_punch_hole(file, offset, len); /* * Only trigger disk allocation, don't trigger qgroup reserve @@ -3452,6 +3457,10 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; } + ret = file_modified(file); + if (ret) + goto out; + /* * TODO: Move these two operations after we have checked * accurate reserved space, or fallocate can still fail but diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6bfc4343c98d..5082b9c70f8c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1128,7 +1128,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, int ret = 0; if (btrfs_is_free_space_inode(inode)) { - WARN_ON_ONCE(1); ret = -EINVAL; goto out_unlock; } @@ -2017,8 +2016,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page * to use run_delalloc_nocow() here, like for regular * preallocated inodes. */ - ASSERT(!zoned || - (zoned && btrfs_is_data_reloc_root(inode->root))); + ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); } else if (!inode_can_compress(inode) || @@ -4488,6 +4486,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) dest->root_key.objectid); return -EPERM; } + if (atomic_read(&dest->nr_swapfiles)) { + spin_unlock(&dest->root_item_lock); + btrfs_warn(fs_info, + "attempt to delete subvolume %llu with active swapfile", + root->root_key.objectid); + return -EPERM; + } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); @@ -7438,6 +7443,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, u64 block_start, orig_start, orig_block_len, ram_bytes; bool can_nocow = false; bool space_reserved = false; + u64 prev_len; int ret = 0; /* @@ -7465,6 +7471,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, can_nocow = true; } + prev_len = len; if (can_nocow) { struct extent_map *em2; @@ -7494,8 +7501,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, goto out; } } else { - const u64 prev_len = len; - /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; @@ -7526,7 +7531,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, * We have created our ordered extent, so we can now release our reservation * for an outstanding extent. */ - btrfs_delalloc_release_extents(BTRFS_I(inode), len); + btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); /* * Need to update the i_size under the extent lock so buffered @@ -11107,8 +11112,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, * set. We use this counter to prevent snapshots. We must increment it * before walking the extents because we don't want a concurrent * snapshot to run after we've already checked the extents. + * + * It is possible that subvolume is marked for deletion but still not + * removed yet. To prevent this race, we check the root status before + * activating the swapfile. */ + spin_lock(&root->root_item_lock); + if (btrfs_root_dead(root)) { + spin_unlock(&root->root_item_lock); + + btrfs_exclop_finish(fs_info); + btrfs_warn(fs_info, + "cannot activate swapfile because subvolume %llu is being deleted", + root->root_key.objectid); + return -EPERM; + } atomic_inc(&root->nr_swapfiles); + spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 238cee5b5254..be6c24577dbe 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1239,7 +1239,7 @@ static u32 get_extent_max_capacity(const struct extent_map *em) } static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, - bool locked) + u32 extent_thresh, u64 newer_than, bool locked) { struct extent_map *next; bool ret = false; @@ -1249,11 +1249,12 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, return false; /* - * We want to check if the next extent can be merged with the current - * one, which can be an extent created in a past generation, so we pass - * a minimum generation of 0 to defrag_lookup_extent(). + * Here we need to pass @newer_then when checking the next extent, or + * we will hit a case we mark current extent for defrag, but the next + * one will not be a target. + * This will just cause extra IO without really reducing the fragments. */ - next = defrag_lookup_extent(inode, em->start + em->len, 0, locked); + next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); /* No more em or hole */ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) goto out; @@ -1265,6 +1266,13 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, */ if (next->len >= get_extent_max_capacity(em)) goto out; + /* Skip older extent */ + if (next->generation < newer_than) + goto out; + /* Also check extent size */ + if (next->len >= extent_thresh) + goto out; + ret = true; out: free_extent_map(next); @@ -1470,7 +1478,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, goto next; next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, - locked); + extent_thresh, newer_than, locked); if (!next_mergeable) { struct defrag_target_range *last; @@ -5448,8 +5456,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_fs_info(fs_info, argp); case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); - case BTRFS_IOC_BALANCE: - return btrfs_ioctl_balance(file, NULL); case BTRFS_IOC_TREE_SEARCH: return btrfs_ioctl_tree_search(inode, argp); case BTRFS_IOC_TREE_SEARCH_V2: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1be7cb2f955f..a8cc736731fd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1896,23 +1896,18 @@ static void update_dev_time(const char *device_path) path_put(&path); } -static int btrfs_rm_dev_item(struct btrfs_device *device) +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, + struct btrfs_device *device) { struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_trans_handle *trans; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) if (ret) { if (ret > 0) ret = -ENOENT; - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - } - out: btrfs_free_path(path); - if (!ret) - ret = btrfs_commit_transaction(trans); return ret; } @@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode) { + struct btrfs_trans_handle *trans; struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) - goto out; + return ret; device = btrfs_find_device(fs_info->fs_devices, args); if (!device) { @@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = -ENOENT; - goto out; + return ret; } if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", rcu_str_deref(device->name), device->devid); - ret = -ETXTBSY; - goto out; + return -ETXTBSY; } - if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto out; - } + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + return BTRFS_ERROR_DEV_TGT_REPLACE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && - fs_info->fs_devices->rw_devices == 1) { - ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto out; - } + fs_info->fs_devices->rw_devices == 1) + return BTRFS_ERROR_DEV_ONLY_WRITABLE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); @@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (ret) goto error_undo; - /* - * TODO: the superblock still includes this device in its num_devices - * counter although write_all_supers() is not locked out. This - * could give a filesystem state which requires a degraded mount. - */ - ret = btrfs_rm_dev_item(device); - if (ret) + trans = btrfs_start_transaction(fs_info->chunk_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto error_undo; + } + + ret = btrfs_rm_dev_item(trans, device); + if (ret) { + /* Any error in dev item removal is critical */ + btrfs_crit(fs_info, + "failed to remove device item for devid %llu: %d", + device->devid, ret); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(device); @@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, free_fs_devices(cur_devices); } -out: + ret = btrfs_commit_transaction(trans); + return ret; error_undo: @@ -2240,7 +2231,7 @@ error_undo: device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } - goto out; + return ret; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) @@ -4439,10 +4430,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; + sb_start_write(fs_info->sb); mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); mutex_unlock(&fs_info->balance_mutex); + sb_end_write(fs_info->sb); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b7b5fac1c779..1b1b310c3c51 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1801,7 +1801,6 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, map = em->map_lookup; /* We only support single profile for now */ - ASSERT(map->num_stripes == 1); device = map->stripes[0].dev; free_extent_map(em); @@ -1976,18 +1975,16 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) { + struct btrfs_fs_info *fs_info = fs_devices->fs_info; struct btrfs_device *device; bool ret = false; - if (!btrfs_is_zoned(fs_devices->fs_info)) + if (!btrfs_is_zoned(fs_info)) return true; - /* Non-single profiles are not supported yet */ - ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0); - /* Check if there is a device with active zones left */ - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + mutex_lock(&fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { struct btrfs_zoned_device_info *zinfo = device->zone_info; if (!device->bdev) @@ -1999,7 +1996,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) break; } } - mutex_unlock(&fs_devices->device_list_mutex); + mutex_unlock(&fs_info->chunk_mutex); return ret; } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index f256c8aff7bb..ca9f3e4ec4b3 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -57,6 +57,16 @@ static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object, trace_cachefiles_mark_inactive(object, inode); } +static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object, + struct dentry *dentry) +{ + struct inode *inode = d_backing_inode(dentry); + + inode_lock(inode); + __cachefiles_unmark_inode_in_use(object, dentry); + inode_unlock(inode); +} + /* * Unmark a backing inode and tell cachefilesd that there's something that can * be culled. @@ -68,9 +78,7 @@ void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, struct inode *inode = file_inode(file); if (inode) { - inode_lock(inode); - __cachefiles_unmark_inode_in_use(object, file->f_path.dentry); - inode_unlock(inode); + cachefiles_do_unmark_inode_in_use(object, file->f_path.dentry); if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { atomic_long_add(inode->i_blocks, &cache->b_released); @@ -484,7 +492,7 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) object, d_backing_inode(path.dentry), ret, cachefiles_trace_trunc_error); file = ERR_PTR(ret); - goto out_dput; + goto out_unuse; } } @@ -494,15 +502,20 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry), PTR_ERR(file), cachefiles_trace_open_error); - goto out_dput; + goto out_unuse; } if (unlikely(!file->f_op->read_iter) || unlikely(!file->f_op->write_iter)) { fput(file); pr_notice("Cache does not support read_iter and write_iter\n"); file = ERR_PTR(-EINVAL); + goto out_unuse; } + goto out_dput; + +out_unuse: + cachefiles_do_unmark_inode_in_use(object, path.dentry); out_dput: dput(path.dentry); out: @@ -590,14 +603,16 @@ static bool cachefiles_open_file(struct cachefiles_object *object, check_failed: fscache_cookie_lookup_negative(object->cookie); cachefiles_unmark_inode_in_use(object, file); - if (ret == -ESTALE) { - fput(file); - dput(dentry); + fput(file); + dput(dentry); + if (ret == -ESTALE) return cachefiles_create_file(object); - } + return false; + error_fput: fput(file); error: + cachefiles_do_unmark_inode_in_use(object, dentry); dput(dentry); return false; } diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 35465109d9c4..00b087c14995 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -203,7 +203,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) if (!buf) return false; buf->reserved = cpu_to_be32(0); - memcpy(buf->data, p, len); + memcpy(buf->data, p, volume->vcookie->coherency_len); ret = cachefiles_inject_write_error(); if (ret == 0) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index a47fa44b6d52..2b1a1c029c75 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -266,22 +266,24 @@ static void cifs_kill_sb(struct super_block *sb) * before we kill the sb. */ if (cifs_sb->root) { + for (node = rb_first(root); node; node = rb_next(node)) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + tcon = tlink_tcon(tlink); + if (IS_ERR(tcon)) + continue; + cfid = &tcon->crfid; + mutex_lock(&cfid->fid_mutex); + if (cfid->dentry) { + dput(cfid->dentry); + cfid->dentry = NULL; + } + mutex_unlock(&cfid->fid_mutex); + } + + /* finally release root dentry */ dput(cifs_sb->root); cifs_sb->root = NULL; } - node = rb_first(root); - while (node != NULL) { - tlink = rb_entry(node, struct tcon_link, tl_rbnode); - tcon = tlink_tcon(tlink); - cfid = &tcon->crfid; - mutex_lock(&cfid->fid_mutex); - if (cfid->dentry) { - dput(cfid->dentry); - cfid->dentry = NULL; - } - mutex_unlock(&cfid->fid_mutex); - node = rb_next(node); - } kill_anon_super(sb); cifs_umount(cifs_sb); @@ -944,7 +946,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); - if (iocb->ki_filp->f_flags & O_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) return cifs_user_readv(iocb, iter); rc = cifs_revalidate_mapping(inode); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 15a5c5db038b..c0542bdcd06b 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -153,5 +153,5 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ #define SMB3_PRODUCT_BUILD 35 -#define CIFS_VERSION "2.35" +#define CIFS_VERSION "2.36" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ee3b7c15e884..42e14f408856 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -453,9 +453,7 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_ return rc; } -static int -reconnect_dfs_server(struct TCP_Server_Info *server, - bool mark_smb_session) +static int reconnect_dfs_server(struct TCP_Server_Info *server) { int rc = 0; const char *refpath = server->current_fullpath + 1; @@ -479,7 +477,12 @@ reconnect_dfs_server(struct TCP_Server_Info *server, if (!cifs_tcp_ses_needs_reconnect(server, num_targets)) return 0; - cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session); + /* + * Unconditionally mark all sessions & tcons for reconnect as we might be connecting to a + * different server or share during failover. It could be improved by adding some logic to + * only do that in case it connects to a different server or share, though. + */ + cifs_mark_tcp_ses_conns_for_reconnect(server, true); cifs_abort_connection(server); @@ -531,13 +534,20 @@ int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { /* If tcp session is not an dfs connection, then reconnect to last target server */ spin_lock(&cifs_tcp_ses_lock); - if (!server->is_dfs_conn || !server->origin_fullpath || !server->leaf_fullpath) { + if (!server->is_dfs_conn) { spin_unlock(&cifs_tcp_ses_lock); return __cifs_reconnect(server, mark_smb_session); } spin_unlock(&cifs_tcp_ses_lock); - return reconnect_dfs_server(server, mark_smb_session); + mutex_lock(&server->refpath_lock); + if (!server->origin_fullpath || !server->leaf_fullpath) { + mutex_unlock(&server->refpath_lock); + return __cifs_reconnect(server, mark_smb_session); + } + mutex_unlock(&server->refpath_lock); + + return reconnect_dfs_server(server); } #else int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) @@ -1046,7 +1056,7 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) spin_unlock(&server->req_lock); wake_up(&server->request_q); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_hdr_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, le16_to_cpu(shdr->CreditRequest), in_flight); cifs_server_dbg(FYI, "%s: added %u credits total=%d\n", @@ -3672,9 +3682,11 @@ static void setup_server_referral_paths(struct mount_ctx *mnt_ctx) { struct TCP_Server_Info *server = mnt_ctx->server; + mutex_lock(&server->refpath_lock); server->origin_fullpath = mnt_ctx->origin_fullpath; server->leaf_fullpath = mnt_ctx->leaf_fullpath; server->current_fullpath = mnt_ctx->leaf_fullpath; + mutex_unlock(&server->refpath_lock); mnt_ctx->origin_fullpath = mnt_ctx->leaf_fullpath = NULL; } @@ -4465,7 +4477,7 @@ static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tco */ if (rc && server->current_fullpath != server->origin_fullpath) { server->current_fullpath = server->origin_fullpath; - cifs_reconnect(tcon->ses->server, true); + cifs_signal_cifsd_for_reconnect(server, true); } dfs_cache_free_tgts(tl); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 30e040da4f09..956f8e5cf3e7 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1422,12 +1422,14 @@ static int refresh_tcon(struct cifs_ses **sessions, struct cifs_tcon *tcon, bool struct TCP_Server_Info *server = tcon->ses->server; mutex_lock(&server->refpath_lock); - if (strcasecmp(server->leaf_fullpath, server->origin_fullpath)) - __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh); + if (server->origin_fullpath) { + if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath, + server->origin_fullpath)) + __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh); + __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh); + } mutex_unlock(&server->refpath_lock); - __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh); - return 0; } @@ -1530,11 +1532,14 @@ static void refresh_mounts(struct cifs_ses **sessions) list_del_init(&tcon->ulist); mutex_lock(&server->refpath_lock); - if (strcasecmp(server->leaf_fullpath, server->origin_fullpath)) - __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false); + if (server->origin_fullpath) { + if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath, + server->origin_fullpath)) + __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false); + __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false); + } mutex_unlock(&server->refpath_lock); - __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false); cifs_put_tcon(tcon); } } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 852e54ee82c2..bbdf3281559c 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -85,6 +85,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, if (rc != 1) return -EINVAL; + if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) + return -EINVAL; + rc = symlink_hash(link_len, link_str, md5_hash); if (rc) { cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index ebe236b9d9f5..235aa1b395eb 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -896,7 +896,7 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr) if (class == ERRSRV && code == ERRbaduid) { cifs_dbg(FYI, "Server returned 0x%x, reconnecting session...\n", code); - cifs_reconnect(mid->server, false); + cifs_signal_cifsd_for_reconnect(mid->server, false); } } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index c653beb735b8..3fe47a88f47d 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -150,16 +150,18 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) struct smb2_transform_hdr *thdr = (struct smb2_transform_hdr *)buf; struct cifs_ses *ses = NULL; + struct cifs_ses *iter; /* decrypt frame now that it is completely read in */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) { - if (ses->Suid == le64_to_cpu(thdr->SessionId)) + list_for_each_entry(iter, &srvr->smb_ses_list, smb_ses_list) { + if (iter->Suid == le64_to_cpu(thdr->SessionId)) { + ses = iter; break; + } } spin_unlock(&cifs_tcp_ses_lock); - if (list_entry_is_head(ses, &srvr->smb_ses_list, - smb_ses_list)) { + if (!ses) { cifs_dbg(VFS, "no decryption - session id not found\n"); return 1; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index db23f5b404ba..d6aaeff4a30a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -86,6 +86,9 @@ smb2_add_credits(struct TCP_Server_Info *server, if (*val > 65000) { *val = 65000; /* Don't get near 64K credits, avoid srv bugs */ pr_warn_once("server overflowed SMB3 credits\n"); + trace_smb3_overflow_credits(server->CurrentMid, + server->conn_id, server->hostname, *val, + add, server->in_flight); } server->in_flight--; if (server->in_flight == 0 && @@ -251,7 +254,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, in_flight = server->in_flight; spin_unlock(&server->req_lock); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_wait_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, -(credits->value), in_flight); cifs_dbg(FYI, "%s: removed %u credits total=%d\n", __func__, credits->value, scredits); @@ -300,7 +303,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server, spin_unlock(&server->req_lock); wake_up(&server->request_q); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_adj_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, credits->value - new_val, in_flight); cifs_dbg(FYI, "%s: adjust added %u credits total=%d\n", @@ -1855,9 +1858,17 @@ smb2_copychunk_range(const unsigned int xid, int chunks_copied = 0; bool chunk_sizes_updated = false; ssize_t bytes_written, total_bytes_written = 0; + struct inode *inode; pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL); + /* + * We need to flush all unwritten data before we can send the + * copychunk ioctl to the server. + */ + inode = d_inode(trgtfile->dentry); + filemap_write_and_wait(inode->i_mapping); + if (pcchunk == NULL) return -ENOMEM; @@ -2492,7 +2503,7 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) spin_unlock(&server->req_lock); wake_up(&server->request_q); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_pend_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, le16_to_cpu(shdr->CreditRequest), in_flight); cifs_dbg(FYI, "%s: status pending add %u credits total=%d\n", diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 6cecf302dcfd..bc279616c513 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -1006,6 +1006,13 @@ DEFINE_SMB3_CREDIT_EVENT(credit_timeout); DEFINE_SMB3_CREDIT_EVENT(insufficient_credits); DEFINE_SMB3_CREDIT_EVENT(too_many_credits); DEFINE_SMB3_CREDIT_EVENT(add_credits); +DEFINE_SMB3_CREDIT_EVENT(adj_credits); +DEFINE_SMB3_CREDIT_EVENT(hdr_credits); +DEFINE_SMB3_CREDIT_EVENT(nblk_credits); +DEFINE_SMB3_CREDIT_EVENT(pend_credits); +DEFINE_SMB3_CREDIT_EVENT(wait_credits); +DEFINE_SMB3_CREDIT_EVENT(waitff_credits); +DEFINE_SMB3_CREDIT_EVENT(overflow_credits); DEFINE_SMB3_CREDIT_EVENT(set_credits); #endif /* _CIFS_TRACE_H */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index eeb1a699bd6f..c667e6ddfe2f 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -464,13 +464,12 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, return -EIO; } - tr_hdr = kmalloc(sizeof(*tr_hdr), GFP_NOFS); + tr_hdr = kzalloc(sizeof(*tr_hdr), GFP_NOFS); if (!tr_hdr) return -ENOMEM; memset(&cur_rqst[0], 0, sizeof(cur_rqst)); memset(&iov, 0, sizeof(iov)); - memset(tr_hdr, 0, sizeof(*tr_hdr)); iov.iov_base = tr_hdr; iov.iov_len = sizeof(*tr_hdr); @@ -542,7 +541,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, in_flight = server->in_flight; spin_unlock(&server->req_lock); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_nblk_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, -1, in_flight); cifs_dbg(FYI, "%s: remove %u credits total=%d\n", __func__, 1, scredits); @@ -648,7 +647,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, in_flight = server->in_flight; spin_unlock(&server->req_lock); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_waitff_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, -(num_credits), in_flight); cifs_dbg(FYI, "%s: remove %u credits total=%d\n", diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 0ed880f42525..e6dea6dfca16 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1066,12 +1066,9 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, /* wake up the caller thread for sync decompression */ if (sync) { - unsigned long flags; - - spin_lock_irqsave(&io->u.wait.lock, flags); if (!atomic_add_return(bios, &io->pending_bios)) - wake_up_locked(&io->u.wait); - spin_unlock_irqrestore(&io->u.wait.lock, flags); + complete(&io->u.done); + return; } @@ -1217,7 +1214,7 @@ jobqueue_init(struct super_block *sb, } else { fg_out: q = fgq; - init_waitqueue_head(&fgq->u.wait); + init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); } q->sb = sb; @@ -1419,8 +1416,7 @@ static void z_erofs_runqueue(struct super_block *sb, return; /* wait until all bios are completed */ - io_wait_event(io[JQ_SUBMIT].u.wait, - !atomic_read(&io[JQ_SUBMIT].pending_bios)); + wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index e043216b545f..800b11c53f57 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -97,7 +97,7 @@ struct z_erofs_decompressqueue { z_erofs_next_pcluster_t head; union { - wait_queue_head_t wait; + struct completion done; struct work_struct work; } u; }; diff --git a/fs/file_table.c b/fs/file_table.c index 7d2e692b66a9..ada8fe814db9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -412,6 +412,7 @@ void __fput_sync(struct file *file) } EXPORT_SYMBOL(fput); +EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 76316c4a3fb7..b313a978ae0a 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -38,6 +38,3 @@ config FSCACHE_DEBUG enabled by setting bits in /sys/modules/fscache/parameter/debug. See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_OLD_API - bool diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index 2749933852a9..d645f8b302a2 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -214,7 +214,7 @@ void fscache_relinquish_cache(struct fscache_cache *cache) cache->ops = NULL; cache->cache_priv = NULL; - smp_store_release(&cache->state, FSCACHE_CACHE_IS_NOT_PRESENT); + fscache_set_cache_state(cache, FSCACHE_CACHE_IS_NOT_PRESENT); fscache_put_cache(cache, where); } EXPORT_SYMBOL(fscache_relinquish_cache); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 9bb1ab5fe5ed..9d3cf0111709 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -30,7 +30,7 @@ static DEFINE_SPINLOCK(fscache_cookie_lru_lock); DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out); static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker); static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD"; -unsigned int fscache_lru_cookie_timeout = 10 * HZ; +static unsigned int fscache_lru_cookie_timeout = 10 * HZ; void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) { @@ -1069,6 +1069,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, } EXPORT_SYMBOL(__fscache_invalidate); +#ifdef CONFIG_PROC_FS /* * Generate a list of extant cookies in /proc/fs/fscache/cookies */ @@ -1145,3 +1146,4 @@ const struct seq_operations fscache_cookies_seq_ops = { .stop = fscache_cookies_seq_stop, .show = fscache_cookies_seq_show, }; +#endif diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index ed1c9ed737f2..1336f517e9b1 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -56,7 +56,9 @@ static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache, * cookie.c */ extern struct kmem_cache *fscache_cookie_jar; +#ifdef CONFIG_PROC_FS extern const struct seq_operations fscache_cookies_seq_ops; +#endif extern struct timer_list fscache_cookie_lru_timer; extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix); @@ -137,7 +139,9 @@ int fscache_stats_show(struct seq_file *m, void *v); /* * volume.c */ +#ifdef CONFIG_PROC_FS extern const struct seq_operations fscache_volumes_seq_ops; +#endif struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, enum fscache_volume_trace where); diff --git a/fs/fscache/io.c b/fs/fscache/io.c index c8c7fe9e9a6e..3af3b08a9bb3 100644 --- a/fs/fscache/io.c +++ b/fs/fscache/io.c @@ -235,8 +235,7 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, { struct fscache_write_request *wreq = priv; - fscache_clear_page_bits(fscache_cres_cookie(&wreq->cache_resources), - wreq->mapping, wreq->start, wreq->len, + fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len, wreq->set_bits); if (wreq->term_func) @@ -296,7 +295,7 @@ abandon_end: abandon_free: kfree(wreq); abandon: - fscache_clear_page_bits(cookie, mapping, start, len, cond); + fscache_clear_page_bits(mapping, start, len, cond); if (term_func) term_func(term_func_priv, ret, false); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 99c7477cee5c..dd3a088db11d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); addr = vm_unmapped_area(&info); } @@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + const unsigned long mmap_end = arch_get_mmap_end(addr); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/fs/io_uring.c b/fs/io_uring.c index a8413f006417..4479013854d2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -63,7 +63,6 @@ #include <net/sock.h> #include <net/af_unix.h> #include <net/scm.h> -#include <net/busy_poll.h> #include <linux/anon_inodes.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> @@ -112,8 +111,7 @@ IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ - REQ_F_ASYNC_DATA) + REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA) #define IO_TCTX_REFS_CACHE_NR (1U << 10) @@ -412,11 +410,6 @@ struct io_ring_ctx { struct list_head sqd_list; unsigned long check_cq_overflow; -#ifdef CONFIG_NET_RX_BUSY_POLL - /* used to track busy poll napi_id */ - struct list_head napi_list; - spinlock_t napi_lock; /* napi_list lock */ -#endif struct { unsigned cached_cq_tail; @@ -500,7 +493,6 @@ struct io_uring_task { const struct io_ring_ctx *last; struct io_wq *io_wq; struct percpu_counter inflight; - atomic_t inflight_tracked; atomic_t in_idle; spinlock_t task_lock; @@ -592,7 +584,8 @@ struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; u64 addr; - u64 len; + u32 len; + u32 flags; }; struct io_connect { @@ -654,10 +647,10 @@ struct io_epoll { struct io_splice { struct file *file_out; - struct file *file_in; loff_t off_out; loff_t off_in; u64 len; + int splice_fd_in; unsigned int flags; }; @@ -914,7 +907,11 @@ struct io_kiocb { u64 user_data; u32 result; - u32 cflags; + /* fd initially, then cflags for completion */ + union { + u32 cflags; + int fd; + }; struct io_ring_ctx *ctx; struct task_struct *task; @@ -923,8 +920,12 @@ struct io_kiocb { /* store used ubuf, so we can prevent reloading */ struct io_mapped_ubuf *imu; - /* used by request caches, completion batching and iopoll */ - struct io_wq_work_node comp_list; + union { + /* used by request caches, completion batching and iopoll */ + struct io_wq_work_node comp_list; + /* cache ->apoll->events */ + int apoll_events; + }; atomic_t refs; atomic_t poll_refs; struct io_task_work io_task_work; @@ -1182,8 +1183,11 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); -static struct file *io_file_get(struct io_ring_ctx *ctx, - struct io_kiocb *req, int fd, bool fixed); +static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, + unsigned issue_flags); +static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd); +static void io_drop_inflight_file(struct io_kiocb *req); +static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags); static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); @@ -1313,13 +1317,20 @@ static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) } static inline void io_req_set_rsrc_node(struct io_kiocb *req, - struct io_ring_ctx *ctx) + struct io_ring_ctx *ctx, + unsigned int issue_flags) { if (!req->fixed_rsrc_refs) { req->fixed_rsrc_refs = &ctx->rsrc_node->refs; - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); + + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + lockdep_assert_held(&ctx->uring_lock); + ctx->rsrc_cached_refs--; + if (unlikely(ctx->rsrc_cached_refs < 0)) + io_rsrc_refs_refill(ctx); + } else { + percpu_ref_get(req->fixed_rsrc_refs); + } } } @@ -1424,29 +1435,9 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, bool cancel_all) __must_hold(&req->ctx->timeout_lock) { - struct io_kiocb *req; - if (task && head->task != task) return false; - if (cancel_all) - return true; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; -} - -static bool io_match_linked(struct io_kiocb *head) -{ - struct io_kiocb *req; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; + return cancel_all; } /* @@ -1456,24 +1447,9 @@ static bool io_match_linked(struct io_kiocb *head) static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all) { - bool matched; - if (task && head->task != task) return false; - if (cancel_all) - return true; - - if (head->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = head->ctx; - - /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); - matched = io_match_linked(head); - spin_unlock_irq(&ctx->timeout_lock); - } else { - matched = io_match_linked(head); - } - return matched; + return cancel_all; } static inline bool req_has_async_data(struct io_kiocb *req) @@ -1595,10 +1571,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); -#ifdef CONFIG_NET_RX_BUSY_POLL - INIT_LIST_HEAD(&ctx->napi_list); - spin_lock_init(&ctx->napi_lock); -#endif return ctx; err: kfree(ctx->dummy_ubuf); @@ -1636,14 +1608,6 @@ static inline bool io_req_ffs_set(struct io_kiocb *req) return req->flags & REQ_F_FIXED_FILE; } -static inline void io_req_track_inflight(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_INFLIGHT)) { - req->flags |= REQ_F_INFLIGHT; - atomic_inc(¤t->io_uring->inflight_tracked); - } -} - static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { if (WARN_ON_ONCE(!req->link)) @@ -1687,14 +1651,6 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - - switch (req->opcode) { - case IORING_OP_SPLICE: - case IORING_OP_TEE: - if (!S_ISREG(file_inode(req->splice.file_in)->i_mode)) - req->work.flags |= IO_WQ_WORK_UNBOUND; - break; - } } static void io_prep_async_link(struct io_kiocb *req) @@ -1788,12 +1744,11 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + struct io_kiocb *req, *tmp; spin_lock_irq(&ctx->timeout_lock); - while (!list_empty(&ctx->timeout_list)) { + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { u32 events_needed, events_got; - struct io_kiocb *req = list_first_entry(&ctx->timeout_list, - struct io_kiocb, timeout.list); if (io_is_timeout_noseq(req)) break; @@ -1810,7 +1765,6 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) if (events_got < events_needed) break; - list_del_init(&req->timeout.list); io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; @@ -2562,6 +2516,8 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) WARN_ON_ONCE(!tctx); + io_drop_inflight_file(req); + spin_lock_irqsave(&tctx->task_lock, flags); if (priority) wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list); @@ -3186,42 +3142,11 @@ static inline bool io_file_supports_nowait(struct io_kiocb *req) static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw.kiocb; - struct file *file = req->file; unsigned ioprio; int ret; - if (!io_req_ffs_set(req)) - req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; - kiocb->ki_pos = READ_ONCE(sqe->off); - kiocb->ki_flags = iocb_flags(file); - ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); - if (unlikely(ret)) - return ret; - - /* - * If the file is marked O_NONBLOCK, still allow retry for it if it - * supports async. Otherwise it's impossible to use O_NONBLOCK files - * reliably. If not, or it IOCB_NOWAIT is set, don't retry. - */ - if ((kiocb->ki_flags & IOCB_NOWAIT) || - ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) - req->flags |= REQ_F_NOWAIT; - - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) - return -EOPNOTSUPP; - - kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; - kiocb->ki_complete = io_complete_rw_iopoll; - req->iopoll_completed = 0; - } else { - if (kiocb->ki_flags & IOCB_HIPRI) - return -EINVAL; - kiocb->ki_complete = io_complete_rw; - } ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { @@ -3237,6 +3162,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->imu = NULL; req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); + req->rw.flags = READ_ONCE(sqe->rw_flags); req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -3265,19 +3191,18 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) { struct kiocb *kiocb = &req->rw.kiocb; - bool is_stream = req->file->f_mode & FMODE_STREAM; - if (kiocb->ki_pos == -1) { - if (!is_stream) { - req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = req->file->f_pos; - return &kiocb->ki_pos; - } else { - kiocb->ki_pos = 0; - return NULL; - } + if (kiocb->ki_pos != -1) + return &kiocb->ki_pos; + + if (!(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; + return &kiocb->ki_pos; } - return is_stream ? NULL : &kiocb->ki_pos; + + kiocb->ki_pos = 0; + return NULL; } static void kiocb_done(struct io_kiocb *req, ssize_t ret, @@ -3367,7 +3292,8 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter return 0; } -static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) +static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, + unsigned int issue_flags) { struct io_mapped_ubuf *imu = req->imu; u16 index, buf_index = req->buf_index; @@ -3377,7 +3303,7 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; - io_req_set_rsrc_node(req, ctx); + io_req_set_rsrc_node(req, ctx, issue_flags); index = array_index_nospec(buf_index, ctx->nr_user_bufs); imu = READ_ONCE(ctx->user_bufs[index]); req->imu = imu; @@ -3539,7 +3465,7 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, ssize_t ret; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(req, rw, iter); + ret = io_import_fixed(req, rw, iter, issue_flags); if (ret) return ERR_PTR(ret); return NULL; @@ -3740,13 +3666,6 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) return 0; } -static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - if (unlikely(!(req->file->f_mode & FMODE_READ))) - return -EBADF; - return io_prep_rw(req, sqe); -} - /* * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. @@ -3834,6 +3753,49 @@ static bool need_read_all(struct io_kiocb *req) S_ISBLK(file_inode(req->file)->i_mode); } +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) +{ + struct kiocb *kiocb = &req->rw.kiocb; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (unlikely(!file || !(file->f_mode & mode))) + return -EBADF; + + if (!io_req_ffs_set(req)) + req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; + + kiocb->ki_flags = iocb_flags(file); + ret = kiocb_set_rw_flags(kiocb, req->rw.flags); + if (unlikely(ret)) + return ret; + + /* + * If the file is marked O_NONBLOCK, still allow retry for it if it + * supports async. Otherwise it's impossible to use O_NONBLOCK files + * reliably. If not, or it IOCB_NOWAIT is set, don't retry. + */ + if ((kiocb->ki_flags & IOCB_NOWAIT) || + ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) + req->flags |= REQ_F_NOWAIT; + + if (ctx->flags & IORING_SETUP_IOPOLL) { + if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) + return -EOPNOTSUPP; + + kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; + kiocb->ki_complete = io_complete_rw_iopoll; + req->iopoll_completed = 0; + } else { + if (kiocb->ki_flags & IOCB_HIPRI) + return -EINVAL; + kiocb->ki_complete = io_complete_rw; + } + + return 0; +} + static int io_read(struct io_kiocb *req, unsigned int issue_flags) { struct io_rw_state __s, *s = &__s; @@ -3869,6 +3831,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) iov_iter_restore(&s->iter, &s->iter_state); iovec = NULL; } + ret = io_rw_init_file(req, FMODE_READ); + if (unlikely(ret)) + return ret; req->result = iov_iter_count(&s->iter); if (force_nonblock) { @@ -3972,13 +3937,6 @@ out_free: return 0; } -static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - if (unlikely(!(req->file->f_mode & FMODE_WRITE))) - return -EBADF; - return io_prep_rw(req, sqe); -} - static int io_write(struct io_kiocb *req, unsigned int issue_flags) { struct io_rw_state __s, *s = &__s; @@ -3999,6 +3957,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) iov_iter_restore(&s->iter, &s->iter_state); iovec = NULL; } + ret = io_rw_init_file(req, FMODE_WRITE); + if (unlikely(ret)) + return ret; req->result = iov_iter_count(&s->iter); if (force_nonblock) { @@ -4369,18 +4330,11 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - sp->file_in = NULL; sp->len = READ_ONCE(sqe->len); sp->flags = READ_ONCE(sqe->splice_flags); - if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; - - sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in), - (sp->flags & SPLICE_F_FD_IN_FIXED)); - if (!sp->file_in) - return -EBADF; - req->flags |= REQ_F_NEED_CLEANUP; + sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); return 0; } @@ -4395,20 +4349,29 @@ static int io_tee_prep(struct io_kiocb *req, static int io_tee(struct io_kiocb *req, unsigned int issue_flags) { struct io_splice *sp = &req->splice; - struct file *in = sp->file_in; struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + struct file *in; long ret = 0; if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; + + if (sp->flags & SPLICE_F_FD_IN_FIXED) + in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); + else + in = io_file_get_normal(req, sp->splice_fd_in); + if (!in) { + ret = -EBADF; + goto done; + } + if (sp->len) ret = do_tee(in, out, sp->len, flags); if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) io_put_file(in); - req->flags &= ~REQ_F_NEED_CLEANUP; - +done: if (ret != sp->len) req_set_fail(req); io_req_complete(req, ret); @@ -4427,15 +4390,24 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_splice(struct io_kiocb *req, unsigned int issue_flags) { struct io_splice *sp = &req->splice; - struct file *in = sp->file_in; struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; loff_t *poff_in, *poff_out; + struct file *in; long ret = 0; if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; + if (sp->flags & SPLICE_F_FD_IN_FIXED) + in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); + else + in = io_file_get_normal(req, sp->splice_fd_in); + if (!in) { + ret = -EBADF; + goto done; + } + poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; @@ -4444,8 +4416,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) io_put_file(in); - req->flags &= ~REQ_F_NEED_CLEANUP; - +done: if (ret != sp->len) req_set_fail(req); io_req_complete(req, ret); @@ -4513,9 +4484,6 @@ static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - if (!req->file) - return -EBADF; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || @@ -5757,108 +5725,6 @@ IO_NETOP_FN(send); IO_NETOP_FN(recv); #endif /* CONFIG_NET */ -#ifdef CONFIG_NET_RX_BUSY_POLL - -#define NAPI_TIMEOUT (60 * SEC_CONVERSION) - -struct napi_entry { - struct list_head list; - unsigned int napi_id; - unsigned long timeout; -}; - -/* - * Add busy poll NAPI ID from sk. - */ -static void io_add_napi(struct file *file, struct io_ring_ctx *ctx) -{ - unsigned int napi_id; - struct socket *sock; - struct sock *sk; - struct napi_entry *ne; - - if (!net_busy_loop_on()) - return; - - sock = sock_from_file(file); - if (!sock) - return; - - sk = sock->sk; - if (!sk) - return; - - napi_id = READ_ONCE(sk->sk_napi_id); - - /* Non-NAPI IDs can be rejected */ - if (napi_id < MIN_NAPI_ID) - return; - - spin_lock(&ctx->napi_lock); - list_for_each_entry(ne, &ctx->napi_list, list) { - if (ne->napi_id == napi_id) { - ne->timeout = jiffies + NAPI_TIMEOUT; - goto out; - } - } - - ne = kmalloc(sizeof(*ne), GFP_NOWAIT); - if (!ne) - goto out; - - ne->napi_id = napi_id; - ne->timeout = jiffies + NAPI_TIMEOUT; - list_add_tail(&ne->list, &ctx->napi_list); -out: - spin_unlock(&ctx->napi_lock); -} - -static inline void io_check_napi_entry_timeout(struct napi_entry *ne) -{ - if (time_after(jiffies, ne->timeout)) { - list_del(&ne->list); - kfree(ne); - } -} - -/* - * Busy poll if globally on and supporting sockets found - */ -static bool io_napi_busy_loop(struct list_head *napi_list) -{ - struct napi_entry *ne, *n; - - list_for_each_entry_safe(ne, n, napi_list, list) { - napi_busy_loop(ne->napi_id, NULL, NULL, true, - BUSY_POLL_BUDGET); - io_check_napi_entry_timeout(ne); - } - return !list_empty(napi_list); -} - -static void io_free_napi_list(struct io_ring_ctx *ctx) -{ - spin_lock(&ctx->napi_lock); - while (!list_empty(&ctx->napi_list)) { - struct napi_entry *ne = - list_first_entry(&ctx->napi_list, struct napi_entry, - list); - - list_del(&ne->list); - kfree(ne); - } - spin_unlock(&ctx->napi_lock); -} -#else -static inline void io_add_napi(struct file *file, struct io_ring_ctx *ctx) -{ -} - -static inline void io_free_napi_list(struct io_ring_ctx *ctx) -{ -} -#endif /* CONFIG_NET_RX_BUSY_POLL */ - struct io_poll_table { struct poll_table_struct pt; struct io_kiocb *req; @@ -5972,10 +5838,9 @@ static void io_poll_remove_entries(struct io_kiocb *req) * either spurious wakeup or multishot CQE is served. 0 when it's done with * the request, then the mask is stored in req->result. */ -static int io_poll_check_events(struct io_kiocb *req) +static int io_poll_check_events(struct io_kiocb *req, bool locked) { struct io_ring_ctx *ctx = req->ctx; - struct io_poll_iocb *poll = io_poll_get_single(req); int v; /* req->task == current here, checking PF_EXITING is safe */ @@ -5992,14 +5857,17 @@ static int io_poll_check_events(struct io_kiocb *req) return -ECANCELED; if (!req->result) { - struct poll_table_struct pt = { ._key = req->cflags }; + struct poll_table_struct pt = { ._key = req->apoll_events }; + unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED; - req->result = vfs_poll(req->file, &pt) & req->cflags; + if (unlikely(!io_assign_file(req, flags))) + return -EBADF; + req->result = vfs_poll(req->file, &pt) & req->apoll_events; } /* multishot, just fill an CQE and proceed */ - if (req->result && !(req->cflags & EPOLLONESHOT)) { - __poll_t mask = mangle_poll(req->result & poll->events); + if (req->result && !(req->apoll_events & EPOLLONESHOT)) { + __poll_t mask = mangle_poll(req->result & req->apoll_events); bool filled; spin_lock(&ctx->completion_lock); @@ -6010,7 +5878,6 @@ static int io_poll_check_events(struct io_kiocb *req) if (unlikely(!filled)) return -ECANCELED; io_cqring_ev_posted(ctx); - io_add_napi(req->file, ctx); } else if (req->result) { return 0; } @@ -6029,7 +5896,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req); + ret = io_poll_check_events(req, *locked); if (ret > 0) return; @@ -6054,7 +5921,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req); + ret = io_poll_check_events(req, *locked); if (ret > 0) return; @@ -6078,7 +5945,7 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, int events) * CPU. We want to avoid pulling in req->apoll->events for that * case. */ - req->cflags = events; + req->apoll_events = events; if (req->opcode == IORING_OP_POLL_ADD) req->io_task_work.func = io_poll_task_func; else @@ -6261,7 +6128,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req, __io_poll_execute(req, mask, poll->events); return 0; } - io_add_napi(req->file, req->ctx); /* * Release ownership. If someone tried to queue a tw while it was @@ -6471,7 +6337,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EINVAL; io_req_set_refcount(req); - req->cflags = poll->events = io_poll_parse_events(sqe, flags); + req->apoll_events = poll->events = io_poll_parse_events(sqe, flags); return 0; } @@ -6766,6 +6632,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) return -EINVAL; + INIT_LIST_HEAD(&req->timeout.list); data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); @@ -6972,6 +6839,7 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.nr = 0; up.tags = 0; up.resv = 0; + up.resv2 = 0; io_ring_submit_lock(ctx, needs_lock); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, @@ -6992,11 +6860,10 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: - return io_read_prep(req, sqe); case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: - return io_write_prep(req, sqe); + return io_prep_rw(req, sqe); case IORING_OP_POLL_ADD: return io_poll_add_prep(req, sqe); case IORING_OP_POLL_REMOVE: @@ -7179,11 +7046,6 @@ static void io_clean_op(struct io_kiocb *req) kfree(io->free_iov); break; } - case IORING_OP_SPLICE: - case IORING_OP_TEE: - if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(req->splice.file_in); - break; case IORING_OP_OPENAT: case IORING_OP_OPENAT2: if (req->open.filename) @@ -7218,11 +7080,6 @@ static void io_clean_op(struct io_kiocb *req) kfree(req->apoll); req->apoll = NULL; } - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - } if (req->flags & REQ_F_CREDS) put_cred(req->creds); if (req->flags & REQ_F_ASYNC_DATA) { @@ -7232,11 +7089,31 @@ static void io_clean_op(struct io_kiocb *req) req->flags &= ~IO_REQ_CLEAN_FLAGS; } +static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) +{ + if (req->file || !io_op_defs[req->opcode].needs_file) + return true; + + if (req->flags & REQ_F_FIXED_FILE) + req->file = io_file_get_fixed(req, req->fd, issue_flags); + else + req->file = io_file_get_normal(req, req->fd); + if (req->file) + return true; + + req_set_fail(req); + req->result = -EBADF; + return false; +} + static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { const struct cred *creds = NULL; int ret; + if (unlikely(!io_assign_file(req, issue_flags))) + return -EBADF; + if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) creds = override_creds(req->creds); @@ -7386,10 +7263,11 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) static void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + const struct io_op_def *def = &io_op_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED; bool needs_poll = false; struct io_kiocb *timeout; - int ret = 0; + int ret = 0, err = -ECANCELED; /* one will be dropped by ->io_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) @@ -7401,14 +7279,20 @@ static void io_wq_submit_work(struct io_wq_work *work) if (timeout) io_queue_linked_timeout(timeout); + /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) { - io_req_task_queue_fail(req, -ECANCELED); +fail: + io_req_task_queue_fail(req, err); return; } + if (!io_assign_file(req, issue_flags)) { + err = -EBADF; + work->flags |= IO_WQ_WORK_CANCEL; + goto fail; + } if (req->flags & REQ_F_FORCE_ASYNC) { - const struct io_op_def *def = &io_op_defs[req->opcode]; bool opcode_poll = def->pollin || def->pollout; if (opcode_poll && file_can_poll(req->file)) { @@ -7465,46 +7349,56 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file file_slot->file_ptr = file_ptr; } -static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, - struct io_kiocb *req, int fd) +static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, + unsigned int issue_flags) { - struct file *file; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = NULL; unsigned long file_ptr; + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_lock(&ctx->uring_lock); + if (unlikely((unsigned int)fd >= ctx->nr_user_files)) - return NULL; + goto out; fd = array_index_nospec(fd, ctx->nr_user_files); file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; file = (struct file *) (file_ptr & FFS_MASK); file_ptr &= ~FFS_MASK; /* mask in overlapping REQ_F and FFS bits */ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); - io_req_set_rsrc_node(req, ctx); + io_req_set_rsrc_node(req, ctx, 0); +out: + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); return file; } -static struct file *io_file_get_normal(struct io_ring_ctx *ctx, - struct io_kiocb *req, int fd) +/* + * Drop the file for requeue operations. Only used of req->file is the + * io_uring descriptor itself. + */ +static void io_drop_inflight_file(struct io_kiocb *req) +{ + if (unlikely(req->flags & REQ_F_INFLIGHT)) { + fput(req->file); + req->file = NULL; + req->flags &= ~REQ_F_INFLIGHT; + } +} + +static struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); - trace_io_uring_file_get(ctx, req, req->user_data, fd); + trace_io_uring_file_get(req->ctx, req, req->user_data, fd); /* we don't allow fixed io_uring files */ - if (file && unlikely(file->f_op == &io_uring_fops)) - io_req_track_inflight(req); + if (file && file->f_op == &io_uring_fops) + req->flags |= REQ_F_INFLIGHT; return file; } -static inline struct file *io_file_get(struct io_ring_ctx *ctx, - struct io_kiocb *req, int fd, bool fixed) -{ - if (fixed) - return io_file_get_fixed(ctx, req, fd); - else - return io_file_get_normal(ctx, req, fd); -} - static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) { struct io_kiocb *prev = req->timeout.prev; @@ -7744,6 +7638,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (io_op_defs[opcode].needs_file) { struct io_submit_state *state = &ctx->submit_state; + req->fd = READ_ONCE(sqe->fd); + /* * Plug now if we have more than 2 IO left after this, and the * target is potentially a read/write to block based storage. @@ -7753,11 +7649,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, state->need_plug = false; blk_start_plug_nr_ios(&state->plug, state->submit_nr); } - - req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), - (sqe_flags & IOSQE_FIXED_FILE)); - if (unlikely(!req->file)) - return -EBADF; } personality = READ_ONCE(sqe->personality); @@ -8032,13 +7923,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) !(ctx->flags & IORING_SETUP_R_DISABLED)) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); -#ifdef CONFIG_NET_RX_BUSY_POLL - spin_lock(&ctx->napi_lock); - if (!list_empty(&ctx->napi_list) && - io_napi_busy_loop(&ctx->napi_list)) - ++ret; - spin_unlock(&ctx->napi_lock); -#endif + if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) wake_up(&ctx->sqo_sq_wait); if (creds) @@ -8176,9 +8061,6 @@ struct io_wait_queue { struct io_ring_ctx *ctx; unsigned cq_tail; unsigned nr_timeouts; -#ifdef CONFIG_NET_RX_BUSY_POLL - unsigned busy_poll_to; -#endif }; static inline bool io_should_wake(struct io_wait_queue *iowq) @@ -8240,87 +8122,6 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, return 1; } -#ifdef CONFIG_NET_RX_BUSY_POLL -static void io_adjust_busy_loop_timeout(struct timespec64 *ts, - struct io_wait_queue *iowq) -{ - unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll); - struct timespec64 pollto = ns_to_timespec64(1000 * (s64)busy_poll_to); - - if (timespec64_compare(ts, &pollto) > 0) { - *ts = timespec64_sub(*ts, pollto); - iowq->busy_poll_to = busy_poll_to; - } else { - u64 to = timespec64_to_ns(ts); - - do_div(to, 1000); - iowq->busy_poll_to = to; - ts->tv_sec = 0; - ts->tv_nsec = 0; - } -} - -static inline bool io_busy_loop_timeout(unsigned long start_time, - unsigned long bp_usec) -{ - if (bp_usec) { - unsigned long end_time = start_time + bp_usec; - unsigned long now = busy_loop_current_time(); - - return time_after(now, end_time); - } - return true; -} - -static bool io_busy_loop_end(void *p, unsigned long start_time) -{ - struct io_wait_queue *iowq = p; - - return signal_pending(current) || - io_should_wake(iowq) || - io_busy_loop_timeout(start_time, iowq->busy_poll_to); -} - -static void io_blocking_napi_busy_loop(struct list_head *napi_list, - struct io_wait_queue *iowq) -{ - unsigned long start_time = - list_is_singular(napi_list) ? 0 : - busy_loop_current_time(); - - do { - if (list_is_singular(napi_list)) { - struct napi_entry *ne = - list_first_entry(napi_list, - struct napi_entry, list); - - napi_busy_loop(ne->napi_id, io_busy_loop_end, iowq, - true, BUSY_POLL_BUDGET); - io_check_napi_entry_timeout(ne); - break; - } - } while (io_napi_busy_loop(napi_list) && - !io_busy_loop_end(iowq, start_time)); -} - -static void io_putback_napi_list(struct io_ring_ctx *ctx, - struct list_head *napi_list) -{ - struct napi_entry *cne, *lne; - - spin_lock(&ctx->napi_lock); - list_for_each_entry(cne, &ctx->napi_list, list) - list_for_each_entry(lne, napi_list, list) - if (cne->napi_id == lne->napi_id) { - list_del(&lne->list); - kfree(lne); - break; - } - list_splice(napi_list, &ctx->napi_list); - spin_unlock(&ctx->napi_lock); -} -#endif /* CONFIG_NET_RX_BUSY_POLL */ - /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. @@ -8333,9 +8134,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, struct io_rings *rings = ctx->rings; ktime_t timeout = KTIME_MAX; int ret; -#ifdef CONFIG_NET_RX_BUSY_POLL - LIST_HEAD(local_napi_list); -#endif do { io_cqring_overflow_flush(ctx); @@ -8358,29 +8156,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } -#ifdef CONFIG_NET_RX_BUSY_POLL - iowq.busy_poll_to = 0; - if (!(ctx->flags & IORING_SETUP_SQPOLL)) { - spin_lock(&ctx->napi_lock); - list_splice_init(&ctx->napi_list, &local_napi_list); - spin_unlock(&ctx->napi_lock); - } -#endif if (uts) { struct timespec64 ts; if (get_timespec64(&ts, uts)) return -EFAULT; -#ifdef CONFIG_NET_RX_BUSY_POLL - if (!list_empty(&local_napi_list)) - io_adjust_busy_loop_timeout(&ts, &iowq); -#endif timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); } -#ifdef CONFIG_NET_RX_BUSY_POLL - else if (!list_empty(&local_napi_list)) - iowq.busy_poll_to = READ_ONCE(sysctl_net_busy_poll); -#endif init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; @@ -8390,12 +8172,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; trace_io_uring_cqring_wait(ctx, min_events); -#ifdef CONFIG_NET_RX_BUSY_POLL - if (iowq.busy_poll_to) - io_blocking_napi_busy_loop(&local_napi_list, &iowq); - if (!list_empty(&local_napi_list)) - io_putback_napi_list(ctx, &local_napi_list); -#endif do { /* if we can't even flush overflow, don't wait for more */ if (!io_cqring_overflow_flush(ctx)) { @@ -8864,8 +8640,12 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); - for (i = 0; i < nr_files; i++) - fput(fpl->fp[i]); + for (i = 0; i < nr; i++) { + struct file *file = io_file_from_index(ctx, i + offset); + + if (file) + fput(file); + } } else { kfree_skb(skb); free_uid(fpl->user); @@ -9156,13 +8936,15 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { + u64 *tag_slot = io_get_tag_slot(data, idx); struct io_rsrc_put *prsrc; prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); if (!prsrc) return -ENOMEM; - prsrc->tag = *io_get_tag_slot(data, idx); + prsrc->tag = *tag_slot; + *tag_slot = 0; prsrc->rsrc = rsrc; list_add(&prsrc->list, &node->rsrc_list); return 0; @@ -9231,7 +9013,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_fixed_file *file_slot; struct file *file; - int ret, i; + int ret; io_ring_submit_lock(ctx, needs_lock); ret = -ENXIO; @@ -9244,8 +9026,8 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) if (ret) goto out; - i = array_index_nospec(offset, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, i); + offset = array_index_nospec(offset, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, offset); ret = -EBADF; if (!file_slot->file_ptr) goto out; @@ -9301,8 +9083,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (file_slot->file_ptr) { file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, up->offset + done, - ctx->rsrc_node, file); + err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); if (err) break; file_slot->file_ptr = 0; @@ -9327,7 +9108,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - *io_get_tag_slot(data, up->offset + done) = tag; + *io_get_tag_slot(data, i) = tag; io_fixed_file_set(file_slot, file); err = io_sqe_file_register(ctx, file, i); if (err) { @@ -9411,7 +9192,6 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); atomic_set(&tctx->in_idle, 0); - atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); @@ -9986,7 +9766,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, i = array_index_nospec(offset, ctx->nr_user_bufs); if (ctx->user_bufs[i] != ctx->dummy_ubuf) { - err = io_queue_rsrc_removal(ctx->buf_data, offset, + err = io_queue_rsrc_removal(ctx->buf_data, i, ctx->rsrc_node, ctx->user_bufs[i]); if (unlikely(err)) { io_buffer_unmap(ctx, &imu); @@ -10181,7 +9961,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_req_caches_free(ctx); if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); - io_free_napi_list(ctx); kfree(ctx->cancel_hash); kfree(ctx->dummy_ubuf); kfree(ctx->io_buffers); @@ -10604,7 +10383,7 @@ static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) { if (tracked) - return atomic_read(&tctx->inflight_tracked); + return 0; return percpu_counter_sum(&tctx->inflight); } @@ -10755,6 +10534,11 @@ static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, break; } + if (reg.resv) { + ret = -EINVAL; + break; + } + if (reg.offset == -1U) { start = 0; end = IO_RINGFD_REG_MAX; @@ -10801,7 +10585,7 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, ret = -EFAULT; break; } - if (reg.offset >= IO_RINGFD_REG_MAX) { + if (reg.resv || reg.offset >= IO_RINGFD_REG_MAX) { ret = -EINVAL; break; } @@ -10928,6 +10712,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz return -EINVAL; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; + if (arg.pad) + return -EINVAL; *sig = u64_to_user_ptr(arg.sigmask); *argsz = arg.sigmask_sz; *ts = u64_to_user_ptr(arg.ts); @@ -11409,7 +11195,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | - IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP; + IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | + IORING_FEAT_LINKED_FILE; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -11620,8 +11407,6 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, __u32 tmp; int err; - if (up->resv) - return -EINVAL; if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; err = io_rsrc_node_switch_start(ctx); @@ -11647,6 +11432,8 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, memset(&up, 0, sizeof(up)); if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) return -EFAULT; + if (up.resv || up.resv2) + return -EINVAL; return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); } @@ -11659,7 +11446,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; - if (!up.nr || up.resv) + if (!up.nr || up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, type, &up, up.nr); } @@ -11707,7 +11494,15 @@ static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, if (len > cpumask_size()) len = cpumask_size(); - if (copy_from_user(new_mask, arg, len)) { + if (in_compat_syscall()) { + ret = compat_get_bitmap(cpumask_bits(new_mask), + (const compat_ulong_t __user *)arg, + len * 8 /* CHAR_BIT */); + } else { + ret = copy_from_user(new_mask, arg, len); + } + + if (ret) { free_cpumask_var(new_mask); return -EFAULT; } diff --git a/fs/namei.c b/fs/namei.c index 3f1829b3ab5b..509657fdf4f5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3673,18 +3673,14 @@ static struct dentry *filename_create(int dfd, struct filename *name, { struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; + bool want_dir = lookup_flags & LOOKUP_DIRECTORY; + unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; + unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; int err2; int error; - bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); - /* - * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any - * other flags passed in are ignored! - */ - lookup_flags &= LOOKUP_REVAL; - - error = filename_parentat(dfd, name, lookup_flags, path, &last, &type); + error = filename_parentat(dfd, name, reval_flag, path, &last, &type); if (error) return ERR_PTR(error); @@ -3698,11 +3694,13 @@ static struct dentry *filename_create(int dfd, struct filename *name, /* don't fail immediately if it's r/o, at least try to report other errors */ err2 = mnt_want_write(path->mnt); /* - * Do the final lookup. + * Do the final lookup. Suppress 'create' if there is a trailing + * '/', and a directory wasn't requested. */ - lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; + if (last.name[last.len] && !want_dir) + create_flags = 0; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - dentry = __lookup_hash(&last, path->dentry, lookup_flags); + dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; @@ -3716,7 +3714,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ - if (unlikely(!is_dir && last.name[last.len])) { + if (unlikely(!create_flags)) { error = -ENOENT; goto fail; } diff --git a/fs/namespace.c b/fs/namespace.c index a0a36bfa3aa0..afe2b64b14f1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4058,10 +4058,22 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) if (err) { struct mount *p; - for (p = mnt; p != m; p = next_mnt(p, mnt)) { + /* + * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will + * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all + * mounts and needs to take care to include the first mount. + */ + for (p = mnt; p; p = next_mnt(p, mnt)) { /* If we had to hold writers unblock them. */ if (p->mnt.mnt_flags & MNT_WRITE_HOLD) mnt_unhold_writers(p); + + /* + * We're done once the first mount we changed got + * MNT_WRITE_HOLD unset. + */ + if (p == m) + break; } } return err; diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 47a53b3362b6..14a72224b657 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -4,10 +4,6 @@ config NFS_FS depends on INET && FILE_LOCKING && MULTIUSER select LOCKD select SUNRPC - select CRYPTO - select CRYPTO_HASH - select XXHASH - select CRYPTO_XXHASH select NFS_ACL_SUPPORT if NFS_V3_ACL help Choose Y here if you want to access files residing on other diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index bac4cf1a308e..c6b263b5faf1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -39,7 +39,7 @@ #include <linux/sched.h> #include <linux/kmemleak.h> #include <linux/xattr.h> -#include <linux/xxhash.h> +#include <linux/hash.h> #include "delegation.h" #include "iostat.h" @@ -350,10 +350,7 @@ out: * of directory cookies. Content is addressed by the value of the * cookie index of the first readdir entry in a page. * - * The xxhash algorithm is chosen because it is fast, and is supposed - * to result in a decent flat distribution of hashes. - * - * We then select only the first 18 bits to avoid issues with excessive + * We select only the first 18 bits to avoid issues with excessive * memory use for the page cache XArray. 18 bits should allow the caching * of 262144 pages of sequences of readdir entries. Since each page holds * 127 readdir entries for a typical 64-bit system, that works out to a @@ -363,7 +360,7 @@ static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie) { if (cookie == 0) return 0; - return xxhash(&cookie, sizeof(cookie), 0) & NFS_READDIR_COOKIE_MASK; + return hash_64(cookie, 18); } static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie, @@ -1991,16 +1988,6 @@ const struct dentry_operations nfs4_dentry_operations = { }; EXPORT_SYMBOL_GPL(nfs4_dentry_operations); -static fmode_t flags_to_mode(int flags) -{ - fmode_t res = (__force fmode_t)flags & FMODE_EXEC; - if ((flags & O_ACCMODE) != O_WRONLY) - res |= FMODE_READ; - if ((flags & O_ACCMODE) != O_RDONLY) - res |= FMODE_WRITE; - return res; -} - static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp) { return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7eb3b08d702f..b4e46b0ffa2d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1180,7 +1180,6 @@ int nfs_open(struct inode *inode, struct file *filp) nfs_fscache_open_file(inode, filp); return 0; } -EXPORT_SYMBOL_GPL(nfs_open); /* * This function is called whenever some part of NFS notices that diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 57b0497105c8..7eefa16ed381 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -42,6 +42,16 @@ static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry) return true; } +static inline fmode_t flags_to_mode(int flags) +{ + fmode_t res = (__force fmode_t)flags & FMODE_EXEC; + if ((flags & O_ACCMODE) != O_WRONLY) + res |= FMODE_READ; + if ((flags & O_ACCMODE) != O_RDONLY) + res |= FMODE_WRITE; + return res; +} + /* * Note: RFC 1813 doesn't limit the number of auth flavors that * a server can return, so make something up. diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index ad3405c64b9e..e7b34f7e0614 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -997,7 +997,7 @@ int __init nfs4_xattr_cache_init(void) nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache", sizeof(struct nfs4_xattr_cache), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT), + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), nfs4_xattr_cache_init_once); if (nfs4_xattr_cache_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index d258933cf8c8..7b861e4f0533 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -32,6 +32,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) struct dentry *parent = NULL; struct inode *dir; unsigned openflags = filp->f_flags; + fmode_t f_mode; struct iattr attr; int err; @@ -50,8 +51,9 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (err) return err; + f_mode = filp->f_mode; if ((openflags & O_ACCMODE) == 3) - return nfs_open(inode, filp); + f_mode |= flags_to_mode(openflags); /* We can't create new files here */ openflags &= ~(O_CREAT|O_EXCL); @@ -59,7 +61,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) parent = dget_parent(dentry); dir = d_inode(parent); - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); + ctx = alloc_nfs_open_context(file_dentry(filp), f_mode, filp); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e3f5b380cefe..16106f805ffa 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9615,6 +9615,8 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0); task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return ERR_CAST(task); status = rpc_wait_for_completion_task(task); if (status != 0) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 5fa11e1aca4c..6f325e10056c 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -347,6 +347,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return ERR_PTR(-ENOMEM); + task_setup_data.task = &data->task; task_setup_data.callback_data = data; data->cred = get_current_cred(); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index c08882f5867b..2c1b027774d4 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -237,6 +237,13 @@ nfsd_file_check_write_error(struct nfsd_file *nf) } static void +nfsd_file_flush(struct nfsd_file *nf) +{ + if (nf->nf_file && vfs_fsync(nf->nf_file, 1) != 0) + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); +} + +static void nfsd_file_do_unhash(struct nfsd_file *nf) { lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); @@ -295,19 +302,15 @@ nfsd_file_put_noref(struct nfsd_file *nf) void nfsd_file_put(struct nfsd_file *nf) { - bool is_hashed; - set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); - if (refcount_read(&nf->nf_ref) > 2 || !nf->nf_file) { + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) { + nfsd_file_flush(nf); nfsd_file_put_noref(nf); - return; + } else { + nfsd_file_put_noref(nf); + if (nf->nf_file) + nfsd_file_schedule_laundrette(); } - - filemap_flush(nf->nf_file->f_mapping); - is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0; - nfsd_file_put_noref(nf); - if (is_hashed) - nfsd_file_schedule_laundrette(); if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT) nfsd_file_gc(); } @@ -328,6 +331,7 @@ nfsd_file_dispose_list(struct list_head *dispose) while(!list_empty(dispose)) { nf = list_first_entry(dispose, struct nfsd_file, nf_lru); list_del(&nf->nf_lru); + nfsd_file_flush(nf); nfsd_file_put_noref(nf); } } @@ -341,6 +345,7 @@ nfsd_file_dispose_list_sync(struct list_head *dispose) while(!list_empty(dispose)) { nf = list_first_entry(dispose, struct nfsd_file, nf_lru); list_del(&nf->nf_lru); + nfsd_file_flush(nf); if (!refcount_dec_and_test(&nf->nf_ref)) continue; if (nfsd_file_free(nf)) diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 367551bddfc6..b5760801d377 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -249,34 +249,34 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) int w; if (!svcxdr_encode_stat(xdr, resp->status)) - return 0; + return false; if (dentry == NULL || d_really_is_negative(dentry)) - return 1; + return true; inode = d_inode(dentry); if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) - return 0; + return false; if (xdr_stream_encode_u32(xdr, resp->mask) < 0) - return 0; + return false; rqstp->rq_res.page_len = w = nfsacl_size( (resp->mask & NFS_ACL) ? resp->acl_access : NULL, (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); while (w > 0) { if (!*(rqstp->rq_next_page++)) - return 1; + return true; w -= PAGE_SIZE; } if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access, resp->mask & NFS_ACL, 0)) - return 0; + return false; if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default, resp->mask & NFS_DFACL, NFS_ACL_DEFAULT)) - return 0; + return false; - return 1; + return true; } /* ACCESS */ @@ -286,17 +286,17 @@ nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr) struct nfsd3_accessres *resp = rqstp->rq_resp; if (!svcxdr_encode_stat(xdr, resp->status)) - return 0; + return false; switch (resp->status) { case nfs_ok: if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) - return 0; + return false; if (xdr_stream_encode_u32(xdr, resp->access) < 0) - return 0; + return false; break; } - return 1; + return true; } /* diff --git a/fs/pipe.c b/fs/pipe.c index 9648ac15164a..e140ea150bbb 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -804,7 +804,7 @@ struct pipe_inode_info *alloc_pipe_info(void) if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user()) goto out_revert_acct; - pipe->bufs = kvcalloc(pipe_bufs, sizeof(struct pipe_buffer), + pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), GFP_KERNEL_ACCOUNT); if (pipe->bufs) { @@ -849,7 +849,7 @@ void free_pipe_info(struct pipe_inode_info *pipe) #endif if (pipe->tmp_page) __free_page(pipe->tmp_page); - kvfree(pipe->bufs); + kfree(pipe->bufs); kfree(pipe); } @@ -1264,7 +1264,8 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) if (nr_slots < n) return -EBUSY; - bufs = kvcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT); + bufs = kcalloc(nr_slots, sizeof(*bufs), + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; @@ -1291,7 +1292,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) head = n; tail = 0; - kvfree(pipe->bufs); + kfree(pipe->bufs); pipe->bufs = bufs; pipe->ring_size = nr_slots; if (pipe->max_usage > nr_slots) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 80acb6885cf9..962d32468eb4 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -759,9 +759,14 @@ static void posix_acl_fix_xattr_userns( } void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, + struct inode *inode, void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); + + /* Leave ids untouched on non-idmapped mounts. */ + if (no_idmapping(mnt_userns, i_user_ns(inode))) + mnt_userns = &init_user_ns; if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) return; posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value, @@ -769,9 +774,14 @@ void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, } void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, + struct inode *inode, void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); + + /* Leave ids untouched on non-idmapped mounts. */ + if (no_idmapping(mnt_userns, i_user_ns(inode))) + mnt_userns = &init_user_ns; if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) return; posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value, diff --git a/fs/stat.c b/fs/stat.c index 7f734be0e57e..5c2c94464e8b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -348,9 +348,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat # define choose_32_64(a,b) b #endif -#define valid_dev(x) choose_32_64(old_valid_dev(x),true) -#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) - #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif @@ -359,7 +356,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; - if (!valid_dev(stat->dev) || !valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) @@ -367,7 +366,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) #endif INIT_STRUCT_STAT_PADDING(tmp); - tmp.st_dev = encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -377,7 +376,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; @@ -665,11 +664,13 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; memset(&tmp, 0, sizeof(tmp)); - tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -679,7 +680,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = old_encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; tmp.st_size = stat->size; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 42dcf96881b6..a12ac0356c69 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -703,19 +703,6 @@ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) ktype = get_ktype(kobj); if (ktype) { - struct attribute **kattr; - - /* - * Change owner of the default attributes associated with the - * ktype of @kobj. - */ - for (kattr = ktype->default_attrs; kattr && *kattr; kattr++) { - error = sysfs_file_change_owner(kobj, (*kattr)->name, - kuid, kgid); - if (error) - return error; - } - /* * Change owner of the default groups associated with the * ktype of @kobj. diff --git a/fs/xattr.c b/fs/xattr.c index 5c8c5175b385..998045165916 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -569,7 +569,8 @@ setxattr(struct user_namespace *mnt_userns, struct dentry *d, } if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_from_user(mnt_userns, kvalue, size); + posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), + kvalue, size); } error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags); @@ -667,7 +668,8 @@ getxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_to_user(mnt_userns, kvalue, error); + posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d), + kvalue, error); if (size && copy_to_user(value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { |