diff options
Diffstat (limited to 'fs')
85 files changed, 1221 insertions, 520 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 15a99f9c7253..39def020a074 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -500,10 +500,9 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) } #ifdef CONFIG_9P_FSCACHE - if (v9ses->fscache) { + if (v9ses->fscache) v9fs_cache_session_put_cookie(v9ses); - kfree(v9ses->cachetag); - } + kfree(v9ses->cachetag); #endif kfree(v9ses->uname); kfree(v9ses->aname); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index b79879aacc02..7b784af604fd 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -382,15 +382,17 @@ void afs_dynroot_depopulate(struct super_block *sb) net->dynroot_sb = NULL; mutex_unlock(&net->proc_cells_lock); - inode_lock(root->d_inode); - - /* Remove all the pins for dirs created for manually added cells */ - list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) { - if (subdir->d_fsdata) { - subdir->d_fsdata = NULL; - dput(subdir); + if (root) { + inode_lock(root->d_inode); + + /* Remove all the pins for dirs created for manually added cells */ + list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) { + if (subdir->d_fsdata) { + subdir->d_fsdata = NULL; + dput(subdir); + } } - } - inode_unlock(root->d_inode); + inode_unlock(root->d_inode); + } } diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 24fd163c6323..97cab12b0a6c 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -235,6 +235,7 @@ int afs_put_operation(struct afs_operation *op) afs_end_cursor(&op->ac); afs_put_serverlist(op->net, op->server_list); afs_put_volume(op->net, op->volume, afs_volume_trace_put_put_op); + key_put(op->key); kfree(op); return ret; } diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index f2f9086ebe98..b9c658e0548e 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -576,7 +576,7 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } - len = data_len + extra; + len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); realdatastart = vm_mmap(NULL, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); @@ -590,7 +590,9 @@ static int load_flat_file(struct linux_binprm *bprm, vm_munmap(textpos, text_len); goto err; } - datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); pr_debug("Allocated data+bss+stack (%u bytes): %lx\n", data_len + bss_len + stack_len, datapos); @@ -620,7 +622,7 @@ static int load_flat_file(struct linux_binprm *bprm, memp_size = len; } else { - len = text_len + data_len + extra; + len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32); len = PAGE_ALIGN(len); textpos = vm_mmap(NULL, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); @@ -635,7 +637,9 @@ static int load_flat_file(struct linux_binprm *bprm, } realdatastart = textpos + ntohl(hdr->data_start); - datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(u32), + FLAT_DATA_ALIGN); reloc = (__be32 __user *) (datapos + (ntohl(hdr->reloc_start) - text_len)); @@ -652,9 +656,8 @@ static int load_flat_file(struct linux_binprm *bprm, (text_len + full_data - sizeof(struct flat_hdr)), 0); - if (datapos != realdatastart) - memmove((void *)datapos, (void *)realdatastart, - full_data); + memmove((void *) datapos, (void *) realdatastart, + full_data); #else /* * This is used on MMU systems mainly for testing. @@ -710,7 +713,8 @@ static int load_flat_file(struct linux_binprm *bprm, if (IS_ERR_VALUE(result)) { ret = result; pr_err("Unable to read code+data+bss, errno %d\n", ret); - vm_munmap(textpos, text_len + data_len + extra); + vm_munmap(textpos, text_len + data_len + extra + + MAX_SHARED_LIBS * sizeof(u32)); goto err; } } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ea10f7bc99ab..ea1c28ccb44f 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2303,7 +2303,7 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc( return NULL; ret->path = btrfs_alloc_path(); - if (!ret) { + if (!ret->path) { kfree(ret); return NULL; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d404cce8ae40..6fdb3392a06d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1059,8 +1059,10 @@ struct btrfs_root { wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; struct list_head log_ctxs[2]; + /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_writers; atomic_t log_commit[2]; + /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_batch; int log_transid; /* No matter the commit succeeds or not*/ @@ -2982,6 +2984,8 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes, bool nowait); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, @@ -3194,7 +3198,7 @@ do { \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ - if ((errno) != -EIO) { \ + if ((errno) != -EIO && (errno) != -EROFS) { \ WARN(1, KERN_DEBUG \ "BTRFS: Transaction aborted (error %d)\n", \ (errno)); \ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b1a148058773..66618a1794ea 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1395,7 +1395,12 @@ alloc_fail: goto out; } -static int btrfs_init_fs_root(struct btrfs_root *root) +/* + * Initialize subvolume root in-memory structure + * + * @anon_dev: anonymous device to attach to the root, if zero, allocate new + */ +static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { int ret; unsigned int nofs_flag; @@ -1428,9 +1433,20 @@ static int btrfs_init_fs_root(struct btrfs_root *root) spin_lock_init(&root->ino_cache_lock); init_waitqueue_head(&root->ino_cache_wait); - ret = get_anon_bdev(&root->anon_dev); - if (ret) - goto fail; + /* + * Don't assign anonymous block device to roots that are not exposed to + * userspace, the id pool is limited to 1M + */ + if (is_fstree(root->root_key.objectid) && + btrfs_root_refs(&root->root_item) > 0) { + if (!anon_dev) { + ret = get_anon_bdev(&root->anon_dev); + if (ret) + goto fail; + } else { + root->anon_dev = anon_dev; + } + } mutex_lock(&root->objectid_mutex); ret = btrfs_find_highest_objectid(root, @@ -1534,8 +1550,27 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) } -struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, - u64 objectid, bool check_ref) +/* + * Get an in-memory reference of a root structure. + * + * For essential trees like root/extent tree, we grab it from fs_info directly. + * For subvolume trees, we check the cached filesystem roots first. If not + * found, then read it from disk and add it to cached fs roots. + * + * Caller should release the root by calling btrfs_put_root() after the usage. + * + * NOTE: Reloc and log trees can't be read by this function as they share the + * same root objectid. + * + * @objectid: root id + * @anon_dev: preallocated anonymous block device number for new roots, + * pass 0 for new allocation. + * @check_ref: whether to check root item references, If true, return -ENOENT + * for orphan roots + */ +static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, + u64 objectid, dev_t anon_dev, + bool check_ref) { struct btrfs_root *root; struct btrfs_path *path; @@ -1564,6 +1599,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, again: root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { + /* Shouldn't get preallocated anon_dev for cached roots */ + ASSERT(!anon_dev); if (check_ref && btrfs_root_refs(&root->root_item) == 0) { btrfs_put_root(root); return ERR_PTR(-ENOENT); @@ -1583,7 +1620,7 @@ again: goto fail; } - ret = btrfs_init_fs_root(root); + ret = btrfs_init_fs_root(root, anon_dev); if (ret) goto fail; @@ -1616,6 +1653,33 @@ fail: return ERR_PTR(ret); } +/* + * Get in-memory reference of a root structure + * + * @objectid: tree objectid + * @check_ref: if set, verify that the tree exists and the item has at least + * one reference + */ +struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, bool check_ref) +{ + return btrfs_get_root_ref(fs_info, objectid, 0, check_ref); +} + +/* + * Get in-memory reference of a root structure, created as new, optionally pass + * the anonymous block device id + * + * @objectid: tree objectid + * @anon_dev: if zero, allocate a new anonymous block device or use the + * parameter value + */ +struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, dev_t anon_dev) +{ + return btrfs_get_root_ref(fs_info, objectid, anon_dev, true); +} + static int btrfs_congested_fn(void *congested_data, int bdi_bits) { struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bf43245406c4..00dc39d47ed3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -67,6 +67,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref); +struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, dev_t anon_dev); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index b6561455b3c4..8bbb734f3f51 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -34,6 +34,8 @@ struct io_failure_record; */ #define CHUNK_ALLOCATED EXTENT_DIRTY #define CHUNK_TRIMMED EXTENT_DEFRAG +#define CHUNK_STATE_MASK (CHUNK_ALLOCATED | \ + CHUNK_TRIMMED) enum { IO_TREE_FS_PINNED_EXTENTS, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c0bc35f932bf..de6fe176fdfb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,7 @@ #include "delalloc-space.h" #include "block-group.h" #include "discard.h" +#include "rcu-string.h" #undef SCRAMBLE_DELAYED_REFS @@ -5298,7 +5299,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) goto out; } - trans = btrfs_start_transaction(tree_root, 0); + /* + * Use join to avoid potential EINTR from transaction start. See + * wait_reserve_ticket and the whole reservation callchain. + */ + if (for_reloc) + trans = btrfs_join_transaction(tree_root); + else + trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; @@ -5466,6 +5474,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) } } + /* + * This subvolume is going to be completely dropped, and won't be + * recorded as dirty roots, thus pertrans meta rsv will not be freed at + * commit transaction time. So free it here manually. + */ + btrfs_qgroup_convert_reserved_meta(root, INT_MAX); + btrfs_qgroup_free_meta_all_pertrans(root); + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); else @@ -5653,6 +5669,19 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); + /* Check if there are any CHUNK_* bits left */ + if (start > device->total_bytes) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_warn_in_rcu(fs_info, +"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", + start, end - start + 1, + rcu_str_deref(device->name), + device->total_bytes); + mutex_unlock(&fs_info->chunk_mutex); + ret = 0; + break; + } + /* Ensure we skip the reserved area in the first 1M */ start = max_t(u64, start, SZ_1M); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 60278e52c37a..8ba8788461ae 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4127,7 +4127,7 @@ retry: if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { ret = flush_write_bio(&epd); } else { - ret = -EUCLEAN; + ret = -EROFS; end_write_bio(&epd, ret); } return ret; @@ -4502,20 +4502,32 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) free_extent_map(em); break; } - if (!test_range_bit(tree, em->start, - extent_map_end(em) - 1, - EXTENT_LOCKED, 0, NULL)) { + if (test_range_bit(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED, 0, NULL)) + goto next; + /* + * If it's not in the list of modified extents, used + * by a fast fsync, we can remove it. If it's being + * logged we can safely remove it since fsync took an + * extra reference on the em. + */ + if (list_empty(&em->list) || + test_bit(EXTENT_FLAG_LOGGING, &em->flags)) { set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &btrfs_inode->runtime_flags); remove_extent_mapping(map, em); /* once for the rb tree */ free_extent_map(em); } +next: start = extent_map_end(em); write_unlock(&map->lock); /* once for us */ free_extent_map(em); + + cond_resched(); /* Allow large-extent preemption. */ } } return try_release_extent_state(tree, page, mask); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b0d2c976587e..1523aa4eaff0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1532,8 +1532,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, return ret; } -static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes, bool nowait) +int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes, bool nowait) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -1648,8 +1648,8 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, if (ret < 0) { if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && - check_can_nocow(BTRFS_I(inode), pos, - &write_bytes, false) > 0) { + btrfs_check_can_nocow(BTRFS_I(inode), pos, + &write_bytes, false) > 0) { /* * For nodata cow case, no need to reserve * data space. @@ -1928,8 +1928,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) || - check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes, - true) <= 0) { + btrfs_check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes, + true) <= 0) { inode_unlock(inode); return -EAGAIN; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 55955bd424d7..6f7b6bca6dc5 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2281,7 +2281,7 @@ out: static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { - struct btrfs_free_space *left_info; + struct btrfs_free_space *left_info = NULL; struct btrfs_free_space *right_info; bool merged = false; u64 offset = info->offset; @@ -2297,7 +2297,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, if (right_info && rb_prev(&right_info->offset_index)) left_info = rb_entry(rb_prev(&right_info->offset_index), struct btrfs_free_space, offset_index); - else + else if (!right_info) left_info = tree_search_offset(ctl, offset - 1, 0, 0); /* See try_merge_free_space() comment. */ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6862cd7e21a9..7ba1218b1630 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -650,12 +650,18 @@ cont: page_error_op | PAGE_END_WRITEBACK); - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - put_page(pages[i]); + /* + * Ensure we only free the compressed pages if we have + * them allocated, as we can still reach here with + * inode_need_compress() == false. + */ + if (pages) { + for (i = 0; i < nr_pages; i++) { + WARN_ON(pages[i]->mapping); + put_page(pages[i]); + } + kfree(pages); } - kfree(pages); - return 0; } } @@ -4041,6 +4047,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) } } + free_anon_bdev(dest->anon_dev); + dest->anon_dev = 0; out_end_trans: trans->block_rsv = NULL; trans->bytes_reserved = 0; @@ -4511,11 +4519,13 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; char *kaddr; + bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping); + size_t write_bytes = blocksize; int ret = 0; u64 block_start; u64 block_end; @@ -4527,11 +4537,27 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, block_start = round_down(from, blocksize); block_end = block_start + blocksize - 1; - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - block_start, blocksize); - if (ret) - goto out; + ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, + blocksize); + if (ret < 0) { + if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) && + btrfs_check_can_nocow(BTRFS_I(inode), block_start, + &write_bytes, false) > 0) { + /* For nocow case, no need to reserve data space */ + only_release_metadata = true; + } else { + goto out; + } + } + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize); + if (ret < 0) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, data_reserved, + block_start, blocksize); + goto out; + } again: page = find_or_create_page(mapping, index, mask); if (!page) { @@ -4600,14 +4626,26 @@ again: set_page_dirty(page); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); + if (only_release_metadata) + set_extent_bit(&BTRFS_I(inode)->io_tree, block_start, + block_end, EXTENT_NORESERVE, NULL, NULL, + GFP_NOFS); + out_unlock: - if (ret) - btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize, true); + if (ret) { + if (only_release_metadata) + btrfs_delalloc_release_metadata(BTRFS_I(inode), + blocksize, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); + } btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page); out: + if (only_release_metadata) + btrfs_drew_write_unlock(&BTRFS_I(inode)->root->snapshot_lock); extent_changeset_free(data_reserved); return ret; } @@ -6595,7 +6633,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ if (!S_ISREG(inode->vfs_inode.i_mode)) { - ret = -EUCLEAN; + err = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", btrfs_ino(inode)); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e8f7c5f00894..1448bc43561c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -164,8 +164,11 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg) return 0; } -/* Check if @flags are a supported and valid set of FS_*_FL flags */ -static int check_fsflags(unsigned int flags) +/* + * Check if @flags are a supported and valid set of FS_*_FL flags and that + * the old and new flags are not conflicting + */ +static int check_fsflags(unsigned int old_flags, unsigned int flags) { if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NOATIME_FL | FS_NODUMP_FL | \ @@ -174,9 +177,19 @@ static int check_fsflags(unsigned int flags) FS_NOCOW_FL)) return -EOPNOTSUPP; + /* COMPR and NOCOMP on new/old are valid */ if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) return -EINVAL; + if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL)) + return -EINVAL; + + /* NOCOW and compression options are mutually exclusive */ + if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL))) + return -EINVAL; + if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL))) + return -EINVAL; + return 0; } @@ -190,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) unsigned int fsflags, old_fsflags; int ret; const char *comp = NULL; - u32 binode_flags = binode->flags; + u32 binode_flags; if (!inode_owner_or_capable(inode)) return -EPERM; @@ -201,22 +214,23 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (copy_from_user(&fsflags, arg, sizeof(fsflags))) return -EFAULT; - ret = check_fsflags(fsflags); - if (ret) - return ret; - ret = mnt_want_write_file(file); if (ret) return ret; inode_lock(inode); - fsflags = btrfs_mask_fsflags_for_type(inode, fsflags); old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); + ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags); if (ret) goto out_unlock; + ret = check_fsflags(old_fsflags, fsflags); + if (ret) + goto out_unlock; + + binode_flags = binode->flags; if (fsflags & FS_SYNC_FL) binode_flags |= BTRFS_INODE_SYNC; else @@ -566,6 +580,7 @@ static noinline int create_subvol(struct inode *dir, struct inode *inode; int ret; int err; + dev_t anon_dev = 0; u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; @@ -578,6 +593,10 @@ static noinline int create_subvol(struct inode *dir, if (ret) goto fail_free; + ret = get_anon_bdev(&anon_dev); + if (ret < 0) + goto fail_free; + /* * Don't create subvolume whose level is not zero. Or qgroup will be * screwed up since it assumes subvolume qgroup's level to be 0. @@ -660,12 +679,15 @@ static noinline int create_subvol(struct inode *dir, goto fail; key.offset = (u64)-1; - new_root = btrfs_get_fs_root(fs_info, objectid, true); + new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); if (IS_ERR(new_root)) { + free_anon_bdev(anon_dev); ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); goto fail; } + /* Freeing will be done in btrfs_put_root() of new_root */ + anon_dev = 0; btrfs_record_root_in_trans(trans, new_root); @@ -735,6 +757,8 @@ fail: return ret; fail_free: + if (anon_dev) + free_anon_bdev(anon_dev); kfree(root_item); return ret; } @@ -762,6 +786,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!pending_snapshot) return -ENOMEM; + ret = get_anon_bdev(&pending_snapshot->anon_dev); + if (ret < 0) + goto free_pending; pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), GFP_KERNEL); pending_snapshot->path = btrfs_alloc_path(); @@ -823,10 +850,16 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, d_instantiate(dentry, inode); ret = 0; + pending_snapshot->anon_dev = 0; fail: + /* Prevent double freeing of anon_dev */ + if (ret && pending_snapshot->snap) + pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); free_pending: + if (pending_snapshot->anon_dev) + free_anon_bdev(pending_snapshot->anon_dev); kfree(pending_snapshot->root_item); btrfs_free_path(pending_snapshot->path); kfree(pending_snapshot); @@ -3198,11 +3231,15 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 flags_in; int ret = 0; - fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); - if (!fi_args) - return -ENOMEM; + fi_args = memdup_user(arg, sizeof(*fi_args)); + if (IS_ERR(fi_args)) + return PTR_ERR(fi_args); + + flags_in = fi_args->flags; + memset(fi_args, 0, sizeof(*fi_args)); rcu_read_lock(); fi_args->num_devices = fs_devices->num_devices; @@ -3218,6 +3255,12 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = fs_info->sectorsize; + if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) { + fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy); + fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy); + fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO; + } + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index af92525dbb16..7f03dbe5b609 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -286,6 +286,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info, exist_re = insert_root_entry(&exist->roots, re); if (exist_re) kfree(re); + } else { + kfree(re); } kfree(be); return exist; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3bbae80c752f..5740ed51a1e8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1686,12 +1686,20 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_unlock_up_safe(path, 0); } - min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + /* + * In merge_reloc_root(), we modify the upper level pointer to swap the + * tree blocks between reloc tree and subvolume tree. Thus for tree + * block COW, we COW at most from level 1 to root level for each tree. + * + * Thus the needed metadata size is at most root_level * nodesize, + * and * 2 since we have two trees to COW. + */ + min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2; memset(&next_key, 0, sizeof(next_key)); while (1) { ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, - BTRFS_RESERVE_FLUSH_ALL); + BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { err = ret; goto out; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 016a025e36c7..5f5b21e389db 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3758,7 +3758,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->fs_info; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) - return -EIO; + return -EROFS; /* Seed devices of a new filesystem has their own generation. */ if (scrub_dev->fs_devices != fs_info->fs_devices) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index c7bd3fdd7792..475968ccbd1d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -468,8 +468,8 @@ again: "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", cache->start, cache->length, cache->used, cache->pinned, cache->reserved, cache->ro ? "[readonly]" : ""); - btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); + btrfs_dump_free_space(cache, bytes); } if (++index < BTRFS_NR_RAID_TYPES) goto again; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c3826ae883f0..56cd2cf57158 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -449,6 +449,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, char *compress_type; bool compress_force = false; enum btrfs_compression_type saved_compress_type; + int saved_compress_level; bool saved_compress_force; int no_compress = 0; @@ -531,6 +532,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, info->compress_type : BTRFS_COMPRESS_NONE; saved_compress_force = btrfs_test_opt(info, FORCE_COMPRESS); + saved_compress_level = info->compress_level; if (token == Opt_compress || token == Opt_compress_force || strncmp(args[0].from, "zlib", 4) == 0) { @@ -575,6 +577,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, no_compress = 0; } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; + info->compress_level = 0; + info->compress_type = 0; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); compress_force = false; @@ -595,11 +599,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, */ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } - if ((btrfs_test_opt(info, COMPRESS) && - (info->compress_type != saved_compress_type || - compress_force != saved_compress_force)) || - (!btrfs_test_opt(info, COMPRESS) && - no_compress == 1)) { + if (no_compress == 1) { + btrfs_info(info, "use no compression"); + } else if ((info->compress_type != saved_compress_type) || + (compress_force != saved_compress_force) || + (info->compress_level != saved_compress_level)) { btrfs_info(info, "%s %s compression, level %d", (compress_force) ? "force" : "use", compress_type, info->compress_level); @@ -1312,6 +1316,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); const char *compress_type; + const char *subvol_name; if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); @@ -1398,8 +1403,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",ref_verify"); seq_printf(seq, ",subvolid=%llu", BTRFS_I(d_inode(dentry))->root->root_key.objectid); - seq_puts(seq, ",subvol="); - seq_dentry(seq, dentry, " \t\n\\"); + subvol_name = btrfs_get_subvol_name_from_objectid(info, + BTRFS_I(d_inode(dentry))->root->root_key.objectid); + if (!IS_ERR(subvol_name)) { + seq_puts(seq, ",subvol="); + seq_escape(seq, subvol_name, " \t\n\\"); + kfree(subvol_name); + } return 0; } @@ -1887,6 +1897,12 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) set_bit(BTRFS_FS_OPEN, &fs_info->flags); } out: + /* + * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS, + * since the absence of the flag means it can be toggled off by remount. + */ + *flags |= SB_I_VERSION; + wake_up_process(fs_info->transaction_kthread); btrfs_remount_cleanup(fs_info, old_opts); return 0; @@ -2296,9 +2312,7 @@ static int btrfs_unfreeze(struct super_block *sb) static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); - struct btrfs_fs_devices *cur_devices; struct btrfs_device *dev, *first_dev = NULL; - struct list_head *head; /* * Lightweight locking of the devices. We should not need @@ -2308,18 +2322,13 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) * least until the rcu_read_unlock. */ rcu_read_lock(); - cur_devices = fs_info->fs_devices; - while (cur_devices) { - head = &cur_devices->devices; - list_for_each_entry_rcu(dev, head, dev_list) { - if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) - continue; - if (!dev->name) - continue; - if (!first_dev || dev->devid < first_dev->devid) - first_dev = dev; - } - cur_devices = cur_devices->seed; + list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + if (!dev->name) + continue; + if (!first_dev || dev->devid < first_dev->devid) + first_dev = dev; } if (first_dev) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a39bff64ff24..abc4a8fd6df6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1273,7 +1273,9 @@ int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, { int error = 0; struct btrfs_device *dev; + unsigned int nofs_flag; + nofs_flag = memalloc_nofs_save(); list_for_each_entry(dev, &fs_devices->devices, dev_list) { if (one_device && one_device != dev) @@ -1301,6 +1303,7 @@ int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, break; } } + memalloc_nofs_restore(nofs_flag); return error; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b359d4b17658..2710f8ddb95f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -937,7 +937,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (TRANS_ABORTED(trans) || test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { wake_up_process(info->transaction_kthread); - err = -EIO; + if (TRANS_ABORTED(trans)) + err = trans->aborted; + else + err = -EROFS; } kmem_cache_free(btrfs_trans_handle_cachep, trans); @@ -1630,7 +1633,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.offset = (u64)-1; - pending->snap = btrfs_get_fs_root(fs_info, objectid, true); + pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index bf102e64bfb2..a122a712f5cc 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -151,6 +151,8 @@ struct btrfs_pending_snapshot { struct btrfs_block_rsv block_rsv; /* extra metadata reservation for relocation */ int error; + /* Preallocated anonymous block device number */ + dev_t anon_dev; bool readonly; struct list_head list; }; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cd5348f352dd..d22ff1e0963c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3116,29 +3116,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_init_log_ctx(&root_log_ctx, NULL); mutex_lock(&log_root_tree->log_mutex); - atomic_inc(&log_root_tree->log_batch); - atomic_inc(&log_root_tree->log_writers); index2 = log_root_tree->log_transid % 2; list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); root_log_ctx.log_transid = log_root_tree->log_transid; - mutex_unlock(&log_root_tree->log_mutex); - - mutex_lock(&log_root_tree->log_mutex); - /* * Now we are safe to update the log_root_tree because we're under the * log_mutex, and we're a current writer so we're holding the commit * open until we drop the log_mutex. */ ret = update_log_root(trans, log, &new_root_item); - - if (atomic_dec_and_test(&log_root_tree->log_writers)) { - /* atomic_dec_and_test implies a barrier */ - cond_wake_up_nomb(&log_root_tree->log_writer_wait); - } - if (ret) { if (!list_empty(&root_log_ctx.list)) list_del_init(&root_log_ctx.list); @@ -3184,8 +3172,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, root_log_ctx.log_transid - 1); } - wait_for_writer(log_root_tree); - /* * now that we've moved on to the tree of log tree roots, * check the full commit flag again @@ -4041,11 +4027,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, fs_info->csum_root, ds + cs, ds + cs + cl - 1, &ordered_sums, 0); - if (ret) { - btrfs_release_path(dst_path); - kfree(ins_data); - return ret; - } + if (ret) + break; } } } @@ -4058,7 +4041,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * we have to do this after the loop above to avoid changing the * log tree while trying to change the log tree. */ - ret = 0; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, @@ -5123,14 +5105,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, const loff_t end, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_path *dst_path; struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = root->log_root; int err = 0; - int ret; + int ret = 0; bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &inode->extent_tree; @@ -5166,15 +5147,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.offset = (u64)-1; /* - * Only run delayed items if we are a dir or a new file. - * Otherwise commit the delayed inode only, which is needed in - * order for the log replay code to mark inodes for link count - * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). + * Only run delayed items if we are a directory. We want to make sure + * all directory indexes hit the fs/subvolume tree so we can find them + * and figure out which index ranges have to be logged. + * + * Otherwise commit the delayed inode only if the full sync flag is set, + * as we want to make sure an up to date version is in the subvolume + * tree so copy_inode_items_to_log() / copy_items() can find it and copy + * it to the log tree. For a non full sync, we always log the inode item + * based on the in-memory struct btrfs_inode which is always up to date. */ - if (S_ISDIR(inode->vfs_inode.i_mode) || - inode->generation > fs_info->last_trans_committed) + if (S_ISDIR(inode->vfs_inode.i_mode)) ret = btrfs_commit_inode_delayed_items(trans, inode); - else + else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) ret = btrfs_commit_inode_delayed_inode(inode); if (ret) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f403fb1e6d37..0fecf1e4d8f6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -245,7 +245,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * * global::fs_devs - add, remove, updates to the global list * - * does not protect: manipulation of the fs_devices::devices list! + * does not protect: manipulation of the fs_devices::devices list in general + * but in mount context it could be used to exclude list modifications by eg. + * scan ioctl * * btrfs_device::name - renames (write side), read is RCU * @@ -258,6 +260,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * may be used to exclude some operations from running concurrently without any * modifications to the list (see write_all_supers) * + * Is not required at mount and close times, because our device list is + * protected by the uuid_mutex at that point. + * * balance_mutex * ------------- * protects balance structures (status, state) and context accessed from @@ -602,6 +607,11 @@ static int btrfs_free_stale_devices(const char *path, return ret; } +/* + * This is only used on mount, and we are protected from competing things + * messing with our fs_devices by the uuid_mutex, thus we do not need the + * fs_devices->device_list_mutex here. + */ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, fmode_t flags, void *holder) @@ -1229,8 +1239,14 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int ret; lockdep_assert_held(&uuid_mutex); + /* + * The device_list_mutex cannot be taken here in case opening the + * underlying device takes further locks like bd_mutex. + * + * We also don't need the lock here as this is called during mount and + * exclusion is provided by uuid_mutex + */ - mutex_lock(&fs_devices->device_list_mutex); if (fs_devices->opened) { fs_devices->opened++; ret = 0; @@ -1238,7 +1254,6 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, list_sort(NULL, &fs_devices->devices, devid_cmp); ret = open_fs_devices(fs_devices, flags, holder); } - mutex_unlock(&fs_devices->device_list_mutex); return ret; } @@ -3231,7 +3246,7 @@ static int del_balance_item(struct btrfs_fs_info *fs_info) if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 0); + trans = btrfs_start_transaction_fallback_global_rsv(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); @@ -4135,7 +4150,22 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, mutex_lock(&fs_info->balance_mutex); if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) btrfs_info(fs_info, "balance: paused"); - else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req)) + /* + * Balance can be canceled by: + * + * - Regular cancel request + * Then ret == -ECANCELED and balance_cancel_req > 0 + * + * - Fatal signal to "btrfs" process + * Either the signal caught by wait_reserve_ticket() and callers + * got -EINTR, or caught by btrfs_should_cancel_balance() and + * got -ECANCELED. + * Either way, in this case balance_cancel_req = 0, and + * ret == -EINTR or ret == -ECANCELED. + * + * So here we only check the return value to catch canceled balance. + */ + else if (ret == -ECANCELED || ret == -EINTR) btrfs_info(fs_info, "balance: canceled"); else btrfs_info(fs_info, "balance: ended with status: %d", ret); @@ -4690,6 +4720,10 @@ again: } mutex_lock(&fs_info->chunk_mutex); + /* Clear all state bits beyond the shrunk device size */ + clear_extent_bits(&device->alloc_state, new_size, (u64)-1, + CHUNK_STATE_MASK); + btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->post_commit_list)) list_add_tail(&device->post_commit_list, @@ -7049,7 +7083,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * otherwise we don't need it. */ mutex_lock(&uuid_mutex); - mutex_lock(&fs_info->chunk_mutex); /* * It is possible for mount and umount to race in such a way that @@ -7094,7 +7127,9 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + mutex_lock(&fs_info->chunk_mutex); ret = read_one_chunk(&found_key, leaf, chunk); + mutex_unlock(&fs_info->chunk_mutex); if (ret) goto error; } @@ -7124,7 +7159,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) } ret = 0; error: - mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&uuid_mutex); btrfs_free_path(path); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 39f5311404b0..060bdcc5ce32 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -930,6 +930,10 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, req->r_num_caps = 2; req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + if (as_ctx.pagelist) { + req->r_pagelist = as_ctx.pagelist; + as_ctx.pagelist = NULL; + } err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a50497142e59..946f9a92658a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3279,8 +3279,10 @@ static void handle_session(struct ceph_mds_session *session, goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); - ceph_decode_64_safe(&p, end, features, bad); - p += len - sizeof(features); + if (len) { + ceph_decode_64_safe(&p, end, features, bad); + p += len - sizeof(features); + } } mutex_lock(&mdsc->mutex); @@ -4359,7 +4361,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) goto err_mdsc; } - fsc->mdsc = mdsc; init_completion(&mdsc->safe_umount_waiters); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); @@ -4414,6 +4415,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) strscpy(mdsc->nodename, utsname()->nodename, sizeof(mdsc->nodename)); + + fsc->mdsc = mdsc; return 0; err_mdsmap: diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index b9db73687eaa..eba01d0908dd 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -115,6 +115,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, vars->oparms.fid = &fid; vars->oparms.reconnect = false; vars->oparms.mode = mode; + vars->oparms.cifs_sb = cifs_sb; rqst[num_rqst].rq_iov = &vars->open_iov[0]; rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 157992864ce7..d88e2683626e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -508,15 +508,31 @@ cifs_ses_oplock_break(struct work_struct *work) kfree(lw); } +static void +smb2_queue_pending_open_break(struct tcon_link *tlink, __u8 *lease_key, + __le32 new_lease_state) +{ + struct smb2_lease_break_work *lw; + + lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL); + if (!lw) { + cifs_put_tlink(tlink); + return; + } + + INIT_WORK(&lw->lease_break, cifs_ses_oplock_break); + lw->tlink = tlink; + lw->lease_state = new_lease_state; + memcpy(lw->lease_key, lease_key, SMB2_LEASE_KEY_SIZE); + queue_work(cifsiod_wq, &lw->lease_break); +} + static bool -smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, - struct smb2_lease_break_work *lw) +smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp) { - bool found; __u8 lease_state; struct list_head *tmp; struct cifsFileInfo *cfile; - struct cifs_pending_open *open; struct cifsInodeInfo *cinode; int ack_req = le32_to_cpu(rsp->Flags & SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED); @@ -546,22 +562,29 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, cfile->oplock_level = lease_state; cifs_queue_oplock_break(cfile); - kfree(lw); return true; } - found = false; + return false; +} + +static struct cifs_pending_open * +smb2_tcon_find_pending_open_lease(struct cifs_tcon *tcon, + struct smb2_lease_break *rsp) +{ + __u8 lease_state = le32_to_cpu(rsp->NewLeaseState); + int ack_req = le32_to_cpu(rsp->Flags & + SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED); + struct cifs_pending_open *open; + struct cifs_pending_open *found = NULL; + list_for_each_entry(open, &tcon->pending_opens, olist) { if (memcmp(open->lease_key, rsp->LeaseKey, SMB2_LEASE_KEY_SIZE)) continue; if (!found && ack_req) { - found = true; - memcpy(lw->lease_key, open->lease_key, - SMB2_LEASE_KEY_SIZE); - lw->tlink = cifs_get_tlink(open->tlink); - queue_work(cifsiod_wq, &lw->lease_break); + found = open; } cifs_dbg(FYI, "found in the pending open list\n"); @@ -582,14 +605,7 @@ smb2_is_valid_lease_break(char *buffer) struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; - struct smb2_lease_break_work *lw; - - lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL); - if (!lw) - return false; - - INIT_WORK(&lw->lease_break, cifs_ses_oplock_break); - lw->lease_state = rsp->NewLeaseState; + struct cifs_pending_open *open; cifs_dbg(FYI, "Checking for lease break\n"); @@ -607,11 +623,27 @@ smb2_is_valid_lease_break(char *buffer) spin_lock(&tcon->open_file_lock); cifs_stats_inc( &tcon->stats.cifs_stats.num_oplock_brks); - if (smb2_tcon_has_lease(tcon, rsp, lw)) { + if (smb2_tcon_has_lease(tcon, rsp)) { spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } + open = smb2_tcon_find_pending_open_lease(tcon, + rsp); + if (open) { + __u8 lease_key[SMB2_LEASE_KEY_SIZE]; + struct tcon_link *tlink; + + tlink = cifs_get_tlink(open->tlink); + memcpy(lease_key, open->lease_key, + SMB2_LEASE_KEY_SIZE); + spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_tcp_ses_lock); + smb2_queue_pending_open_break(tlink, + lease_key, + rsp->NewLeaseState); + return true; + } spin_unlock(&tcon->open_file_lock); if (tcon->crfid.is_valid && @@ -629,7 +661,6 @@ smb2_is_valid_lease_break(char *buffer) } } spin_unlock(&cifs_tcp_ses_lock); - kfree(lw); cifs_dbg(FYI, "Can not process lease break - no lease matched\n"); return false; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2f4cdd290c46..492688764004 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1387,6 +1387,8 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) spnego_key = cifs_get_spnego_key(ses); if (IS_ERR(spnego_key)) { rc = PTR_ERR(spnego_key); + if (rc == -ENOKEY) + cifs_dbg(VFS, "Verify user has a krb5 ticket and keyutils is installed\n"); spnego_key = NULL; goto out; } diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index e93670ecfae5..624617c12250 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -622,6 +622,9 @@ static int new_lockspace(const char *name, const char *cluster, wait_event(ls->ls_recover_lock_wait, test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags)); + /* let kobject handle freeing of ls if there's an error */ + do_unreg = 1; + ls->ls_kobj.kset = dlm_kset; error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL, "%s", ls->ls_name); @@ -629,9 +632,6 @@ static int new_lockspace(const char *name, const char *cluster, goto out_recoverd; kobject_uevent(&ls->ls_kobj, KOBJ_ADD); - /* let kobject handle freeing of ls if there's an error */ - do_unreg = 1; - /* This uevent triggers dlm_controld in userspace to add us to the group of nodes that are members of this lockspace (managed by the cluster infrastructure.) Once it's done that, it tells us who the diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 7dd4bbe9674f..586f9d0a8b2f 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -8,31 +8,80 @@ #include <trace/events/erofs.h> -/* no locking */ -static int erofs_read_inode(struct inode *inode, void *data) +/* + * if inode is successfully read, return its inode page (or sometimes + * the inode payload page if it's an extended inode) in order to fill + * inline data if possible. + */ +static struct page *erofs_read_inode(struct inode *inode, + unsigned int *ofs) { + struct super_block *sb = inode->i_sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_inode *vi = EROFS_I(inode); - struct erofs_inode_compact *dic = data; - struct erofs_inode_extended *die; + const erofs_off_t inode_loc = iloc(sbi, vi->nid); + + erofs_blk_t blkaddr, nblks = 0; + struct page *page; + struct erofs_inode_compact *dic; + struct erofs_inode_extended *die, *copied = NULL; + unsigned int ifmt; + int err; - const unsigned int ifmt = le16_to_cpu(dic->i_format); - struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); - erofs_blk_t nblks = 0; + blkaddr = erofs_blknr(inode_loc); + *ofs = erofs_blkoff(inode_loc); - vi->datalayout = erofs_inode_datalayout(ifmt); + erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u", + __func__, vi->nid, *ofs, blkaddr); + + page = erofs_get_meta_page(sb, blkaddr); + if (IS_ERR(page)) { + erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", + vi->nid, PTR_ERR(page)); + return page; + } + dic = page_address(page) + *ofs; + ifmt = le16_to_cpu(dic->i_format); + + vi->datalayout = erofs_inode_datalayout(ifmt); if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) { erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu", vi->datalayout, vi->nid); - DBG_BUGON(1); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto err_out; } switch (erofs_inode_version(ifmt)) { case EROFS_INODE_LAYOUT_EXTENDED: - die = data; - vi->inode_isize = sizeof(struct erofs_inode_extended); + /* check if the inode acrosses page boundary */ + if (*ofs + vi->inode_isize <= PAGE_SIZE) { + *ofs += vi->inode_isize; + die = (struct erofs_inode_extended *)dic; + } else { + const unsigned int gotten = PAGE_SIZE - *ofs; + + copied = kmalloc(vi->inode_isize, GFP_NOFS); + if (!copied) { + err = -ENOMEM; + goto err_out; + } + memcpy(copied, dic, gotten); + unlock_page(page); + put_page(page); + + page = erofs_get_meta_page(sb, blkaddr + 1); + if (IS_ERR(page)) { + erofs_err(sb, "failed to get inode payload page (nid: %llu), err %ld", + vi->nid, PTR_ERR(page)); + kfree(copied); + return page; + } + *ofs = vi->inode_isize - gotten; + memcpy((u8 *)copied + gotten, page_address(page), *ofs); + die = copied; + } vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); inode->i_mode = le16_to_cpu(die->i_mode); @@ -69,9 +118,12 @@ static int erofs_read_inode(struct inode *inode, void *data) /* total blocks for compressed files */ if (erofs_inode_is_data_compressed(vi->datalayout)) nblks = le32_to_cpu(die->i_u.compressed_blocks); + + kfree(copied); break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize = sizeof(struct erofs_inode_compact); + *ofs += vi->inode_isize; vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); inode->i_mode = le16_to_cpu(dic->i_mode); @@ -111,8 +163,8 @@ static int erofs_read_inode(struct inode *inode, void *data) erofs_err(inode->i_sb, "unsupported on-disk inode version %u of nid %llu", erofs_inode_version(ifmt), vi->nid); - DBG_BUGON(1); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto err_out; } if (!nblks) @@ -120,13 +172,18 @@ static int erofs_read_inode(struct inode *inode, void *data) inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; else inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK; - return 0; + return page; bogusimode: erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu", inode->i_mode, vi->nid); + err = -EFSCORRUPTED; +err_out: DBG_BUGON(1); - return -EFSCORRUPTED; + kfree(copied); + unlock_page(page); + put_page(page); + return ERR_PTR(err); } static int erofs_fill_symlink(struct inode *inode, void *data, @@ -146,7 +203,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data, if (!lnk) return -ENOMEM; - m_pofs += vi->inode_isize + vi->xattr_isize; + m_pofs += vi->xattr_isize; /* inline symlink data shouldn't cross page boundary as well */ if (m_pofs + inode->i_size > PAGE_SIZE) { kfree(lnk); @@ -167,37 +224,17 @@ static int erofs_fill_symlink(struct inode *inode, void *data, static int erofs_fill_inode(struct inode *inode, int isdir) { - struct super_block *sb = inode->i_sb; struct erofs_inode *vi = EROFS_I(inode); struct page *page; - void *data; - int err; - erofs_blk_t blkaddr; unsigned int ofs; - erofs_off_t inode_loc; + int err = 0; trace_erofs_fill_inode(inode, isdir); - inode_loc = iloc(EROFS_SB(sb), vi->nid); - blkaddr = erofs_blknr(inode_loc); - ofs = erofs_blkoff(inode_loc); - - erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u", - __func__, vi->nid, ofs, blkaddr); - page = erofs_get_meta_page(sb, blkaddr); - - if (IS_ERR(page)) { - erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", - vi->nid, PTR_ERR(page)); + /* read inode base data from disk */ + page = erofs_read_inode(inode, &ofs); + if (IS_ERR(page)) return PTR_ERR(page); - } - - DBG_BUGON(!PageUptodate(page)); - data = page_address(page); - - err = erofs_read_inode(inode, data + ofs); - if (err) - goto out_unlock; /* setup the new inode */ switch (inode->i_mode & S_IFMT) { @@ -210,7 +247,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir) inode->i_fop = &erofs_dir_fops; break; case S_IFLNK: - err = erofs_fill_symlink(inode, data, ofs); + err = erofs_fill_symlink(inode, page_address(page), ofs); if (err) goto out_unlock; inode_nohighmem(inode); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 12eebcdea9c8..e0decff22ae2 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1994,9 +1994,11 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) * not already there, and calling reverse_path_check() * during ep_insert(). */ - if (list_empty(&epi->ffd.file->f_tfile_llink)) + if (list_empty(&epi->ffd.file->f_tfile_llink)) { + get_file(epi->ffd.file); list_add(&epi->ffd.file->f_tfile_llink, &tfile_check_list); + } } } mutex_unlock(&ep->mtx); @@ -2040,6 +2042,7 @@ static void clear_tfile_check_list(void) file = list_first_entry(&tfile_check_list, struct file, f_tfile_llink); list_del_init(&file->f_tfile_llink); + fput(file); } INIT_LIST_HEAD(&tfile_check_list); } @@ -2200,25 +2203,22 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, full_check = 1; if (is_file_epoll(tf.file)) { error = -ELOOP; - if (ep_loop_check(ep, tf.file) != 0) { - clear_tfile_check_list(); + if (ep_loop_check(ep, tf.file) != 0) goto error_tgt_fput; - } - } else + } else { + get_file(tf.file); list_add(&tf.file->f_tfile_llink, &tfile_check_list); + } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); - if (error) { -out_del: - list_del(&tf.file->f_tfile_llink); + if (error) goto error_tgt_fput; - } if (is_file_epoll(tf.file)) { tep = tf.file->private_data; error = epoll_mutex_lock(&tep->mtx, 1, nonblock); if (error) { mutex_unlock(&ep->mtx); - goto out_del; + goto error_tgt_fput; } } } @@ -2239,8 +2239,6 @@ out_del: error = ep_insert(ep, epds, tf.file, fd, full_check); } else error = -EEXIST; - if (full_check) - clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) @@ -2263,8 +2261,10 @@ out_del: mutex_unlock(&ep->mtx); error_tgt_fput: - if (full_check) + if (full_check) { + clear_tfile_check_list(); mutex_unlock(&epmutex); + } fdput(tf); error_fput: diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index fda7d3f5b4be..432c3febea6d 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -80,6 +80,7 @@ static void ext2_release_inode(struct super_block *sb, int group, int dir) if (dir) le16_add_cpu(&desc->bg_used_dirs_count, -1); spin_unlock(sb_bgl_lock(EXT2_SB(sb), group)); + percpu_counter_inc(&EXT2_SB(sb)->s_freeinodes_counter); if (dir) percpu_counter_dec(&EXT2_SB(sb)->s_dirs_counter); mark_buffer_dirty(bh); @@ -528,7 +529,7 @@ got: goto fail; } - percpu_counter_add(&sbi->s_freeinodes_counter, -1); + percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 16e9b2fda03a..e830a9d4e10d 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -24,6 +24,7 @@ struct ext4_system_zone { struct rb_node node; ext4_fsblk_t start_blk; unsigned int count; + u32 ino; }; static struct kmem_cache *ext4_system_zone_cachep; @@ -45,7 +46,8 @@ void ext4_exit_system_zone(void) static inline int can_merge(struct ext4_system_zone *entry1, struct ext4_system_zone *entry2) { - if ((entry1->start_blk + entry1->count) == entry2->start_blk) + if ((entry1->start_blk + entry1->count) == entry2->start_blk && + entry1->ino == entry2->ino) return 1; return 0; } @@ -66,9 +68,9 @@ static void release_system_zone(struct ext4_system_blocks *system_blks) */ static int add_system_zone(struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, - unsigned int count) + unsigned int count, u32 ino) { - struct ext4_system_zone *new_entry = NULL, *entry; + struct ext4_system_zone *new_entry, *entry; struct rb_node **n = &system_blks->root.rb_node, *node; struct rb_node *parent = NULL, *new_node = NULL; @@ -79,30 +81,21 @@ static int add_system_zone(struct ext4_system_blocks *system_blks, n = &(*n)->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; - else { - if (start_blk + count > (entry->start_blk + - entry->count)) - entry->count = (start_blk + count - - entry->start_blk); - new_node = *n; - new_entry = rb_entry(new_node, struct ext4_system_zone, - node); - break; - } + else /* Unexpected overlap of system zones. */ + return -EFSCORRUPTED; } - if (!new_entry) { - new_entry = kmem_cache_alloc(ext4_system_zone_cachep, - GFP_KERNEL); - if (!new_entry) - return -ENOMEM; - new_entry->start_blk = start_blk; - new_entry->count = count; - new_node = &new_entry->node; - - rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &system_blks->root); - } + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_entry->ino = ino; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &system_blks->root); /* Can we merge to the left? */ node = rb_prev(new_node); @@ -159,7 +152,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi) static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, - unsigned int count) + unsigned int count, ino_t ino) { struct ext4_system_zone *entry; struct rb_node *n; @@ -180,7 +173,7 @@ static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, else if (start_blk >= (entry->start_blk + entry->count)) n = n->rb_right; else - return 0; + return entry->ino == ino; } return 1; } @@ -214,19 +207,18 @@ static int ext4_protect_reserved_inode(struct super_block *sb, if (n == 0) { i++; } else { - if (!ext4_data_block_valid_rcu(sbi, system_blks, - map.m_pblk, n)) { - err = -EFSCORRUPTED; - __ext4_error(sb, __func__, __LINE__, -err, - map.m_pblk, "blocks %llu-%llu " - "from inode %u overlap system zone", - map.m_pblk, - map.m_pblk + map.m_len - 1, ino); + err = add_system_zone(system_blks, map.m_pblk, n, ino); + if (err < 0) { + if (err == -EFSCORRUPTED) { + __ext4_error(sb, __func__, __LINE__, + -err, map.m_pblk, + "blocks %llu-%llu from inode %u overlap system zone", + map.m_pblk, + map.m_pblk + map.m_len - 1, + ino); + } break; } - err = add_system_zone(system_blks, map.m_pblk, n); - if (err < 0) - break; i += n; } } @@ -280,19 +272,19 @@ int ext4_setup_system_zone(struct super_block *sb) ((i < 5) || ((i % flex_size) == 0))) add_system_zone(system_blks, ext4_group_first_block_no(sb, i), - ext4_bg_num_gdb(sb, i) + 1); + ext4_bg_num_gdb(sb, i) + 1, 0); gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(system_blks, - ext4_block_bitmap(sb, gdp), 1); + ext4_block_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, - ext4_inode_bitmap(sb, gdp), 1); + ext4_inode_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, ext4_inode_table(sb, gdp), - sbi->s_itb_per_group); + sbi->s_itb_per_group, 0); if (ret) goto err; } @@ -341,7 +333,7 @@ void ext4_release_system_zone(struct super_block *sb) call_rcu(&system_blks->rcu, ext4_destroy_system_zone); } -int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, +int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count) { struct ext4_system_blocks *system_blks; @@ -353,9 +345,9 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, * mount option. */ rcu_read_lock(); - system_blks = rcu_dereference(sbi->system_blks); - ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk, - count); + system_blks = rcu_dereference(EXT4_SB(inode->i_sb)->system_blks); + ret = ext4_data_block_valid_rcu(EXT4_SB(inode->i_sb), system_blks, + start_blk, count, inode->i_ino); rcu_read_unlock(); return ret; } @@ -374,8 +366,7 @@ int ext4_check_blockref(const char *function, unsigned int line, while (bref < p+max) { blk = le32_to_cpu(*bref++); if (blk && - unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), - blk, 1))) { + unlikely(!ext4_inode_block_valid(inode, blk, 1))) { ext4_error_inode(inode, function, line, blk, "invalid block"); return -EFSCORRUPTED; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 42f5060f3cdf..42815304902b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3363,9 +3363,9 @@ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); extern int __init ext4_init_system_zone(void); extern void ext4_exit_system_zone(void); -extern int ext4_data_block_valid(struct ext4_sb_info *sbi, - ext4_fsblk_t start_blk, - unsigned int count); +extern int ext4_inode_block_valid(struct inode *inode, + ext4_fsblk_t start_blk, + unsigned int count); extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 221f240eae60..d75054570e44 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -340,7 +340,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) */ if (lblock + len <= lblock) return 0; - return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); + return ext4_inode_block_valid(inode, block, len); } static int ext4_valid_extent_idx(struct inode *inode, @@ -348,7 +348,7 @@ static int ext4_valid_extent_idx(struct inode *inode, { ext4_fsblk_t block = ext4_idx_pblock(ext_idx); - return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); + return ext4_inode_block_valid(inode, block, 1); } static int ext4_valid_extent_entries(struct inode *inode, @@ -507,14 +507,10 @@ __read_extent_tree_block(const char *function, unsigned int line, } if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; - if (!ext4_has_feature_journal(inode->i_sb) || - (inode->i_ino != - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) { - err = __ext4_ext_check(function, line, inode, - ext_block_hdr(bh), depth, pblk); - if (err) - goto errout; - } + err = __ext4_ext_check(function, line, inode, + ext_block_hdr(bh), depth, pblk); + if (err) + goto errout; set_buffer_verified(bh); /* * If this is a leaf block, cache all of its entries diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2a01e31a032c..8f742b53f1d4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -428,6 +428,10 @@ restart: */ if (*ilock_shared && (!IS_NOSEC(inode) || *extend || !ext4_overwrite_io(inode, offset, count))) { + if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } inode_unlock_shared(inode); *ilock_shared = false; inode_lock(inode); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index be2b66eb65f7..402641825712 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -858,8 +858,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, - count)) { + if (!ext4_inode_block_valid(inode, block_to_free, count)) { EXT4_ERROR_INODE(inode, "attempt to clear invalid " "blocks %llu len %lu", (unsigned long long) block_to_free, count); @@ -1004,8 +1003,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (!nr) continue; /* A hole */ - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), - nr, 1)) { + if (!ext4_inode_block_valid(inode, nr, 1)) { EXT4_ERROR_INODE(inode, "invalid indirect mapped " "block %lu (level %d)", diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 10dd470876b3..92573f8540ab 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -394,8 +394,7 @@ static int __check_block_validity(struct inode *inode, const char *func, (inode->i_ino == le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) return 0; - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, - map->m_len)) { + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, @@ -4760,7 +4759,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = 0; if (ei->i_file_acl && - !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { + !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c0a331e2feb0..38719c156573 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3090,7 +3090,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (!ext4_data_block_valid(sbi, block, len)) { + if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error @@ -4915,7 +4915,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, sbi = EXT4_SB(sb); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && - !ext4_data_block_valid(sbi, block, count)) { + !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); goto error_return; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 56738b538ddf..a91a5bb8c3a2 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1396,8 +1396,8 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, ext4_match(dir, fname, de)) { /* found a match - just to be sure, do * a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, - bh->b_size, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, + buf_size, offset)) return -1; *res_dir = de; return 1; @@ -1858,7 +1858,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, blocksize, hinfo, map); map -= count; dx_sort_map(map, count); - /* Split the existing block in the middle, size-wise */ + /* Ensure that neither split block is over half full */ size = 0; move = 0; for (i = count-1; i >= 0; i--) { @@ -1868,8 +1868,18 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, size += map[i].size; move++; } - /* map index at which we will split */ - split = count - move; + /* + * map index at which we will split + * + * If the sum of active entries didn't exceed half the block size, just + * split it in half by count; each resulting block will have at least + * half the space free. + */ + if (i > 0) + split = count - move; + else + split = count/2; + hash2 = map[split].hash; continued = hash2 == map[split - 1].hash; dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", @@ -2472,7 +2482,7 @@ int ext4_generic_delete_entry(handle_t *handle, de = (struct ext4_dir_entry_2 *)entry_buf; while (i < buf_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, - bh->b_data, bh->b_size, i)) + entry_buf, buf_size, i)) return -EFSCORRUPTED; if (de == de_del) { if (pde) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 1e02a8c106b0..2390f7943f6c 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1310,6 +1310,12 @@ retry_write: congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); lock_page(cc->rpages[i]); + + if (!PageDirty(cc->rpages[i])) { + unlock_page(cc->rpages[i]); + continue; + } + clear_page_dirty_for_io(cc->rpages[i]); goto retry_write; } @@ -1353,6 +1359,8 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, err = f2fs_write_compressed_pages(cc, submitted, wbc, io_type); cops->destroy_compress_ctx(cc); + kfree(cc->cpages); + cc->cpages = NULL; if (!err) return 0; f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 326c63879ddc..6e9017e6a819 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3432,6 +3432,10 @@ static int f2fs_write_end(struct file *file, if (f2fs_compressed_file(inode) && fsdata) { f2fs_compress_write_end(inode, fsdata, page->index, copied); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + if (pos + copied > i_size_read(inode) && + !f2fs_verity_in_progress(inode)) + f2fs_i_size_write(inode, pos + copied); return copied; } #endif diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 03e24df1c84f..e61ce7fb0958 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1924,8 +1924,12 @@ continue_unlock: goto continue_unlock; } - /* flush inline_data, if it's async context. */ - if (do_balance && is_inline_node(page)) { + /* flush inline_data/inode, if it's async context. */ + if (!do_balance) + goto write_node; + + /* flush inline_data */ + if (is_inline_node(page)) { clear_inline_node(page); unlock_page(page); flush_inline_data(sbi, ino_of_node(page)); @@ -1938,7 +1942,7 @@ continue_unlock: if (flush_dirty_inode(page)) goto lock_node; } - +write_node: f2fs_wait_on_page_writeback(page, NODE, true, true); if (!clear_page_dirty_for_io(page)) diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index bbfe18c07417..f7e3304b7802 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -657,6 +657,9 @@ static void fat_ra_init(struct super_block *sb, struct fatent_ra *ra, unsigned long ra_pages = sb->s_bdi->ra_pages; unsigned int reada_blocks; + if (fatent->entry >= ent_limit) + return; + if (ra_pages > sb->s_bdi->io_pages) ra_pages = rounddown(ra_pages, sb->s_bdi->io_pages); reada_blocks = ra_pages << (PAGE_SHIFT - sb->s_blocksize_bits + 1); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 6306eaae378b..6d2ea788d0a1 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1351,9 +1351,15 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi return ret; } +/* + * NOTE: Never call gfs2_block_zero_range with an open transaction because it + * uses iomap write to perform its actions, which begin their own transactions + * (iomap_begin, page_prepare, etc.) + */ static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) { + BUG_ON(current->journal_info); return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); } @@ -1414,6 +1420,16 @@ static int trunc_start(struct inode *inode, u64 newsize) u64 oldsize = inode->i_size; int error; + if (!gfs2_is_stuffed(ip)) { + unsigned int blocksize = i_blocksize(inode); + unsigned int offs = newsize & (blocksize - 1); + if (offs) { + error = gfs2_block_zero_range(inode, newsize, + blocksize - offs); + if (error) + return error; + } + } if (journaled) error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES); else @@ -1427,19 +1443,10 @@ static int trunc_start(struct inode *inode, u64 newsize) gfs2_trans_add_meta(ip->i_gl, dibh); - if (gfs2_is_stuffed(ip)) { + if (gfs2_is_stuffed(ip)) gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); - } else { - unsigned int blocksize = i_blocksize(inode); - unsigned int offs = newsize & (blocksize - 1); - if (offs) { - error = gfs2_block_zero_range(inode, newsize, - blocksize - offs); - if (error) - goto out; - } + else ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; - } i_size_write(inode, newsize); ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); @@ -2448,25 +2455,7 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) loff_t start, end; int error; - start = round_down(offset, blocksize); - end = round_up(offset + length, blocksize) - 1; - error = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (error) - return error; - - if (gfs2_is_jdata(ip)) - error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA, - GFS2_JTRUNC_REVOKES); - else - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error) - return error; - - if (gfs2_is_stuffed(ip)) { - error = stuffed_zero_range(inode, offset, length); - if (error) - goto out; - } else { + if (!gfs2_is_stuffed(ip)) { unsigned int start_off, end_len; start_off = offset & (blocksize - 1); @@ -2489,6 +2478,26 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) } } + start = round_down(offset, blocksize); + end = round_up(offset + length, blocksize) - 1; + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + + if (gfs2_is_jdata(ip)) + error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA, + GFS2_JTRUNC_REVOKES); + else + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + if (gfs2_is_stuffed(ip)) { + error = stuffed_zero_range(inode, offset, length); + if (error) + goto out; + } + if (gfs2_is_jdata(ip)) { BUG_ON(!current->journal_info); gfs2_journaled_truncate_range(inode, offset, length); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 8545024a1401..f92876f4f37a 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -790,9 +790,11 @@ static void gfs2_glock_poke(struct gfs2_glock *gl) struct gfs2_holder gh; int error; - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, flags, &gh); + gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh); + error = gfs2_glock_nq(&gh); if (!error) gfs2_glock_dq(&gh); + gfs2_holder_uninit(&gh); } static bool gfs2_try_evict(struct gfs2_glock *gl) diff --git a/fs/io_uring.c b/fs/io_uring.c index 493e5047e67c..26978630378e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -669,12 +669,12 @@ struct io_kiocb { * restore the work, if needed. */ struct { - struct callback_head task_work; struct hlist_node hash_node; struct async_poll *apoll; }; struct io_wq_work work; }; + struct callback_head task_work; }; #define IO_PLUG_THRESHOLD 2 @@ -1549,12 +1549,9 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) /* * Called if REQ_F_LINK_HEAD is set, and we fail the head request */ -static void io_fail_links(struct io_kiocb *req) +static void __io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { struct io_kiocb *link = list_first_entry(&req->link_list, @@ -1568,13 +1565,29 @@ static void io_fail_links(struct io_kiocb *req) io_link_cancel_timeout(link); } else { io_cqring_fill_event(link, -ECANCELED); + link->flags |= REQ_F_COMP_LOCKED; __io_double_put_req(link); } req->flags &= ~REQ_F_LINK_TIMEOUT; } io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); +} + +static void io_fail_links(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!(req->flags & REQ_F_COMP_LOCKED)) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + __io_fail_links(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + __io_fail_links(req); + } + io_cqring_ev_posted(ctx); } @@ -1747,6 +1760,17 @@ static int io_put_kbuf(struct io_kiocb *req) return cflags; } +static inline bool io_run_task_work(void) +{ + if (current->task_works) { + __set_current_state(TASK_RUNNING); + task_work_run(); + return true; + } + + return false; +} + static void io_iopoll_queue(struct list_head *again) { struct io_kiocb *req; @@ -1936,6 +1960,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ if (!(++iters & 7)) { mutex_unlock(&ctx->uring_lock); + io_run_task_work(); mutex_lock(&ctx->uring_lock); } @@ -2661,8 +2686,10 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) if (req->file->f_op->read_iter) ret2 = call_read_iter(req->file, kiocb, &iter); - else + else if (req->file->f_op->read) ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); + else + ret2 = -EINVAL; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { @@ -2776,8 +2803,10 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) if (req->file->f_op->write_iter) ret2 = call_write_iter(req->file, kiocb, &iter); - else + else if (req->file->f_op->write) ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + else + ret2 = -EINVAL; if (!force_nonblock) current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; @@ -4088,22 +4117,22 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; - int ret, notify = TWA_RESUME; + int ret, notify; /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. - * If we're not using an eventfd, then TWA_RESUME is always fine, - * as we won't have dependencies between request completions for - * other kernel wait conditions. + * SQPOLL kernel thread doesn't need notification, just a wakeup. For + * all other cases, use TWA_SIGNAL unconditionally to ensure we're + * processing task_work. There's no reliable way to tell if TWA_RESUME + * will do the job. */ - if (ctx->flags & IORING_SETUP_SQPOLL) - notify = 0; - else if (ctx->cq_ev_fd) + notify = 0; + if (!(ctx->flags & IORING_SETUP_SQPOLL)) notify = TWA_SIGNAL; ret = task_work_add(tsk, cb, notify); if (!ret) wake_up_process(tsk); + return ret; } @@ -4124,6 +4153,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, tsk = req->task; req->result = mask; init_task_work(&req->task_work, func); + percpu_ref_get(&req->ctx->refs); + /* * If this fails, then the task is exiting. When a task exits, the * work gets canceled, so just cancel this request as well instead @@ -4160,9 +4191,24 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) return false; } -static void io_poll_remove_double(struct io_kiocb *req, void *data) +static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) +{ + /* pure poll stashes this in ->io, poll driven retry elsewhere */ + if (req->opcode == IORING_OP_POLL_ADD) + return (struct io_poll_iocb *) req->io; + return req->apoll->double_poll; +} + +static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) +{ + if (req->opcode == IORING_OP_POLL_ADD) + return &req->poll; + return &req->apoll->poll; +} + +static void io_poll_remove_double(struct io_kiocb *req) { - struct io_poll_iocb *poll = data; + struct io_poll_iocb *poll = io_poll_get_double(req); lockdep_assert_held(&req->ctx->completion_lock); @@ -4182,7 +4228,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) { struct io_ring_ctx *ctx = req->ctx; - io_poll_remove_double(req, req->io); + io_poll_remove_double(req); req->poll.done = true; io_cqring_fill_event(req, error ? error : mangle_poll(mask)); io_commit_cqring(ctx); @@ -4208,6 +4254,7 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) static void io_poll_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt = NULL; io_poll_task_handler(req, &nxt); @@ -4218,13 +4265,14 @@ static void io_poll_task_func(struct callback_head *cb) __io_queue_sqe(nxt, NULL); mutex_unlock(&ctx->uring_lock); } + percpu_ref_put(&ctx->refs); } static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = req->apoll->double_poll; + struct io_poll_iocb *poll = io_poll_get_single(req); __poll_t mask = key_to_poll(key); /* for instances that support it check for an event match first: */ @@ -4238,6 +4286,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, done = list_empty(&poll->wait.entry); if (!done) list_del_init(&poll->wait.entry); + /* make sure double remove sees this as being gone */ + wait->private = NULL; spin_unlock(&poll->head->lock); if (!done) __io_async_wake(req, poll, mask, io_poll_task_func); @@ -4313,7 +4363,8 @@ static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, struct io_kiocb *req) { if (io_op_defs[req->opcode].needs_mm && !current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || + !mmget_not_zero(ctx->sqo_mm))) return -EFAULT; kthread_use_mm(ctx->sqo_mm); } @@ -4332,6 +4383,7 @@ static void io_async_task_func(struct callback_head *cb) if (io_poll_rewait(req, &apoll->poll)) { spin_unlock_irq(&ctx->completion_lock); + percpu_ref_put(&ctx->refs); return; } @@ -4346,7 +4398,7 @@ static void io_async_task_func(struct callback_head *cb) } } - io_poll_remove_double(req, apoll->double_poll); + io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); /* restore ->work in case we need to retry again */ @@ -4356,7 +4408,6 @@ static void io_async_task_func(struct callback_head *cb) kfree(apoll); if (!canceled) { - __set_current_state(TASK_RUNNING); if (io_sq_thread_acquire_mm(ctx, req)) { io_cqring_add_event(req, -EFAULT); goto end_req; @@ -4370,6 +4421,7 @@ end_req: req_set_fail_links(req); io_double_put_req(req); } + percpu_ref_put(&ctx->refs); } static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -4472,8 +4524,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req) ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); - if (ret) { - io_poll_remove_double(req, apoll->double_poll); + if (ret || ipt.error) { + io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); if (req->flags & REQ_F_WORK_INITIALIZED) memcpy(&req->work, &apoll->work, sizeof(req->work)); @@ -4507,14 +4559,13 @@ static bool io_poll_remove_one(struct io_kiocb *req) { bool do_complete; + io_poll_remove_double(req); + if (req->opcode == IORING_OP_POLL_ADD) { - io_poll_remove_double(req, req->io); do_complete = __io_poll_remove_one(req, &req->poll); } else { struct async_poll *apoll = req->apoll; - io_poll_remove_double(req, apoll->double_poll); - /* non-poll requests have submit ref still */ do_complete = __io_poll_remove_one(req, &apoll->poll); if (do_complete) { @@ -4536,6 +4587,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(req->ctx); req->flags |= REQ_F_COMP_LOCKED; + req_set_fail_links(req); io_put_req(req); } @@ -4709,6 +4761,23 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +static int __io_timeout_cancel(struct io_kiocb *req) +{ + int ret; + + list_del_init(&req->list); + + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + if (ret == -1) + return -EALREADY; + + req_set_fail_links(req); + req->flags |= REQ_F_COMP_LOCKED; + io_cqring_fill_event(req, -ECANCELED); + io_put_req(req); + return 0; +} + static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) { struct io_kiocb *req; @@ -4716,7 +4785,6 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) list_for_each_entry(req, &ctx->timeout_list, list) { if (user_data == req->user_data) { - list_del_init(&req->list); ret = 0; break; } @@ -4725,14 +4793,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -ENOENT) return ret; - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); - if (ret == -1) - return -EALREADY; - - req_set_fail_links(req); - io_cqring_fill_event(req, -ECANCELED); - io_put_req(req); - return 0; + return __io_timeout_cancel(req); } static int io_timeout_remove_prep(struct io_kiocb *req, @@ -6082,8 +6143,7 @@ static int io_sq_thread(void *data) if (!list_empty(&ctx->poll_list) || need_resched() || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { - if (current->task_works) - task_work_run(); + io_run_task_work(); cond_resched(); continue; } @@ -6115,8 +6175,7 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); break; } - if (current->task_works) { - task_work_run(); + if (io_run_task_work()) { finish_wait(&ctx->sqo_wait, &wait); continue; } @@ -6145,8 +6204,7 @@ static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } - if (current->task_works) - task_work_run(); + io_run_task_work(); io_sq_thread_drop_mm(ctx); revert_creds(old_cred); @@ -6211,9 +6269,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { if (io_cqring_events(ctx, false) >= min_events) return 0; - if (!current->task_works) + if (!io_run_task_work()) break; - task_work_run(); } while (1); if (sig) { @@ -6235,8 +6292,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); /* make sure we run task_work before checking for signals */ - if (current->task_works) - task_work_run(); + if (io_run_task_work()) + continue; if (signal_pending(current)) { if (current->jobctl & JOBCTL_TASK_WORK) { spin_lock_irq(¤t->sighand->siglock); @@ -7086,6 +7143,9 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, return SIZE_MAX; #endif + if (sq_offset) + *sq_offset = off; + sq_array_size = array_size(sizeof(u32), sq_entries); if (sq_array_size == SIZE_MAX) return SIZE_MAX; @@ -7093,9 +7153,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, if (check_add_overflow(off, sq_array_size, &off)) return SIZE_MAX; - if (sq_offset) - *sq_offset = off; - return off; } @@ -7488,6 +7545,98 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data) return work->files == files; } +/* + * Returns true if 'preq' is the link parent of 'req' + */ +static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) +{ + struct io_kiocb *link; + + if (!(preq->flags & REQ_F_LINK_HEAD)) + return false; + + list_for_each_entry(link, &preq->link_list, link_list) { + if (link == req) + return true; + } + + return false; +} + +/* + * We're looking to cancel 'req' because it's holding on to our files, but + * 'req' could be a link to another request. See if it is, and cancel that + * parent request if so. + */ +static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + struct hlist_node *tmp; + struct io_kiocb *preq; + bool found = false; + int i; + + spin_lock_irq(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(preq, tmp, list, hash_node) { + found = io_match_link(preq, req); + if (found) { + io_poll_remove_one(preq); + break; + } + } + } + spin_unlock_irq(&ctx->completion_lock); + return found; +} + +static bool io_timeout_remove_link(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_kiocb *preq; + bool found = false; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry(preq, &ctx->timeout_list, list) { + found = io_match_link(preq, req); + if (found) { + __io_timeout_cancel(preq); + break; + } + } + spin_unlock_irq(&ctx->completion_lock); + return found; +} + +static bool io_cancel_link_cb(struct io_wq_work *work, void *data) +{ + return io_match_link(container_of(work, struct io_kiocb, work), data); +} + +static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + enum io_wq_cancel cret; + + /* cancel this particular work, if it's running */ + cret = io_wq_cancel_work(ctx->io_wq, &req->work); + if (cret != IO_WQ_CANCEL_NOTFOUND) + return; + + /* find links that hold this pending, cancel those */ + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true); + if (cret != IO_WQ_CANCEL_NOTFOUND) + return; + + /* if we have a poll link holding this pending, cancel that */ + if (io_poll_remove_link(ctx, req)) + return; + + /* final option, timeout link is holding this req pending */ + io_timeout_remove_link(ctx, req); +} + static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { @@ -7529,10 +7678,10 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, clear_bit(0, &ctx->cq_check_overflow); ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } - spin_unlock_irq(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); /* * Put inflight ref and overflow ref. If that's @@ -7544,7 +7693,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, continue; } } else { - io_wq_cancel_work(ctx->io_wq, &cancel_req->work); + /* cancel this request, or head link requests */ + io_attempt_cancel(ctx, cancel_req); io_put_req(cancel_req); } @@ -7655,8 +7805,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, int submitted = 0; struct fd f; - if (current->task_works) - task_work_run(); + io_run_task_work(); if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) return -EINVAL; @@ -7828,6 +7977,10 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_rings *rings; size_t size, sq_array_offset; + /* make sure these are sane, as we already accounted them */ + ctx->sq_entries = p->sq_entries; + ctx->cq_entries = p->cq_entries; + size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; @@ -7844,8 +7997,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->cq_ring_entries = p->cq_entries; ctx->sq_mask = rings->sq_ring_mask; ctx->cq_mask = rings->cq_ring_mask; - ctx->sq_entries = rings->sq_ring_entries; - ctx->cq_entries = rings->cq_ring_entries; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e4944436e733..5493a0da23dd 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1367,8 +1367,10 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) int ret; /* Buffer got discarded which means block device got invalidated */ - if (!buffer_mapped(bh)) + if (!buffer_mapped(bh)) { + unlock_buffer(bh); return -EIO; + } trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index f20cff1194bb..776493713153 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -590,10 +590,14 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) int ret; uint32_t now = JFFS2_NOW(); + mutex_lock(&f->sem); for (fd = f->dents ; fd; fd = fd->next) { - if (fd->ino) + if (fd->ino) { + mutex_unlock(&f->sem); return -ENOTEMPTY; + } } + mutex_unlock(&f->sem); ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, f, now); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 06b342d8462b..e23b3f62483c 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -912,7 +912,7 @@ repeat: } fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, - &name, 0); + NULL, 0); iput(inode); } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 7cb5fd38eb14..7b09a9158e40 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -150,6 +150,25 @@ static int minix_remount (struct super_block * sb, int * flags, char * data) return 0; } +static bool minix_check_superblock(struct super_block *sb) +{ + struct minix_sb_info *sbi = minix_sb(sb); + + if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0) + return false; + + /* + * s_max_size must not exceed the block mapping limitation. This check + * is only needed for V1 filesystems, since V2/V3 support an extra level + * of indirect blocks which places the limit well above U32_MAX. + */ + if (sbi->s_version == MINIX_V1 && + sb->s_maxbytes > (7 + 512 + 512*512) * BLOCK_SIZE) + return false; + + return true; +} + static int minix_fill_super(struct super_block *s, void *data, int silent) { struct buffer_head *bh; @@ -185,7 +204,7 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) sbi->s_zmap_blocks = ms->s_zmap_blocks; sbi->s_firstdatazone = ms->s_firstdatazone; sbi->s_log_zone_size = ms->s_log_zone_size; - sbi->s_max_size = ms->s_max_size; + s->s_maxbytes = ms->s_max_size; s->s_magic = ms->s_magic; if (s->s_magic == MINIX_SUPER_MAGIC) { sbi->s_version = MINIX_V1; @@ -216,7 +235,7 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) sbi->s_zmap_blocks = m3s->s_zmap_blocks; sbi->s_firstdatazone = m3s->s_firstdatazone; sbi->s_log_zone_size = m3s->s_log_zone_size; - sbi->s_max_size = m3s->s_max_size; + s->s_maxbytes = m3s->s_max_size; sbi->s_ninodes = m3s->s_ninodes; sbi->s_nzones = m3s->s_zones; sbi->s_dirsize = 64; @@ -228,11 +247,12 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) } else goto out_no_fs; + if (!minix_check_superblock(s)) + goto out_illegal_sb; + /* * Allocate the buffer map to keep the superblock small. */ - if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0) - goto out_illegal_sb; i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh); map = kzalloc(i, GFP_KERNEL); if (!map) @@ -468,6 +488,13 @@ static struct inode *V1_minix_iget(struct inode *inode) iget_failed(inode); return ERR_PTR(-EIO); } + if (raw_inode->i_nlinks == 0) { + printk("MINIX-fs: deleted inode referenced: %lu\n", + inode->i_ino); + brelse(bh); + iget_failed(inode); + return ERR_PTR(-ESTALE); + } inode->i_mode = raw_inode->i_mode; i_uid_write(inode, raw_inode->i_uid); i_gid_write(inode, raw_inode->i_gid); @@ -501,6 +528,13 @@ static struct inode *V2_minix_iget(struct inode *inode) iget_failed(inode); return ERR_PTR(-EIO); } + if (raw_inode->i_nlinks == 0) { + printk("MINIX-fs: deleted inode referenced: %lu\n", + inode->i_ino); + brelse(bh); + iget_failed(inode); + return ERR_PTR(-ESTALE); + } inode->i_mode = raw_inode->i_mode; i_uid_write(inode, raw_inode->i_uid); i_gid_write(inode, raw_inode->i_gid); diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c index 043c3fdbc8e7..446148792f41 100644 --- a/fs/minix/itree_common.c +++ b/fs/minix/itree_common.c @@ -75,6 +75,7 @@ static int alloc_branch(struct inode *inode, int n = 0; int i; int parent = minix_new_block(inode); + int err = -ENOSPC; branch[0].key = cpu_to_block(parent); if (parent) for (n = 1; n < num; n++) { @@ -85,6 +86,11 @@ static int alloc_branch(struct inode *inode, break; branch[n].key = cpu_to_block(nr); bh = sb_getblk(inode->i_sb, parent); + if (!bh) { + minix_free_block(inode, nr); + err = -ENOMEM; + break; + } lock_buffer(bh); memset(bh->b_data, 0, bh->b_size); branch[n].bh = bh; @@ -103,7 +109,7 @@ static int alloc_branch(struct inode *inode, bforget(branch[i].bh); for (i = 0; i < n; i++) minix_free_block(inode, block_to_cpu(branch[i].key)); - return -ENOSPC; + return err; } static inline int splice_branch(struct inode *inode, diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c index 046cc96ee7ad..1fed906042aa 100644 --- a/fs/minix/itree_v1.c +++ b/fs/minix/itree_v1.c @@ -29,12 +29,12 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) if (block < 0) { printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n", block, inode->i_sb->s_bdev); - } else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) { - if (printk_ratelimit()) - printk("MINIX-fs: block_to_path: " - "block %ld too big on dev %pg\n", - block, inode->i_sb->s_bdev); - } else if (block < 7) { + return 0; + } + if ((u64)block * BLOCK_SIZE >= inode->i_sb->s_maxbytes) + return 0; + + if (block < 7) { offsets[n++] = block; } else if ((block -= 7) < 512) { offsets[n++] = 7; diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c index f7fc7ecccccc..9d00f31a2d9d 100644 --- a/fs/minix/itree_v2.c +++ b/fs/minix/itree_v2.c @@ -32,13 +32,12 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) if (block < 0) { printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n", block, sb->s_bdev); - } else if ((u64)block * (u64)sb->s_blocksize >= - minix_sb(sb)->s_max_size) { - if (printk_ratelimit()) - printk("MINIX-fs: block_to_path: " - "block %ld too big on dev %pg\n", - block, sb->s_bdev); - } else if (block < DIRCOUNT) { + return 0; + } + if ((u64)block * (u64)sb->s_blocksize >= sb->s_maxbytes) + return 0; + + if (block < DIRCOUNT) { offsets[n++] = block; } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) { offsets[n++] = DIRCOUNT; diff --git a/fs/minix/minix.h b/fs/minix/minix.h index df081e8afcc3..168d45d3de73 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -32,7 +32,6 @@ struct minix_sb_info { unsigned long s_zmap_blocks; unsigned long s_firstdatazone; unsigned long s_log_zone_size; - unsigned long s_max_size; int s_dirsize; int s_namelen; struct buffer_head ** s_imap; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f96367a2463e..63940a7a70be 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -140,6 +140,7 @@ static int nfs_file_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); + errseq_t since; dprintk("NFS: flush(%pD2)\n", file); @@ -148,7 +149,9 @@ nfs_file_flush(struct file *file, fl_owner_t id) return 0; /* Flush writes to the server and return any errors */ - return nfs_wb_all(inode); + since = filemap_sample_wb_err(file->f_mapping); + nfs_wb_all(inode); + return filemap_check_wb_err(file->f_mapping, since); } ssize_t @@ -587,12 +590,14 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .page_mkwrite = nfs_vm_page_mkwrite, }; -static int nfs_need_check_write(struct file *filp, struct inode *inode) +static int nfs_need_check_write(struct file *filp, struct inode *inode, + int error) { struct nfs_open_context *ctx; ctx = nfs_file_open_context(filp); - if (nfs_ctx_key_to_expire(ctx, inode)) + if (nfs_error_is_fatal_on_server(error) || + nfs_ctx_key_to_expire(ctx, inode)) return 1; return 0; } @@ -603,6 +608,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(file); unsigned long written = 0; ssize_t result; + errseq_t since; + int error; result = nfs_key_timeout_notify(file, inode); if (result) @@ -627,6 +634,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_pos > i_size_read(inode)) nfs_revalidate_mapping(inode, file->f_mapping); + since = filemap_sample_wb_err(file->f_mapping); nfs_start_io_write(inode); result = generic_write_checks(iocb, from); if (result > 0) { @@ -645,7 +653,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; /* Return error values */ - if (nfs_need_check_write(file, inode)) { + error = filemap_check_wb_err(file->f_mapping, since); + if (nfs_need_check_write(file, inode, error)) { int err = nfs_wb_all(inode); if (err < 0) result = err; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index de03e440b7ee..048272d60a16 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -790,6 +790,19 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); } +static struct nfs4_pnfs_ds * +ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, int *best_idx) +{ + struct pnfs_layout_segment *lseg = pgio->pg_lseg; + struct nfs4_pnfs_ds *ds; + + ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx, + best_idx); + if (ds || !pgio->pg_mirror_idx) + return ds; + return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx); +} + static void ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, @@ -840,7 +853,7 @@ retry: goto out_nolseg; } - ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx); + ds = ff_layout_get_ds_for_read(pgio, &ds_idx); if (!ds) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; @@ -1028,11 +1041,24 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) } } +static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) +{ + u32 idx = hdr->pgio_mirror_idx + 1; + int new_idx = 0; + + if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx + 1, &new_idx)) + ff_layout_send_layouterror(hdr->lseg); + else + pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); + pnfs_read_resend_pnfs(hdr, new_idx); +} + static void ff_layout_reset_read(struct nfs_pgio_header *hdr) { struct rpc_task *task = &hdr->task; pnfs_layoutcommit_inode(hdr->inode, false); + pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { dprintk("%s Reset task %5u for i/o through MDS " @@ -1234,6 +1260,12 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, break; case NFS4ERR_NXIO: ff_layout_mark_ds_unreachable(lseg, idx); + /* + * Don't return the layout if this is a read and we still + * have layouts to try + */ + if (opnum == OP_READ) + break; /* Fallthrough */ default: pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, @@ -1247,7 +1279,6 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { - int new_idx = hdr->pgio_mirror_idx; int err; if (task->tk_status < 0) { @@ -1267,10 +1298,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task, clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); switch (err) { case -NFS4ERR_RESET_TO_PNFS: - if (ff_layout_choose_best_ds_for_read(hdr->lseg, - hdr->pgio_mirror_idx + 1, - &new_idx)) - goto out_layouterror; set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: @@ -1281,10 +1308,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task, } return 0; -out_layouterror: - ff_layout_read_record_layoutstats_done(task, hdr); - ff_layout_send_layouterror(hdr->lseg); - hdr->pgio_mirror_idx = new_idx; out_eagain: rpc_restart_call_prepare(task); return -EAGAIN; @@ -1411,10 +1434,9 @@ static void ff_layout_read_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_read_record_layoutstats_done(&hdr->task, hdr); - if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) { - ff_layout_send_layouterror(hdr->lseg); - pnfs_read_resend_pnfs(hdr); - } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + ff_layout_resend_pnfs_read(hdr); + else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) ff_layout_reset_read(hdr); pnfs_generic_rw_release(data); } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 8e5d6223ddd3..a33970765467 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -110,6 +110,7 @@ static int nfs4_file_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); + errseq_t since; dprintk("NFS: flush(%pD2)\n", file); @@ -125,7 +126,9 @@ nfs4_file_flush(struct file *file, fl_owner_t id) return filemap_fdatawrite(file->f_mapping); /* Flush writes to the server and return any errors */ - return nfs_wb_all(inode); + since = filemap_sample_wb_err(file->f_mapping); + nfs_wb_all(inode); + return filemap_check_wb_err(file->f_mapping, since); } #ifdef CONFIG_NFS_V4_2 diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2e2dac29a9e9..45e0585e0667 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5845,8 +5845,6 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, return ret; if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) return -ENOENT; - if (buflen < label.len) - return -ERANGE; return 0; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 47817ef0aadb..4e0d8a3b89b6 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4166,7 +4166,11 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, return -EIO; if (len < NFS4_MAXLABELLEN) { if (label) { - memcpy(label->label, p, len); + if (label->len) { + if (label->len < len) + return -ERANGE; + memcpy(label->label, p, len); + } label->len = len; label->pi = pi; label->lfs = lfs; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index dd2e14f5875d..75e988caf3cd 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1226,31 +1226,27 @@ out: return status; } +static bool +pnfs_layout_segments_returnable(struct pnfs_layout_hdr *lo, + enum pnfs_iomode iomode, + u32 seq) +{ + struct pnfs_layout_range recall_range = { + .length = NFS4_MAX_UINT64, + .iomode = iomode, + }; + return pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, + &recall_range, seq) != -EBUSY; +} + /* Return true if layoutreturn is needed */ static bool pnfs_layout_need_return(struct pnfs_layout_hdr *lo) { - struct pnfs_layout_segment *s; - enum pnfs_iomode iomode; - u32 seq; - if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) return false; - - seq = lo->plh_return_seq; - iomode = lo->plh_return_iomode; - - /* Defer layoutreturn until all recalled lsegs are done */ - list_for_each_entry(s, &lo->plh_segs, pls_list) { - if (seq && pnfs_seqid_is_newer(s->pls_seq, seq)) - continue; - if (iomode != IOMODE_ANY && s->pls_range.iomode != iomode) - continue; - if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags)) - return false; - } - - return true; + return pnfs_layout_segments_returnable(lo, lo->plh_return_iomode, + lo->plh_return_seq); } static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) @@ -2392,16 +2388,6 @@ out_forget: return ERR_PTR(-EAGAIN); } -static int -mark_lseg_invalid_or_return(struct pnfs_layout_segment *lseg, - struct list_head *tmp_list) -{ - if (!mark_lseg_invalid(lseg, tmp_list)) - return 0; - pnfs_cache_lseg_for_layoutreturn(lseg->pls_layout, lseg); - return 1; -} - /** * pnfs_mark_matching_lsegs_return - Free or return matching layout segments * @lo: pointer to layout header @@ -2438,7 +2424,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); - if (mark_lseg_invalid_or_return(lseg, tmp_list)) + if (mark_lseg_invalid(lseg, tmp_list)) continue; remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); @@ -2953,7 +2939,8 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr, } /* Resend all requests through pnfs. */ -void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) +void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr, + unsigned int mirror_idx) { struct nfs_pageio_descriptor pgio; @@ -2964,6 +2951,7 @@ void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); + pgio.pg_mirror_idx = mirror_idx; hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr); } } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 8e0ada581b92..2661c44c62db 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -311,7 +311,7 @@ int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_pgio_header *); void pnfs_ld_read_done(struct nfs_pgio_header *); -void pnfs_read_resend_pnfs(struct nfs_pgio_header *); +void pnfs_read_resend_pnfs(struct nfs_pgio_header *, unsigned int mirror_idx); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 9e40dfecf1b1..186fa2c2c6ba 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -747,13 +747,11 @@ struct cld_upcall { }; static int -__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) +__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn) { int ret; struct rpc_pipe_msg msg; struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u); - struct nfsd_net *nn = net_generic(pipe->dentry->d_sb->s_fs_info, - nfsd_net_id); memset(&msg, 0, sizeof(msg)); msg.data = cmsg; @@ -773,7 +771,7 @@ out: } static int -cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) +cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn) { int ret; @@ -782,7 +780,7 @@ cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) * upcalls queued. */ do { - ret = __cld_pipe_upcall(pipe, cmsg); + ret = __cld_pipe_upcall(pipe, cmsg, nn); } while (ret == -EAGAIN); return ret; @@ -1115,7 +1113,7 @@ nfsd4_cld_create(struct nfs4_client *clp) memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); @@ -1180,7 +1178,7 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) } else cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; - ret = cld_pipe_upcall(cn->cn_pipe, cmsg); + ret = cld_pipe_upcall(cn->cn_pipe, cmsg, nn); if (!ret) { ret = cmsg->cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); @@ -1218,7 +1216,7 @@ nfsd4_cld_remove(struct nfs4_client *clp) memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); @@ -1261,7 +1259,7 @@ nfsd4_cld_check_v0(struct nfs4_client *clp) memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); @@ -1404,7 +1402,7 @@ nfsd4_cld_grace_start(struct nfsd_net *nn) } cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) ret = cup->cu_u.cu_msg.cm_status; @@ -1432,7 +1430,7 @@ nfsd4_cld_grace_done_v0(struct nfsd_net *nn) cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; cup->cu_u.cu_msg.cm_u.cm_gracetime = nn->boot_time; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) ret = cup->cu_u.cu_msg.cm_status; @@ -1460,7 +1458,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn) } cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) ret = cup->cu_u.cu_msg.cm_status; @@ -1524,7 +1522,7 @@ nfsd4_cld_get_version(struct nfsd_net *nn) goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; if (ret) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 751bc4dc7466..8e3a369086db 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2871,9 +2871,15 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex) status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE, 0, 0); - if (status < 0) + if (status < 0) { mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status); + if (ex) + up_write(&osb->nfs_sync_rwlock); + else + up_read(&osb->nfs_sync_rwlock); + } + return status; } diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 2dd71d626196..7993d527edae 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -327,8 +327,8 @@ struct ocfs2_super spinlock_t osb_lock; u32 s_next_generation; unsigned long osb_flags; - s16 s_inode_steal_slot; - s16 s_meta_steal_slot; + u16 s_inode_steal_slot; + u16 s_meta_steal_slot; atomic_t s_num_inodes_stolen; atomic_t s_num_meta_stolen; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 45745cc3408a..8c8cf7f4eb34 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -879,9 +879,9 @@ static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type) { spin_lock(&osb->osb_lock); if (type == INODE_ALLOC_SYSTEM_INODE) - osb->s_inode_steal_slot = slot; + osb->s_inode_steal_slot = (u16)slot; else if (type == EXTENT_ALLOC_SYSTEM_INODE) - osb->s_meta_steal_slot = slot; + osb->s_meta_steal_slot = (u16)slot; spin_unlock(&osb->osb_lock); } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 71ea9ce71a6b..1d91dd1e8711 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -78,7 +78,7 @@ struct mount_options unsigned long commit_interval; unsigned long mount_opt; unsigned int atime_quantum; - signed short slot; + unsigned short slot; int localalloc_opt; unsigned int resv_level; int dir_resv_level; @@ -1349,7 +1349,7 @@ static int ocfs2_parse_options(struct super_block *sb, goto bail; } if (option) - mopt->slot = (s16)option; + mopt->slot = (u16)option; break; case Opt_commit: if (match_int(&args[0], &option)) { diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index a9e297eefdff..36714df37d5d 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -269,6 +269,9 @@ static int pstore_compress(const void *in, void *out, { int ret; + if (!IS_ENABLED(CONFIG_PSTORE_COMPRESSION)) + return -EINVAL; + ret = crypto_comp_compress(tfm, in, inlen, out, &outlen); if (ret) { pr_err("crypto_comp_compress failed, ret = %d!\n", ret); @@ -668,7 +671,7 @@ static void decompress_record(struct pstore_record *record) int unzipped_len; char *unzipped, *workspace; - if (!record->compressed) + if (!IS_ENABLED(CONFIG_PSTORE_COMPRESSION) || !record->compressed) return; /* Only PSTORE_TYPE_DMESG support compression. */ diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c index 6b2b4362089e..b57b3ffcbc32 100644 --- a/fs/romfs/storage.c +++ b/fs/romfs/storage.c @@ -217,10 +217,8 @@ int romfs_dev_read(struct super_block *sb, unsigned long pos, size_t limit; limit = romfs_maxsize(sb); - if (pos >= limit) + if (pos >= limit || buflen > limit - pos) return -EIO; - if (buflen > limit - pos) - buflen = limit - pos; #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) diff --git a/fs/signalfd.c b/fs/signalfd.c index 44b6845b071c..5b78719be445 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -314,9 +314,10 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, { sigset_t mask; - if (sizemask != sizeof(sigset_t) || - copy_from_user(&mask, user_mask, sizeof(mask))) + if (sizemask != sizeof(sigset_t)) return -EINVAL; + if (copy_from_user(&mask, user_mask, sizeof(mask))) + return -EFAULT; return do_signalfd4(ufd, &mask, flags); } @@ -325,9 +326,10 @@ SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask, { sigset_t mask; - if (sizemask != sizeof(sigset_t) || - copy_from_user(&mask, user_mask, sizeof(mask))) + if (sizemask != sizeof(sigset_t)) return -EINVAL; + if (copy_from_user(&mask, user_mask, sizeof(mask))) + return -EFAULT; return do_signalfd4(ufd, &mask, 0); } diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 76bb1c846845..8a19773b5a0b 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -87,7 +87,11 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, int error, i; struct bio *bio; - bio = bio_alloc(GFP_NOIO, page_count); + if (page_count <= BIO_MAX_PAGES) + bio = bio_alloc(GFP_NOIO, page_count); + else + bio = bio_kmalloc(GFP_NOIO, page_count); + if (!bio) return -ENOMEM; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index e5ec1afe1c66..2cf05f87565c 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -539,7 +539,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, const struct fscrypt_name *nm, const struct inode *inode, int deletion, int xent) { - int err, dlen, ilen, len, lnum, ino_offs, dent_offs; + int err, dlen, ilen, len, lnum, ino_offs, dent_offs, orphan_added = 0; int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir); int last_reference = !!(deletion && inode->i_nlink == 0); struct ubifs_inode *ui = ubifs_inode(inode); @@ -630,6 +630,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, goto out_finish; } ui->del_cmtno = c->cmt_no; + orphan_added = 1; } err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); @@ -702,7 +703,7 @@ out_release: kfree(dent); out_ro: ubifs_ro_mode(c, err); - if (last_reference) + if (orphan_added) ubifs_delete_orphan(c, inode->i_ino); finish_reservation(c); return err; @@ -1218,7 +1219,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, void *p; union ubifs_key key; struct ubifs_dent_node *dent, *dent2; - int err, dlen1, dlen2, ilen, lnum, offs, len; + int err, dlen1, dlen2, ilen, lnum, offs, len, orphan_added = 0; int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); @@ -1334,6 +1335,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, goto out_finish; } new_ui->del_cmtno = c->cmt_no; + orphan_added = 1; } err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); @@ -1415,7 +1417,7 @@ out_release: release_head(c, BASEHD); out_ro: ubifs_ro_mode(c, err); - if (last_reference) + if (orphan_added) ubifs_delete_orphan(c, new_inode->i_ino); out_finish: finish_reservation(c); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 1da0be667409..e3b69fb280e8 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -101,7 +101,7 @@ static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 gene struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct inode *inode; - if (ino < UFS_ROOTINO || ino > uspi->s_ncg * uspi->s_ipg) + if (ino < UFS_ROOTINO || ino > (u64)uspi->s_ncg * uspi->s_ipg) return ERR_PTR(-ESTALE); inode = ufs_iget(sb, ino); diff --git a/fs/xattr.c b/fs/xattr.c index 91608d9bfc6a..95f38f57347f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -204,10 +204,22 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, return error; } - +/** + * __vfs_setxattr_locked: set an extended attribute while holding the inode + * lock + * + * @dentry - object to perform setxattr on + * @name - xattr name to set + * @value - value to set @name to + * @size - size of @value + * @flags - flags to pass into filesystem operations + * @delegated_inode - on return, will contain an inode pointer that + * a delegation was broken on, NULL if none. + */ int -vfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) +__vfs_setxattr_locked(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; @@ -216,15 +228,40 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (error) return error; - inode_lock(inode); error = security_inode_setxattr(dentry, name, value, size, flags); if (error) goto out; + error = try_break_deleg(inode, delegated_inode); + if (error) + goto out; + error = __vfs_setxattr_noperm(dentry, name, value, size, flags); out: + return error; +} +EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); + +int +vfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct inode *delegated_inode = NULL; + int error; + +retry_deleg: + inode_lock(inode); + error = __vfs_setxattr_locked(dentry, name, value, size, flags, + &delegated_inode); inode_unlock(inode); + + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); @@ -378,8 +415,18 @@ __vfs_removexattr(struct dentry *dentry, const char *name) } EXPORT_SYMBOL(__vfs_removexattr); +/** + * __vfs_removexattr_locked: set an extended attribute while holding the inode + * lock + * + * @dentry - object to perform setxattr on + * @name - name of xattr to remove + * @delegated_inode - on return, will contain an inode pointer that + * a delegation was broken on, NULL if none. + */ int -vfs_removexattr(struct dentry *dentry, const char *name) +__vfs_removexattr_locked(struct dentry *dentry, const char *name, + struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; @@ -388,11 +435,14 @@ vfs_removexattr(struct dentry *dentry, const char *name) if (error) return error; - inode_lock(inode); error = security_inode_removexattr(dentry, name); if (error) goto out; + error = try_break_deleg(inode, delegated_inode); + if (error) + goto out; + error = __vfs_removexattr(dentry, name); if (!error) { @@ -401,12 +451,32 @@ vfs_removexattr(struct dentry *dentry, const char *name) } out: + return error; +} +EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); + +int +vfs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + struct inode *delegated_inode = NULL; + int error; + +retry_deleg: + inode_lock(inode); + error = __vfs_removexattr_locked(dentry, name, &delegated_inode); inode_unlock(inode); + + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); - /* * Extended attribute SET operations */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c45acbd3add9..708feb8eac76 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -65,6 +65,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ #define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ +#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */ /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 88221c7a04cc..c6df01a2a158 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -57,7 +57,7 @@ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ (M_IGEO(mp)->ialloc_blks + \ - (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \ + ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \ (M_IGEO(mp)->inobt_maxlevels - 1))) /* diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 7badd6dfe544..955302e7cdde 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -45,9 +45,27 @@ xchk_setup_inode_bmap( */ if (S_ISREG(VFS_I(sc->ip)->i_mode) && sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) { + struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + inode_dio_wait(VFS_I(sc->ip)); - error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping); - if (error) + + /* + * Try to flush all incore state to disk before we examine the + * space mappings for the data fork. Leave accumulated errors + * in the mapping for the writer threads to consume. + * + * On ENOSPC or EIO writeback errors, we continue into the + * extent mapping checks because write failures do not + * necessarily imply anything about the correctness of the file + * metadata. The metadata and the file data could be on + * completely separate devices; a media failure might only + * affect a subset of the disk, etc. We can handle delalloc + * extents in the scrubber, so leaving them in memory is fine. + */ + error = filemap_fdatawrite(mapping); + if (!error) + error = filemap_fdatawait_keep_errors(mapping); + if (error && (error != -ENOSPC && error != -EIO)) goto out; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index f37f5cc4b19f..afdc7f8e0e70 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1567,6 +1567,7 @@ xfs_swap_extents( int lock_flags; uint64_t f; int resblks = 0; + unsigned int flags = 0; /* * Lock the inodes against other IO, page faults and truncate to @@ -1630,17 +1631,16 @@ xfs_swap_extents( resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w); /* - * Handle the corner case where either inode might straddle the - * btree format boundary. If so, the inode could bounce between - * btree <-> extent format on unmap -> remap cycles, freeing and - * allocating a bmapbt block each time. + * If either inode straddles a bmapbt block allocation boundary, + * the rmapbt algorithm triggers repeated allocs and frees as + * extents are remapped. This can exhaust the block reservation + * prematurely and cause shutdown. Return freed blocks to the + * transaction reservation to counter this behavior. */ - if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1)) - resblks += XFS_IFORK_MAXEXT(ip, w); - if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1)) - resblks += XFS_IFORK_MAXEXT(tip, w); + flags |= XFS_TRANS_RES_FDBLKS; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags, + &tp); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index d6cd83317344..938023dd8ce5 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -148,6 +148,7 @@ xfs_qm_dqpurge( error = xfs_bwrite(bp); xfs_buf_relse(bp); } else if (error == -EAGAIN) { + dqp->dq_flags &= ~XFS_DQ_FREEING; goto out_unlock; } xfs_dqflock(dqp); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 107bf2a2f344..d89201d40891 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1003,6 +1003,7 @@ xfs_reflink_remap_extent( xfs_filblks_t rlen; xfs_filblks_t unmap_len; xfs_off_t newlen; + int64_t qres; int error; unmap_len = irec->br_startoff + irec->br_blockcount - destoff; @@ -1025,13 +1026,19 @@ xfs_reflink_remap_extent( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - /* If we're not just clearing space, then do we have enough quota? */ - if (real_extent) { - error = xfs_trans_reserve_quota_nblks(tp, ip, - irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); - if (error) - goto out_cancel; - } + /* + * Reserve quota for this operation. We don't know if the first unmap + * in the dest file will cause a bmap btree split, so we always reserve + * at least enough blocks for that split. If the extent being mapped + * in is written, we need to reserve quota for that too. + */ + qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + if (real_extent) + qres += irec->br_blockcount; + error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_cancel; trace_xfs_reflink_remap(ip, irec->br_startoff, irec->br_blockcount, irec->br_startblock); diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index e9f810fc6731..43585850f154 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -32,9 +32,11 @@ xfs_sysfs_init( struct xfs_kobj *parent_kobj, const char *name) { + struct kobject *parent; + + parent = parent_kobj ? &parent_kobj->kobject : NULL; init_completion(&kobj->complete); - return kobject_init_and_add(&kobj->kobject, ktype, - &parent_kobj->kobject, "%s", name); + return kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name); } static inline void diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 3c94e5ff4316..0ad72a83edac 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -107,7 +107,8 @@ xfs_trans_dup( ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE) | - (tp->t_flags & XFS_TRANS_NO_WRITECOUNT); + (tp->t_flags & XFS_TRANS_NO_WRITECOUNT) | + (tp->t_flags & XFS_TRANS_RES_FDBLKS); /* We gave our writer reference to the new transaction */ tp->t_flags |= XFS_TRANS_NO_WRITECOUNT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); @@ -272,6 +273,8 @@ xfs_trans_alloc( */ WARN_ON(resp->tr_logres > 0 && mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || + xfs_sb_version_haslazysbcount(&mp->m_sb)); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; @@ -365,6 +368,20 @@ xfs_trans_mod_sb( tp->t_blk_res_used += (uint)-delta; if (tp->t_blk_res_used > tp->t_blk_res) xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } else if (delta > 0 && (tp->t_flags & XFS_TRANS_RES_FDBLKS)) { + int64_t blkres_delta; + + /* + * Return freed blocks directly to the reservation + * instead of the global pool, being careful not to + * overflow the trans counter. This is used to preserve + * reservation across chains of transaction rolls that + * repeatedly free and allocate blocks. + */ + blkres_delta = min_t(int64_t, delta, + UINT_MAX - tp->t_blk_res); + tp->t_blk_res += blkres_delta; + delta -= blkres_delta; } tp->t_fdblocks_delta += delta; if (xfs_sb_version_haslazysbcount(&mp->m_sb)) diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index c0f73b82c055..ed0ce8b301b4 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -647,7 +647,7 @@ xfs_trans_dqresv( } } if (ninos > 0) { - total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos; + total_count = dqp->q_res_icount + ninos; timer = be32_to_cpu(dqp->q_core.d_itimer); warns = be16_to_cpu(dqp->q_core.d_iwarns); warnlimit = defq->iwarnlimit; |