diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/bio.c | 33 | ||||
-rw-r--r-- | fs/btrfs/ctree.c | 17 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 19 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 9 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 8 | ||||
-rw-r--r-- | fs/btrfs/file.c | 1 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 171 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 132 | ||||
-rw-r--r-- | fs/btrfs/locking.h | 10 | ||||
-rw-r--r-- | fs/btrfs/qgroup.c | 3 | ||||
-rw-r--r-- | fs/btrfs/ref-verify.c | 1 | ||||
-rw-r--r-- | fs/btrfs/relocation.c | 6 | ||||
-rw-r--r-- | fs/btrfs/scrub.c | 4 | ||||
-rw-r--r-- | fs/btrfs/send.c | 6 | ||||
-rw-r--r-- | fs/btrfs/super.c | 66 | ||||
-rw-r--r-- | fs/btrfs/sysfs.c | 10 | ||||
-rw-r--r-- | fs/btrfs/tree-checker.c | 27 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 4 | ||||
-rw-r--r-- | fs/btrfs/zlib.c | 4 | ||||
-rw-r--r-- | fs/btrfs/zoned.c | 5 |
20 files changed, 362 insertions, 174 deletions
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 1f216d07eff6..7ea6f0b43b95 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -81,6 +81,9 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS, &btrfs_clone_bioset); + if (IS_ERR(bio)) + return ERR_CAST(bio); + bbio = btrfs_bio(bio); btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; @@ -355,7 +358,7 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) btrfs_record_physical_zoned(bbio); btrfs_bio_end_io(bbio, bbio->bio.bi_status); } @@ -398,7 +401,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; btrfs_bio_end_io(bbio, bbio->bio.bi_status); @@ -412,7 +415,7 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); - } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + } else if (bio_is_zone_append(bio)) { stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } @@ -649,8 +652,14 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) map_length = min(map_length, bbio->fs_info->max_zone_append_size); sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, &nr_segs, map_length); - if (sector_offset) - return sector_offset << SECTOR_SHIFT; + if (sector_offset) { + /* + * bio_split_rw_at() could split at a size smaller than our + * sectorsize and thus cause unaligned I/Os. Fix that by + * always rounding down to the nearest boundary. + */ + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + } return map_length; } @@ -678,7 +687,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) &bioc, &smap, &mirror_num); if (error) { ret = errno_to_blk_status(error); - goto fail; + btrfs_bio_counter_dec(fs_info); + goto end_bbio; } map_length = min(map_length, length); @@ -686,7 +696,15 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) map_length = btrfs_append_map_length(bbio, map_length); if (map_length < length) { - bbio = btrfs_split_bio(fs_info, bbio, map_length); + struct btrfs_bio *split; + + split = btrfs_split_bio(fs_info, bbio, map_length); + if (IS_ERR(split)) { + ret = errno_to_blk_status(PTR_ERR(split)); + btrfs_bio_counter_dec(fs_info); + goto end_bbio; + } + bbio = split; bio = &bbio->bio; } @@ -760,6 +778,7 @@ fail: btrfs_bio_end_io(remaining, ret); } +end_bbio: btrfs_bio_end_io(bbio, ret); /* Do not submit another chunk */ return true; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 148648ea1c8b..185985a337b3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -654,6 +654,8 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, goto error_unlock_cow; } } + + trace_btrfs_cow_block(root, buf, cow); if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); @@ -710,7 +712,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; - int ret; if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { btrfs_abort_transaction(trans, -EUCLEAN); @@ -751,12 +752,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, * Also We don't care about the error, as it's handled internally. */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); - ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, - cow_ret, search_start, 0, nest); - - trace_btrfs_cow_block(root, buf, *cow_ret); - - return ret; + return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); } ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); @@ -2046,7 +2043,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info; struct extent_buffer *b; int slot; int ret; @@ -2059,6 +2056,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, int min_write_lock_level; int prev_cmp; + if (!root) + return -EINVAL; + + fs_info = root->fs_info; might_sleep(); lowest_level = p->lowest_level; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 307dedf95c70..2c341956a01c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -371,6 +371,25 @@ static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transi } /* + * Return the generation this root started with. + * + * Every normal root that is created with root->root_key.offset set to it's + * originating generation. If it is a snapshot it is the generation when the + * snapshot was created. + * + * However for TREE_RELOC roots root_key.offset is the objectid of the owning + * tree root. Thankfully we copy the root item of the owning tree root, which + * has it's last_snapshot set to what we would have root_key.offset set to, so + * return that if this is a TREE_RELOC root. + */ +static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root) +{ + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + return btrfs_root_last_snapshot(&root->root_item); + return root->root_key.offset; +} + +/* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 814320948645..eff0dd1ae62f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4262,6 +4262,15 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) * already the cleaner, but below we run all pending delayed iputs. */ btrfs_flush_workqueue(fs_info->fixup_workers); + /* + * Similar case here, we have to wait for delalloc workers before we + * proceed below and stop the cleaner kthread, otherwise we trigger a + * use-after-tree on the cleaner kthread task_struct when a delalloc + * worker running submit_compressed_extents() adds a delayed iput, which + * does a wake up on the cleaner kthread, which was already freed below + * when we call kthread_stop(). + */ + btrfs_flush_workqueue(fs_info->delalloc_workers); /* * After we parked the cleaner kthread, ordered extents may have diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 412e318e4a22..3c6f7fecbb9a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2422,7 +2422,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, goto out; ret = check_delayed_ref(root, path, objectid, offset, bytenr); - } while (ret == -EAGAIN); + } while (ret == -EAGAIN && !path->nowait); out: btrfs_release_path(path); @@ -5285,7 +5285,7 @@ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control * * reference to it. */ generation = btrfs_node_ptr_generation(eb, slot); - if (!wc->update_ref || generation <= root->root_key.offset) + if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) return false; /* @@ -5340,7 +5340,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, goto reada; if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= btrfs_root_origin_generation(root)) continue; /* We don't lock the tree block, it's OK to be racy here */ @@ -5683,7 +5683,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) { + generation <= btrfs_root_origin_generation(root)) { wc->lookup_info = 1; return 1; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 588c353d2969..14e27473c5bc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -911,6 +911,7 @@ again: ret = PTR_ERR(folio); return ret; } + folio_wait_writeback(folio); /* Only support page sized folio yet. */ ASSERT(folio_order(folio) == 0); ret = set_folio_extent_mapped(folio); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 03fe0de2cd0d..27b2fe7f735d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3063,6 +3063,19 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } + /* + * If it's a COW write we need to lock the extent range as we will be + * inserting/replacing file extent items and unpinning an extent map. + * This must be taken before joining a transaction, as it's a higher + * level lock (like the inode's VFS lock), otherwise we can run into an + * ABBA deadlock with other tasks (transactions work like a lock, + * depending on their current state). + */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + clear_bits |= EXTENT_LOCKED; + lock_extent(io_tree, start, end, &cached_state); + } + if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -3099,9 +3112,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); - if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { @@ -9068,9 +9078,9 @@ out: } struct btrfs_encoded_read_private { - wait_queue_head_t wait; + struct completion done; void *uring_ctx; - atomic_t pending; + refcount_t pending_refs; blk_status_t status; }; @@ -9089,14 +9099,14 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (atomic_dec_return(&priv->pending) == 0) { + if (refcount_dec_and_test(&priv->pending_refs)) { int err = blk_status_to_errno(READ_ONCE(priv->status)); if (priv->uring_ctx) { btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - wake_up(&priv->wait); + complete(&priv->done); } } bio_put(&bbio->bio); @@ -9116,8 +9126,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!priv) return -ENOMEM; - init_waitqueue_head(&priv->wait); - atomic_set(&priv->pending, 1); + init_completion(&priv->done); + refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9130,7 +9140,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, @@ -9145,11 +9155,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); if (uring_ctx) { - if (atomic_dec_return(&priv->pending) == 0) { + if (refcount_dec_and_test(&priv->pending_refs)) { ret = blk_status_to_errno(READ_ONCE(priv->status)); btrfs_uring_read_extent_endio(uring_ctx, ret); kfree(priv); @@ -9158,8 +9168,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { - if (atomic_dec_return(&priv->pending) != 0) - io_wait_event(priv->wait, !atomic_read(&priv->pending)); + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&priv->done); /* See btrfs_encoded_read_endio() for ordering. */ ret = blk_status_to_errno(READ_ONCE(priv->status)); kfree(priv); @@ -9789,15 +9799,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; + struct btrfs_path *path = NULL; int ret = 0; u64 isize; - u64 start; + u64 prev_extent_end = 0; + + /* + * Acquire the inode's mmap lock to prevent races with memory mapped + * writes, as they could happen after we flush delalloc below and before + * we lock the extent range further below. The inode was already locked + * up in the call chain. + */ + btrfs_assert_inode_locked(BTRFS_I(inode)); + down_write(&BTRFS_I(inode)->i_mmap_lock); /* * If the swap file was just created, make sure delalloc is done. If the @@ -9806,22 +9826,32 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - return ret; + goto out_unlock_mmap; /* * The inode is locked, so these flags won't change after we check them. */ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; + } + + path = btrfs_alloc_path(); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + if (!path || !backref_ctx) { + ret = -ENOMEM; + goto out_unlock_mmap; } /* @@ -9836,7 +9866,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; + ret = -EBUSY; + goto out_unlock_mmap; } /* @@ -9850,7 +9881,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* * Snapshots can create extents which require COW even if NODATACOW is @@ -9866,11 +9898,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (btrfs_root_dead(root)) { spin_unlock(&root->root_item_lock); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", btrfs_root_id(root)); - return -EPERM; + ret = -EPERM; + goto out_unlock_mmap; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); @@ -9878,24 +9912,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); lock_extent(io_tree, 0, isize - 1, &cached_state); - start = 0; - while (start < isize) { - u64 logical_block_start, physical_block_start; + while (prev_extent_end < isize) { + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; struct btrfs_block_group *bg; - u64 len = isize - start; + u64 logical_block_start; + u64 physical_block_start; + u64 extent_gen; + u64 disk_bytenr; + u64 len; - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = prev_extent_end; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) goto out; - } - if (em->disk_bytenr == EXTENT_MAP_HOLE) { + /* + * If key not found it means we have an implicit hole (NO_HOLES + * is enabled). + */ + if (ret > 0) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } - if (em->disk_bytenr == EXTENT_MAP_INLINE) { + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be @@ -9907,23 +9956,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (extent_map_is_compressed(em)) { + + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } - logical_block_start = extent_map_block_start(em) + (start - em->start); - len = min(len, em->len - (start - em->start)); - free_extent_map(em); - em = NULL; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (disk_bytenr == 0) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } - ret = can_nocow_extent(inode, start, &len, NULL, false, true); + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + prev_extent_end = btrfs_file_extent_end(path); + + if (prev_extent_end > isize) + len = isize - key.offset; + else + len = btrfs_file_extent_num_bytes(leaf, ei); + + backref_ctx->curr_leaf_bytenr = leaf->start; + + /* + * Don't need the path anymore, release to avoid deadlocks when + * calling btrfs_is_data_extent_shared() because when joining a + * transaction it can block waiting for the current one's commit + * which in turn may be trying to lock the same leaf to flush + * delayed items for example. + */ + btrfs_release_path(path); + + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, + extent_gen, backref_ctx); if (ret < 0) { goto out; - } else if (ret) { - ret = 0; - } else { + } else if (ret > 0) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; @@ -9958,7 +10029,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, physical_block_start = (map->stripes[0].physical + (logical_block_start - map->start)); - len = min(len, map->chunk_len - (logical_block_start - map->start)); btrfs_free_chunk_map(map); map = NULL; @@ -9999,20 +10069,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) goto out; } - bsi.start = start; + bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; } - start += len; + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + cond_resched(); } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: - if (!IS_ERR_OR_NULL(em)) - free_extent_map(em); if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); @@ -10025,6 +10098,10 @@ out: btrfs_exclop_finish(fs_info); +out_unlock_mmap: + up_write(&BTRFS_I(inode)->i_mmap_lock); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); if (ret) return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c9302d193187..4d9305fa37a8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4751,6 +4751,9 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss size_t page_offset; ssize_t ret; + /* The inode lock has already been acquired in btrfs_uring_read_extent. */ + btrfs_lockdep_inode_acquire(inode, i_rwsem); + if (priv->err) { ret = priv->err; goto out; @@ -4859,6 +4862,13 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, * and inode and freeing the allocations. */ + /* + * We're returning to userspace with the inode lock held, and that's + * okay - it'll get unlocked in a worker thread. Call + * btrfs_lockdep_inode_release() to avoid confusing lockdep. + */ + btrfs_lockdep_inode_release(inode, i_rwsem); + return -EIOCBQUEUED; out_fail: @@ -4868,25 +4878,29 @@ out_fail: return ret; } +struct btrfs_uring_encoded_data { + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov; + struct iov_iter iter; +}; + static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) { size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; - struct btrfs_ioctl_encoded_io_args args = { 0 }; int ret; u64 disk_bytenr, disk_io_size; struct file *file; struct btrfs_inode *inode; struct btrfs_fs_info *fs_info; struct extent_io_tree *io_tree; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; loff_t pos; struct kiocb kiocb; struct extent_state *cached_state = NULL; u64 start, lockend; void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4900,43 +4914,64 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) - struct btrfs_ioctl_encoded_io_args_32 args32; - copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); - if (copy_from_user(&args32, sqe_addr, copy_end)) { - ret = -EFAULT; - goto out_acct; - } - args.iov = compat_ptr(args32.iov); - args.iovcnt = args32.iovcnt; - args.offset = args32.offset; - args.flags = args32.flags; #else return -ENOTTY; #endif } else { copy_end = copy_end_kernel; - if (copy_from_user(&args, sqe_addr, copy_end)) { - ret = -EFAULT; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; goto out_acct; } - } - if (args.flags != 0) - return -EINVAL; + io_uring_cmd_get_async_data(cmd)->op_data = data; - ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), - &iov, &iter); - if (ret < 0) - goto out_acct; + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; - if (iov_iter_count(&iter) == 0) { - ret = 0; - goto out_free; + if (copy_from_user(&args32, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if (data->args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + data->iov = data->iovstack; + ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_free; + } } - pos = args.offset; - ret = rw_verify_area(READ, file, &pos, args.len); + pos = data->args.offset; + ret = rw_verify_area(READ, file, &pos, data->args.len); if (ret < 0) goto out_free; @@ -4949,15 +4984,16 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue start = ALIGN_DOWN(pos, fs_info->sectorsize); lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, &disk_bytenr, &disk_io_size); if (ret < 0 && ret != -EIOCBQUEUED) goto out_free; file_accessed(file); - if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel, - sizeof(args) - copy_end_kernel)) { + if (copy_to_user(sqe_addr + copy_end, + (const char *)&data->args + copy_end_kernel, + sizeof(data->args) - copy_end_kernel)) { if (ret == -EIOCBQUEUED) { unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -4967,40 +5003,22 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue } if (ret == -EIOCBQUEUED) { - u64 count; - - /* - * If we've optimized things by storing the iovecs on the stack, - * undo this. - */ - if (!iov) { - iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS); - if (!iov) { - unlock_extent(io_tree, start, lockend, &cached_state); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - ret = -ENOMEM; - goto out_acct; - } - - memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt); - } - - count = min_t(u64, iov_iter_count(&iter), disk_io_size); + u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size); /* Match ioctl by not returning past EOF if uncompressed. */ - if (!args.compression) - count = min_t(u64, count, args.len); + if (!data->args.compression) + count = min_t(u64, count, data->args.len); - ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, - cached_state, disk_bytenr, - disk_io_size, count, - args.compression, iov, cmd); + ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend, + cached_state, disk_bytenr, disk_io_size, + count, data->args.compression, + data->iov, cmd); goto out_acct; } out_free: - kfree(iov); + kfree(data->iov); out_acct: if (ret > 0) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 46c8be2afab1..35036b151bf5 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -129,6 +129,16 @@ enum btrfs_lockdep_trans_states { rwsem_release(&owner->lock##_map, _THIS_IP_) /* + * Used to account for the fact that when doing io_uring encoded I/O, we can + * return to userspace with the inode lock still held. + */ +#define btrfs_lockdep_inode_acquire(owner, lock) \ + rwsem_acquire_read(&owner->vfs_inode.lock.dep_map, 0, 0, _THIS_IP_) + +#define btrfs_lockdep_inode_release(owner, lock) \ + rwsem_release(&owner->vfs_inode.lock.dep_map, _THIS_IP_) + +/* * Macros for the transaction states wait events, similar to the generic wait * event macros. */ diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a6f92836c9b1..f9b214992212 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1121,6 +1121,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; if (simple) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); } else { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -1254,8 +1255,6 @@ out_add_root: spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - if (simple) - btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); /* Skip rescan for simple qgroups. */ diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 9522a8b79d22..2928abf7eb82 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -857,6 +857,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "dropping a ref for a root that doesn't have a ref on the block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + rb_erase(&ref->node, &be->refs); kfree(ref); kfree(ra); goto out_unlock; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index bf267bdfa8f8..db8b42f674b7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2902,6 +2902,7 @@ static int relocate_one_folio(struct reloc_control *rc, const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); ASSERT(index <= last_index); +again: folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) { @@ -2937,6 +2938,11 @@ static int relocate_one_folio(struct reloc_control *rc, ret = -EIO; goto release_folio; } + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } /* diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 204c928beaf9..531312efee8d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1541,6 +1541,10 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; + if (unlikely(!extent_root)) { + btrfs_err(fs_info, "no valid extent root for scrub"); + return -EUCLEAN; + } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * stripe->nr_sectors); scrub_stripe_reset_bitmaps(stripe); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7254279c3cc9..498c84323253 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5280,6 +5280,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); +again: folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { page_cache_sync_readahead(mapping, @@ -5312,6 +5313,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) ret = -EIO; break; } + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 97a85d180b61..7dfe5005129a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1885,18 +1885,21 @@ static int btrfs_get_tree_super(struct fs_context *fc) if (sb->s_root) { btrfs_close_devices(fs_devices); - if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) - ret = -EBUSY; + /* + * At this stage we may have RO flag mismatch between + * fc->sb_flags and sb->s_flags. Caller should detect such + * mismatch and reconfigure with sb->s_umount rwsem held if + * needed. + */ } else { snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; ret = btrfs_fill_super(sb, fs_devices); - } - - if (ret) { - deactivate_locked_super(sb); - return ret; + if (ret) { + deactivate_locked_super(sb); + return ret; + } } btrfs_clear_oneshot_options(fs_info); @@ -1982,39 +1985,18 @@ error: * btrfs or not, setting the whole super block RO. To make per-subvolume mounting * work with different options work we need to keep backward compatibility. */ -static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc) +static int btrfs_reconfigure_for_mount(struct fs_context *fc, struct vfsmount *mnt) { - struct vfsmount *mnt; - int ret; - const bool ro2rw = !(fc->sb_flags & SB_RDONLY); - - /* - * We got an EBUSY because our SB_RDONLY flag didn't match the existing - * super block, so invert our setting here and retry the mount so we - * can get our vfsmount. - */ - if (ro2rw) - fc->sb_flags |= SB_RDONLY; - else - fc->sb_flags &= ~SB_RDONLY; - - mnt = fc_mount(fc); - if (IS_ERR(mnt)) - return mnt; + int ret = 0; - if (!ro2rw) - return mnt; + if (fc->sb_flags & SB_RDONLY) + return ret; - /* We need to convert to rw, call reconfigure. */ - fc->sb_flags &= ~SB_RDONLY; down_write(&mnt->mnt_sb->s_umount); - ret = btrfs_reconfigure(fc); + if (!(fc->sb_flags & SB_RDONLY) && (mnt->mnt_sb->s_flags & SB_RDONLY)) + ret = btrfs_reconfigure(fc); up_write(&mnt->mnt_sb->s_umount); - if (ret) { - mntput(mnt); - return ERR_PTR(ret); - } - return mnt; + return ret; } static int btrfs_get_tree_subvol(struct fs_context *fc) @@ -2024,6 +2006,7 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) struct fs_context *dup_fc; struct dentry *dentry; struct vfsmount *mnt; + int ret = 0; /* * Setup a dummy root and fs_info for test/set super. This is because @@ -2066,11 +2049,16 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) fc->security = NULL; mnt = fc_mount(dup_fc); - if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) - mnt = btrfs_reconfigure_for_mount(dup_fc); - put_fs_context(dup_fc); - if (IS_ERR(mnt)) + if (IS_ERR(mnt)) { + put_fs_context(dup_fc); return PTR_ERR(mnt); + } + ret = btrfs_reconfigure_for_mount(dup_fc, mnt); + put_fs_context(dup_fc); + if (ret) { + mntput(mnt); + return ret; + } /* * This free's ->subvol_name, because if it isn't set we have to diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index b843308e2bc6..7f09b6c9cc2d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -295,7 +295,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); /* Remove once support for raid stripe tree is feature complete. */ @@ -329,7 +329,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_FEAT_ATTR_PTR(extent_tree_v2), BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), #endif @@ -1118,7 +1118,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); + return sysfs_emit(buf, "%u\n", fs_info->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -1128,7 +1128,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -1180,7 +1180,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 148d8cefa40e..dfeee033f31f 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1527,6 +1527,11 @@ static int check_extent_item(struct extent_buffer *leaf, dref_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_extent_data_ref_count(leaf, dref); break; /* Contains parent bytenr and ref count */ @@ -1539,6 +1544,11 @@ static int check_extent_item(struct extent_buffer *leaf, inline_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; case BTRFS_EXTENT_OWNER_REF_KEY: @@ -1611,8 +1621,18 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, { u32 expect_item_size = 0; - if (key->type == BTRFS_SHARED_DATA_REF_KEY) + if (key->type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + + sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data backref count, should have non-zero value"); + return -EUCLEAN; + } + expect_item_size = sizeof(struct btrfs_shared_data_ref); + } if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, @@ -1689,6 +1709,11 @@ static int check_extent_data_ref(struct extent_buffer *leaf, offset, leaf->fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid extent data backref count, should have non-zero value"); + return -EUCLEAN; + } } return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1cccaf9c2b0d..3d0ac8bdb21f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -797,6 +797,10 @@ static int get_canonical_dev_path(const char *dev_path, char *canonical) if (ret) goto out; resolved_path = d_path(&path, path_buf, PATH_MAX); + if (IS_ERR(resolved_path)) { + ret = PTR_ERR(resolved_path); + goto out; + } ret = strscpy(canonical, resolved_path, PATH_MAX); out: kfree(path_buf); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ddf0d5a448a7..c9e92c6941ec 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -174,10 +174,10 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; - workspace->strm.avail_in = - (in_buf_folios << PAGE_SHIFT); } workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = min(bytes_left, + in_buf_folios << PAGE_SHIFT); } else { unsigned int pg_off; unsigned int cur_len; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 11ed523e528e..df905ae82929 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -748,8 +748,9 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) (u64)lim->max_segments << PAGE_SHIFT), fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; - if (fs_info->max_zone_append_size < fs_info->max_extent_size) - fs_info->max_extent_size = fs_info->max_zone_append_size; + + fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size, + fs_info->max_zone_append_size); /* * Check mount options here, because we might change fs_info->zoned |