diff options
Diffstat (limited to 'fs/btrfs/ordered-data.c')
-rw-r--r-- | fs/btrfs/ordered-data.c | 359 |
1 files changed, 221 insertions, 138 deletions
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 8a3c46cb67f5..880f9553d79d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -19,7 +19,7 @@ #include "qgroup.h" #include "subpage.h" #include "file.h" -#include "super.h" +#include "block-group.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -124,25 +124,24 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, * look find the first ordered struct that has this offset, otherwise * the first one less than this offset */ -static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, - u64 file_offset) +static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode, + u64 file_offset) { - struct rb_root *root = &tree->tree; struct rb_node *prev = NULL; struct rb_node *ret; struct btrfs_ordered_extent *entry; - if (tree->last) { - entry = rb_entry(tree->last, struct btrfs_ordered_extent, + if (inode->ordered_tree_last) { + entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent, rb_node); if (in_range(file_offset, entry->file_offset, entry->num_bytes)) - return tree->last; + return inode->ordered_tree_last; } - ret = __tree_search(root, file_offset, &prev); + ret = __tree_search(&inode->ordered_tree, file_offset, &prev); if (!ret) ret = prev; if (ret) - tree->last = ret; + inode->ordered_tree_last = ret; return ret; } @@ -154,9 +153,10 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( struct btrfs_ordered_extent *entry; int ret; u64 qgroup_rsv = 0; + const bool is_nocow = (flags & + ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))); - if (flags & - ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { + if (is_nocow) { /* For nocow write, we can release the qgroup rsv right now */ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); if (ret < 0) @@ -171,8 +171,13 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( return ERR_PTR(ret); } entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); - if (!entry) + if (!entry) { + if (!is_nocow) + btrfs_qgroup_free_refroot(inode->root->fs_info, + btrfs_root_id(inode->root), + qgroup_rsv, BTRFS_QGROUP_RSV_DATA); return ERR_PTR(-ENOMEM); + } entry->file_offset = file_offset; entry->num_bytes = num_bytes; @@ -181,7 +186,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( entry->disk_num_bytes = disk_num_bytes; entry->offset = offset; entry->bytes_left = num_bytes; - entry->inode = igrab(&inode->vfs_inode); + entry->inode = BTRFS_I(igrab(&inode->vfs_inode)); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = qgroup_rsv; @@ -192,6 +197,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( INIT_LIST_HEAD(&entry->log_list); INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); + INIT_LIST_HEAD(&entry->bioc_list); init_completion(&entry->completion); /* @@ -208,8 +214,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( static void insert_ordered_extent(struct btrfs_ordered_extent *entry) { - struct btrfs_inode *inode = BTRFS_I(entry->inode); - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_inode *inode = entry->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; @@ -222,13 +227,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) /* One ref for the tree. */ refcount_inc(&entry->refs); - spin_lock_irq(&tree->lock); - node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node); - if (node) + spin_lock_irq(&inode->ordered_tree_lock); + node = tree_insert(&inode->ordered_tree, entry->file_offset, + &entry->rb_node); + if (unlikely(node)) btrfs_panic(fs_info, -EEXIST, "inconsistency in ordered tree at offset %llu", entry->file_offset); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); spin_lock(&root->ordered_extent_lock); list_add_tail(&entry->root_extent_list, @@ -253,7 +259,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) * @disk_bytenr: Offset of extent on disk. * @disk_num_bytes: Size of extent on disk. * @offset: Offset into unencoded data where file data starts. - * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*). * @compress_type: Compression algorithm used for data. * * Most of these parameters correspond to &struct btrfs_file_extent_item. The @@ -264,17 +270,39 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) */ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type) + const struct btrfs_file_extent *file_extent, unsigned long flags) { struct btrfs_ordered_extent *entry; ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); - entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes, - disk_bytenr, disk_num_bytes, offset, flags, - compress_type); + /* + * For regular writes, we just use the members in @file_extent. + * + * For NOCOW, we don't really care about the numbers except @start and + * file_extent->num_bytes, as we won't insert a file extent item at all. + * + * For PREALLOC, we do not use ordered extent members, but + * btrfs_mark_extent_written() handles everything. + * + * So here we always pass 0 as offset for NOCOW/PREALLOC ordered extents, + * or btrfs_split_ordered_extent() cannot handle it correctly. + */ + if (flags & ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))) + entry = alloc_ordered_extent(inode, file_offset, + file_extent->num_bytes, + file_extent->num_bytes, + file_extent->disk_bytenr + file_extent->offset, + file_extent->num_bytes, 0, flags, + file_extent->compression); + else + entry = alloc_ordered_extent(inode, file_offset, + file_extent->num_bytes, + file_extent->ram_bytes, + file_extent->disk_bytenr, + file_extent->disk_num_bytes, + file_extent->offset, flags, + file_extent->compression); if (!IS_ERR(entry)) insert_ordered_extent(entry); return entry; @@ -288,12 +316,17 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum) { - struct btrfs_ordered_inode_tree *tree; + struct btrfs_inode *inode = entry->inode; - tree = &BTRFS_I(entry->inode)->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irq(&inode->ordered_tree_lock); list_add_tail(&sum->list, &entry->list); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); +} + +void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered) +{ + if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + mapping_set_error(ordered->inode->vfs_inode.i_mapping, -EIO); } static void finish_ordered_fn(struct btrfs_work *work) @@ -305,18 +338,18 @@ static void finish_ordered_fn(struct btrfs_work *work) } static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 len, bool uptodate) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - lockdep_assert_held(&inode->ordered_tree.lock); + lockdep_assert_held(&inode->ordered_tree_lock); - if (page) { - ASSERT(page->mapping); - ASSERT(page_offset(page) <= file_offset); - ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE); + if (folio) { + ASSERT(folio->mapping); + ASSERT(folio_pos(folio) <= file_offset); + ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio)); /* * Ordered (Private2) bit indicates whether we still have @@ -324,16 +357,16 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * * If there's no such bit, we need to skip to next range. */ - if (!btrfs_page_test_ordered(fs_info, page, file_offset, len)) + if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len)) return false; - btrfs_page_clear_ordered(fs_info, page, file_offset, len); + btrfs_folio_clear_ordered(fs_info, folio, file_offset, len); } /* Now we're fine to update the accounting. */ if (WARN_ON_ONCE(len > ordered->bytes_left)) { btrfs_crit(fs_info, "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu", - inode->root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(inode->root), btrfs_ino(inode), ordered->file_offset, ordered->num_bytes, len, ordered->bytes_left); ordered->bytes_left = 0; @@ -360,39 +393,70 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ? fs_info->endio_freespace_worker : fs_info->endio_write_workers; - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL); btrfs_queue_work(wq, &ordered->work); } -bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, u64 len, +void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct folio *folio, u64 file_offset, u64 len, bool uptodate) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; unsigned long flags; bool ret; trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); - spin_lock_irqsave(&inode->ordered_tree.lock, flags); - ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); - spin_unlock_irqrestore(&inode->ordered_tree.lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); + ret = can_finish_ordered_extent(ordered, folio, file_offset, len, + uptodate); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + + /* + * If this is a COW write it means we created new extent maps for the + * range and they point to unwritten locations if we got an error either + * before submitting a bio or during IO. + * + * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we + * are queuing its completion below. During completion, at + * btrfs_finish_one_ordered(), we will drop the extent maps for the + * unwritten extents. + * + * However because completion runs in a work queue we can end up having + * a fast fsync running before that. In the case of direct IO, once we + * unlock the inode the fsync might start, and we queue the completion + * before unlocking the inode. In the case of buffered IO when writeback + * finishes (end_bbio_data_write()) we queue the completion, so if the + * writeback was triggered by a fast fsync, the fsync might start + * logging before ordered extent completion runs in the work queue. + * + * The fast fsync will log file extent items based on the extent maps it + * finds, so if by the time it collects extent maps the ordered extent + * completion didn't happen yet, it will log file extent items that + * point to unwritten extents, resulting in a corruption if a crash + * happens and the log tree is replayed. Note that a fast fsync does not + * wait for completion of ordered extents in order to reduce latency. + * + * Set a flag in the inode so that the next fast fsync will wait for + * ordered extents to complete before starting to log. + */ + if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags); if (ret) btrfs_queue_ordered_fn(ordered); - return ret; } /* * Mark all ordered extents io inside the specified range finished. * - * @page: The involved page for the operation. - * For uncompressed buffered IO, the page status also needs to be + * @folio: The involved folio for the operation. + * For uncompressed buffered IO, the folio status also needs to be * updated to indicate whether the pending ordered io is finished. * Can be NULL for direct IO and compressed write. * For these cases, callers are ensured they won't execute the @@ -402,10 +466,9 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * extent(s) covering it. */ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 num_bytes, bool uptodate) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; @@ -415,13 +478,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, file_offset + num_bytes - 1, uptodate); - spin_lock_irqsave(&tree->lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); while (cur < file_offset + num_bytes) { u64 entry_end; u64 end; u32 len; - node = tree_search(tree, cur); + node = ordered_tree_search(inode, cur); /* No ordered extents at all */ if (!node) break; @@ -467,14 +530,14 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, ASSERT(end + 1 - cur < U32_MAX); len = end + 1 - cur; - if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { - spin_unlock_irqrestore(&tree->lock, flags); + if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) { + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); btrfs_queue_ordered_fn(entry); - spin_lock_irqsave(&tree->lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); } cur += len; } - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); } /* @@ -498,19 +561,18 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; bool finished = false; - spin_lock_irqsave(&tree->lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); if (cached && *cached) { entry = *cached; goto have_entry; } - node = tree_search(tree, file_offset); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -541,7 +603,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_dec_test_pending(inode, entry); } - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); return finished; } @@ -554,14 +616,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) struct list_head *cur; struct btrfs_ordered_sum *sum; - trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry); + trace_btrfs_ordered_extent_put(entry->inode, entry); if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->root_extent_list)); ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) - btrfs_add_delayed_iput(BTRFS_I(entry->inode)); + btrfs_add_delayed_iput(entry->inode); while (!list_empty(&entry->list)) { cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); @@ -579,7 +641,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry) { - struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = btrfs_inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; @@ -593,7 +654,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, freespace_inode = btrfs_is_free_space_inode(btrfs_inode); btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered); - /* This is paired with btrfs_alloc_ordered_extent. */ + /* This is paired with alloc_ordered_extent(). */ spin_lock(&btrfs_inode->lock); btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); @@ -612,16 +673,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, fs_info->delalloc_batch); - tree = &btrfs_inode->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irq(&btrfs_inode->ordered_tree_lock); node = &entry->rb_node; - rb_erase(node, &tree->tree); + rb_erase(node, &btrfs_inode->ordered_tree); RB_CLEAR_NODE(node); - if (tree->last == node) - tree->last = NULL; + if (btrfs_inode->ordered_tree_last == node) + btrfs_inode->ordered_tree_last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&btrfs_inode->ordered_tree_lock); /* * The current running transaction is waiting on us, we need to let it @@ -680,11 +740,11 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) } /* - * wait for all the ordered extents in a root. This is done when balancing - * space between drives. + * Wait for all the ordered extents in a root. Use @bg as range or do whole + * range if it's NULL. */ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, - const u64 range_start, const u64 range_len) + const struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = root->fs_info; LIST_HEAD(splice); @@ -692,7 +752,17 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; u64 count = 0; - const u64 range_end = range_start + range_len; + u64 range_start, range_len; + u64 range_end; + + if (bg) { + range_start = bg->start; + range_len = bg->length; + } else { + range_start = 0; + range_len = U64_MAX; + } + range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); spin_lock(&root->ordered_extent_lock); @@ -714,15 +784,15 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, - btrfs_run_ordered_extent_work, NULL, NULL); + btrfs_run_ordered_extent_work, NULL); list_add_tail(&ordered->work_list, &works); btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work); cond_resched(); - spin_lock(&root->ordered_extent_lock); if (nr != U64_MAX) nr--; count++; + spin_lock(&root->ordered_extent_lock); } list_splice_tail(&skipped, &root->ordered_extents); list_splice_tail(&splice, &root->ordered_extents); @@ -739,8 +809,12 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, return count; } +/* + * Wait for @nr ordered extents that intersect the @bg, or the whole range of + * the filesystem if @bg is NULL. + */ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, - const u64 range_start, const u64 range_len) + const struct btrfs_block_group *bg) { struct btrfs_root *root; LIST_HEAD(splice); @@ -758,14 +832,13 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); - done = btrfs_wait_ordered_extents(root, nr, - range_start, range_len); + done = btrfs_wait_ordered_extents(root, nr, bg); btrfs_put_root(root); - spin_lock(&fs_info->ordered_root_lock); - if (nr != U64_MAX) { + if (nr != U64_MAX) nr -= done; - } + + spin_lock(&fs_info->ordered_root_lock); } list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); @@ -782,7 +855,7 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; - struct btrfs_inode *inode = BTRFS_I(entry->inode); + struct btrfs_inode *inode = entry->inode; bool freespace_inode; trace_btrfs_ordered_extent_start(inode, entry); @@ -809,7 +882,7 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) /* * Used to wait on ordered extents across a large range of bytes. */ -int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len) { int ret = 0; int ret_wb = 0; @@ -839,11 +912,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) * before the ordered extents complete - to avoid failures (-EEXIST) * when adding the new ordered extents to the ordered tree. */ - ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end); + ret_wb = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, orig_end); end = orig_end; while (1) { - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end); + ordered = btrfs_lookup_first_ordered_extent(inode, end); if (!ordered) break; if (ordered->file_offset > orig_end) { @@ -878,14 +951,12 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; - tree = &inode->ordered_tree; - spin_lock_irqsave(&tree->lock, flags); - node = tree_search(tree, file_offset); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -897,7 +968,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino trace_btrfs_ordered_extent_lookup(inode, entry); } out: - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); return entry; } @@ -907,15 +978,13 @@ out: struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); - node = tree_search(tree, file_offset); + spin_lock_irq(&inode->ordered_tree_lock); + node = ordered_tree_search(inode, file_offset); if (!node) { - node = tree_search(tree, file_offset + len); + node = ordered_tree_search(inode, file_offset + len); if (!node) goto out; } @@ -939,7 +1008,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_range(inode, entry); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -950,13 +1019,12 @@ out: void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct list_head *list) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *n; - ASSERT(inode_is_locked(&inode->vfs_inode)); + btrfs_assert_inode_locked(inode); - spin_lock_irq(&tree->lock); - for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + spin_lock_irq(&inode->ordered_tree_lock); + for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { struct btrfs_ordered_extent *ordered; ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); @@ -969,7 +1037,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, refcount_inc(&ordered->refs); trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); } /* @@ -979,13 +1047,11 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); - node = tree_search(tree, file_offset); + spin_lock_irq(&inode->ordered_tree_lock); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -993,7 +1059,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_first(inode, entry); out: - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -1009,15 +1075,14 @@ out: struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct rb_node *cur; struct rb_node *prev; struct rb_node *next; struct btrfs_ordered_extent *entry = NULL; - spin_lock_irq(&tree->lock); - node = tree->tree.rb_node; + spin_lock_irq(&inode->ordered_tree_lock); + node = inode->ordered_tree.rb_node; /* * Here we don't want to use tree_search() which will use tree->last * and screw up the search order. @@ -1071,7 +1136,7 @@ out: trace_btrfs_ordered_extent_lookup_first_range(inode, entry); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -1149,8 +1214,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct btrfs_ordered_extent *btrfs_split_ordered_extent( struct btrfs_ordered_extent *ordered, u64 len) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_inode *inode = ordered->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 file_offset = ordered->file_offset; @@ -1171,6 +1235,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) return ERR_PTR(-EINVAL); + /* + * If our ordered extent had an error there's no point in continuing. + * The error may have come from a transaction abort done either by this + * task or some other concurrent task, and the transaction abort path + * iterates over all existing ordered extents and sets the flag + * BTRFS_ORDERED_IOERR on them. + */ + if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) { + const int fs_error = BTRFS_FS_ERROR(fs_info); + + return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO); + } /* We cannot split partially completed ordered extents. */ if (ordered->bytes_left) { ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); @@ -1189,15 +1265,32 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( /* One ref for the tree. */ refcount_inc(&new->refs); + /* + * Take the root's ordered_extent_lock to avoid a race with + * btrfs_wait_ordered_extents() when updating the disk_bytenr and + * disk_num_bytes fields of the ordered extent below. And we disable + * IRQs because the inode's ordered_tree_lock is used in IRQ context + * elsewhere. + * + * There's no concern about a previous caller of + * btrfs_wait_ordered_extents() getting the trimmed ordered extent + * before we insert the new one, because even if it gets the ordered + * extent before it's trimmed and the new one inserted, right before it + * uses it or during its use, the ordered extent might have been + * trimmed in the meanwhile, and it missed the new ordered extent. + * There's no way around this and it's harmless for current use cases, + * so we take the root's ordered_extent_lock to fix that race during + * trimming and silence tools like KCSAN. + */ spin_lock_irq(&root->ordered_extent_lock); - spin_lock(&tree->lock); - /* Remove from tree once */ - node = &ordered->rb_node; - rb_erase(node, &tree->tree); - RB_CLEAR_NODE(node); - if (tree->last == node) - tree->last = NULL; + spin_lock(&inode->ordered_tree_lock); + /* + * We don't have overlapping ordered extents (that would imply double + * allocation of extents) and we checked above that the split length + * does not cross the ordered extent's num_bytes field, so there's + * no need to remove it and re-insert it in the tree. + */ ordered->file_offset += len; ordered->disk_bytenr += len; ordered->num_bytes -= len; @@ -1227,19 +1320,12 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( offset += sum->len; } - /* Re-insert the node */ - node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); - if (node) - btrfs_panic(fs_info, -EEXIST, - "zoned: inconsistency in ordered tree at offset %llu", - ordered->file_offset); - - node = tree_insert(&tree->tree, new->file_offset, &new->rb_node); - if (node) + node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node); + if (unlikely(node)) btrfs_panic(fs_info, -EEXIST, - "zoned: inconsistency in ordered tree at offset %llu", + "inconsistency in ordered tree at offset %llu after split", new->file_offset); - spin_unlock(&tree->lock); + spin_unlock(&inode->ordered_tree_lock); list_add_tail(&new->root_extent_list, &root->ordered_extents); root->nr_ordered_extents++; @@ -1249,10 +1335,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( int __init ordered_data_init(void) { - btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", - sizeof(struct btrfs_ordered_extent), 0, - SLAB_MEM_SPREAD, - NULL); + btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0); if (!btrfs_ordered_extent_cache) return -ENOMEM; |