diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/backref.h | 4 | ||||
-rw-r--r-- | fs/btrfs/block-group.c | 3 | ||||
-rw-r--r-- | fs/btrfs/direct-io.c | 4 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 5 | ||||
-rw-r--r-- | fs/btrfs/extent_io.h | 2 | ||||
-rw-r--r-- | fs/btrfs/extent_map.c | 132 | ||||
-rw-r--r-- | fs/btrfs/extent_map.h | 3 | ||||
-rw-r--r-- | fs/btrfs/file.c | 21 | ||||
-rw-r--r-- | fs/btrfs/free-space-tree.c | 16 | ||||
-rw-r--r-- | fs/btrfs/fs.h | 2 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 333 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 4 | ||||
-rw-r--r-- | fs/btrfs/ordered-data.c | 14 | ||||
-rw-r--r-- | fs/btrfs/raid56.c | 5 | ||||
-rw-r--r-- | fs/btrfs/super.c | 13 | ||||
-rw-r--r-- | fs/btrfs/tests/extent-io-tests.c | 6 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 377 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 6 | ||||
-rw-r--r-- | fs/btrfs/zstd.c | 2 |
19 files changed, 541 insertions, 411 deletions
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e8c22cccb5c1..7dfcc9351bce 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -427,8 +427,8 @@ struct btrfs_backref_node *btrfs_backref_alloc_node( struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache); -#define LINK_LOWER (1 << 0) -#define LINK_UPPER (1 << 1) +#define LINK_LOWER (1U << 0) +#define LINK_UPPER (1U << 1) void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, struct btrfs_backref_node *lower, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index aa8656c8b7e7..dd35e29d8082 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2780,8 +2780,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) /* Already aborted the transaction if it failed. */ next: btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); + spin_unlock(&fs_info->unused_bgs_lock); /* * If the block group is still unused, add it to the list of diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index bd38df5647e3..71984d7db839 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -151,8 +151,8 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, } ordered = btrfs_alloc_ordered_extent(inode, start, file_extent, - (1 << type) | - (1 << BTRFS_ORDERED_DIRECT)); + (1U << type) | + (1U << BTRFS_ORDERED_DIRECT)); if (IS_ERR(ordered)) { if (em) { free_extent_map(em); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 147c50ef912a..e655fa3bfd9b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2168,8 +2168,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) - ret = PTR_ERR(root); + ret = PTR_ERR(root); break; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); @@ -2786,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_scrub(fs_info); btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); + btrfs_init_extent_map_shrinker_work(fs_info); rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; @@ -4335,6 +4335,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); + cancel_work_sync(&fs_info->extent_map_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index fcb60837d7dc..039a73731135 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -79,7 +79,7 @@ enum { * single word in a bitmap may straddle two pages in the extent buffer. */ #define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) -#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BYTE_MASK ((1U << BITS_PER_BYTE) - 1) #define BITMAP_FIRST_BYTE_MASK(start) \ ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) #define BITMAP_LAST_BYTE_MASK(nbits) \ diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 1d93e1202c33..36af9aa9aab1 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1128,11 +1128,14 @@ struct btrfs_em_shrink_ctx { static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) { - const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info); struct extent_map_tree *tree = &inode->extent_tree; long nr_dropped = 0; struct rb_node *node; + lockdep_assert_held_write(&tree->lock); + /* * Take the mmap lock so that we serialize with the inode logging phase * of fsync because we may need to set the full sync flag on the inode, @@ -1144,28 +1147,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c * to find new extents, which may not be there yet because ordered * extents haven't completed yet. * - * We also do a try lock because otherwise we could deadlock. This is - * because the shrinker for this filesystem may be invoked while we are - * in a path that is holding the mmap lock in write mode. For example in - * a reflink operation while COWing an extent buffer, when allocating - * pages for a new extent buffer and under memory pressure, the shrinker - * may be invoked, and therefore we would deadlock by attempting to read - * lock the mmap lock while we are holding already a write lock on it. + * We also do a try lock because we don't want to block for too long and + * we are holding the extent map tree's lock in write mode. */ if (!down_read_trylock(&inode->i_mmap_lock)) return 0; - /* - * We want to be fast so if the lock is busy we don't want to spend time - * waiting for it - either some task is about to do IO for the inode or - * we may have another task shrinking extent maps, here in this code, so - * skip this inode. - */ - if (!write_trylock(&tree->lock)) { - up_read(&inode->i_mmap_lock); - return 0; - } - node = rb_first(&tree->root); while (node) { struct rb_node *next = rb_next(node); @@ -1201,36 +1188,89 @@ next: * lock. This is to avoid slowing other tasks trying to take the * lock. */ - if (need_resched() || rwlock_needbreak(&tree->lock)) + if (need_resched() || rwlock_needbreak(&tree->lock) || + btrfs_fs_closing(fs_info)) break; node = next; } - write_unlock(&tree->lock); up_read(&inode->i_mmap_lock); return nr_dropped; } +static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, + u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + struct extent_map_tree *tree; + + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + + tree = &inode->extent_tree; + + /* + * We want to be fast so if the lock is busy we don't want to + * spend time waiting for it (some task is about to do IO for + * the inode). + */ + if (!write_trylock(&tree->lock)) + goto next; + + /* + * Skip inode if it doesn't have loaded extent maps, so we avoid + * getting a reference and doing an iput later. This includes + * cases like files that were opened for things like stat(2), or + * files with all extent maps previously released through the + * release folio callback (btrfs_release_folio()) or released in + * a previous run, or directories which never have extent maps. + */ + if (RB_EMPTY_ROOT(&tree->root)) { + write_unlock(&tree->lock); + goto next; + } + + if (igrab(&inode->vfs_inode)) + break; + + write_unlock(&tree->lock); +next: + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; long nr_dropped = 0; u64 min_ino = ctx->last_ino + 1; - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); + write_unlock(&inode->extent_tree.lock); min_ino = btrfs_ino(inode) + 1; ctx->last_ino = btrfs_ino(inode); - btrfs_add_delayed_iput(inode); + iput(&inode->vfs_inode); - if (ctx->scanned >= ctx->nr_to_scan) + if (ctx->scanned >= ctx->nr_to_scan || + btrfs_fs_closing(fs_info)) break; cond_resched(); - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); } if (inode) { @@ -1254,16 +1294,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx return nr_dropped; } -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +static void btrfs_extent_map_shrinker_worker(struct work_struct *work) { + struct btrfs_fs_info *fs_info; struct btrfs_em_shrink_ctx ctx; u64 start_root_id; u64 next_root_id; bool cycled = false; long nr_dropped = 0; + fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work); + ctx.scanned = 0; - ctx.nr_to_scan = nr_to_scan; + ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); /* * In case we have multiple tasks running this shrinker, make the next @@ -1281,12 +1324,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan, nr, ctx.last_root, ctx.last_ino); } - while (ctx.scanned < ctx.nr_to_scan) { + while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { struct btrfs_root *root; unsigned long count; @@ -1344,5 +1387,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) ctx.last_ino); } - return nr_dropped; + atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); +} + +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +{ + /* + * Do nothing if the shrinker is already running. In case of high memory + * pressure we can have a lot of tasks calling us and all passing the + * same nr_to_scan value, but in reality we may need only to free + * nr_to_scan extent maps (or less). In case we need to free more than + * that, we will be called again by the fs shrinker, so no worries about + * not doing enough work to reclaim memory from extent maps. + * We can also be repeatedly called with the same nr_to_scan value + * simply because the shrinker runs asynchronously and multiple calls + * to this function are made before the shrinker does enough progress. + * + * That's why we set the atomic counter to nr_to_scan only if its + * current value is zero, instead of incrementing the counter by + * nr_to_scan. + */ + if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0) + return; + + queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work); +} + +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) +{ + atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); + INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker); } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 5154a8f1d26c..cd123b266b64 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map *new_em, bool modified); -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index eaa991e69804..0e63603ac5c7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1912,6 +1912,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct extent_changeset *data_reserved = NULL; unsigned long zero_start; loff_t size; + size_t fsize = folio_size(folio); vm_fault_t ret; int ret2; int reserved = 0; @@ -1922,7 +1923,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) ASSERT(folio_order(folio) == 0); - reserved_space = PAGE_SIZE; + reserved_space = fsize; sb_start_pagefault(inode->i_sb); page_start = folio_pos(folio); @@ -1976,7 +1977,7 @@ again: * We can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish. */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize); if (ordered) { unlock_extent(io_tree, page_start, page_end, &cached_state); folio_unlock(folio); @@ -1988,11 +1989,11 @@ again: if (folio->index == ((size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); - if (reserved_space < PAGE_SIZE) { + if (reserved_space < fsize) { end = page_start + reserved_space - 1; btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, page_start, - PAGE_SIZE - reserved_space, true); + data_reserved, end + 1, + fsize - reserved_space, true); } } @@ -2019,12 +2020,12 @@ again: if (page_start + folio_size(folio) > size) zero_start = offset_in_folio(folio, size); else - zero_start = PAGE_SIZE; + zero_start = fsize; - if (zero_start != PAGE_SIZE) + if (zero_start != fsize) folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); - btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); @@ -2033,7 +2034,7 @@ again: unlock_extent(io_tree, page_start, page_end, &cached_state); up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; @@ -2042,7 +2043,7 @@ out_unlock: folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, reserved_space, (ret != 0)); out_noreserve: diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 7ba50e133921..308abbf8855b 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1104,11 +1104,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); if (ret < 0) goto out_locked; - ASSERT(ret == 0); + /* + * If ret is 1 (no key found), it means this is an empty block group, + * without any extents allocated from it and there's no block group + * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree + * because we are using the block group tree feature, so block group + * items are stored in the block group tree. It also means there are no + * extents allocated for block groups with a start offset beyond this + * block group's end offset (this is the last, highest, block group). + */ + if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE)) + ASSERT(ret == 0); start = block_group->start; end = block_group->start + block_group->length; - while (1) { + while (ret == 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.type == BTRFS_EXTENT_ITEM_KEY || @@ -1138,8 +1148,6 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_next_item(extent_root, path); if (ret < 0) goto out_locked; - if (ret) - break; } if (start < end) { ret = __add_to_free_space_tree(trans, block_group, path2, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index bb822e425d7f..374843aca60d 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -639,6 +639,8 @@ struct btrfs_fs_info { spinlock_t extent_map_shrinker_lock; u64 extent_map_shrinker_last_root; u64 extent_map_shrinker_last_ino; + atomic64_t extent_map_shrinker_nr_to_scan; + struct work_struct extent_map_shrinker_work; /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1ab5b0c1b9b7..f84e3f9fad84 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1249,7 +1249,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1 << BTRFS_ORDERED_COMPRESSED); + 1U << BTRFS_ORDERED_COMPRESSED); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -1409,6 +1409,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode, alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); /* + * We're not doing compressed IO, don't unlock the first page (which + * the caller expects to stay locked), don't clear any dirty bits and + * don't set any writeback bits. + * + * Do set the Ordered (Private2) bit so we know this page was properly + * setup for writepage. + */ + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops |= PAGE_SET_ORDERED; + + /* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. Normally writeback for relocation data * extents follows a NOCOW path because relocation preallocates the @@ -1452,8 +1463,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode, continue; } if (done_offset) { - *done_offset = start - 1; - return 0; + /* + * Move @end to the end of the processed range, + * and exit the loop to unlock the processed extents. + */ + end = start - 1; + ret = 0; + break; } ret = -ENOSPC; } @@ -1470,6 +1486,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode, file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; + /* + * Locked range will be released either during error clean up or + * after the whole range is finished. + */ lock_extent(&inode->io_tree, start, start + ram_size - 1, &cached); @@ -1484,7 +1504,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1 << BTRFS_ORDERED_REGULAR); + 1U << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { unlock_extent(&inode->io_tree, start, start + ram_size - 1, &cached); @@ -1515,27 +1535,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* - * We're not doing compressed IO, don't unlock the first page - * (which the caller expects to stay locked), don't clear any - * dirty bits and don't set any writeback bits - * - * Do set the Ordered (Private2) bit so we know this page was - * properly setup for writepage. - */ - page_ops = (keep_locked ? 0 : PAGE_UNLOCK); - page_ops |= PAGE_SET_ORDERED; - - extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - locked_folio, &cached, - EXTENT_LOCKED | EXTENT_DELALLOC, - page_ops); - if (num_bytes < cur_alloc_size) + if (num_bytes < ram_size) num_bytes = 0; else - num_bytes -= cur_alloc_size; + num_bytes -= ram_size; alloc_hint = ins.objectid + ins.offset; - start += cur_alloc_size; + start += ram_size; extent_reserved = false; /* @@ -1546,6 +1551,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } + extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); done: if (done_offset) *done_offset = end; @@ -1561,40 +1568,35 @@ out_unlock: * Now, we have three regions to clean up: * * |-------(1)----|---(2)---|-------------(3)----------| - * `- orig_start `- start `- start + cur_alloc_size `- end + * `- orig_start `- start `- start + ram_size `- end * * We process each region below. */ - clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; - /* * For the range (1). We have already instantiated the ordered extents * for this region. They are cleaned up by * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are - * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | - * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup - * function. + * btrfs_run_delalloc_range(). + * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV + * are also handled by the cleanup function. * - * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_folio) to ensure all the pages are unlocked. + * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and + * finish the writeback of the involved folios, which will be never submitted. */ - if (keep_locked && orig_start < start) { + if (orig_start < start) { + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_folio, NULL, 0, page_ops); + locked_folio, NULL, clear_bits, page_ops); } - /* - * At this point we're unlocked, we want to make sure we're only - * clearing these flags under the extent lock, so lock the rest of the - * range and clear everything up. - */ - lock_extent(&inode->io_tree, start, end, NULL); + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * For the range (2). If we reserved an extent for our delalloc range @@ -1608,11 +1610,11 @@ out_unlock: */ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, - start + cur_alloc_size - 1, + start + ram_size - 1, locked_folio, &cached, clear_bits, page_ops); - btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); - start += cur_alloc_size; + btrfs_qgroup_free_data(inode, NULL, start, ram_size, NULL); + start += ram_size; } /* @@ -2055,6 +2057,63 @@ static void cleanup_dirty_folios(struct btrfs_inode *inode, mapping_set_error(mapping, error); } +static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, + struct extent_state **cached, + struct can_nocow_file_extent_args *nocow_args, + u64 file_pos, bool is_prealloc) +{ + struct btrfs_ordered_extent *ordered; + u64 len = nocow_args->file_extent.num_bytes; + u64 end = file_pos + len - 1; + int ret = 0; + + lock_extent(&inode->io_tree, file_pos, end, cached); + + if (is_prealloc) { + struct extent_map *em; + + em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, + BTRFS_ORDERED_PREALLOC); + if (IS_ERR(em)) { + unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(em); + } + free_extent_map(em); + } + + ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent, + is_prealloc + ? (1U << BTRFS_ORDERED_PREALLOC) + : (1U << BTRFS_ORDERED_NOCOW)); + if (IS_ERR(ordered)) { + if (is_prealloc) + btrfs_drop_extent_map_range(inode, file_pos, end, false); + unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(ordered); + } + + if (btrfs_is_data_reloc_root(inode->root)) + /* + * Errors are handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler from freeing + * metadata of the created ordered extent. + */ + ret = btrfs_reloc_clone_csums(ordered); + btrfs_put_ordered_extent(ordered); + + extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_ORDERED); + + /* + * btrfs_reloc_clone_csums() error, now we're OK to call error handler, + * as metadata for created ordered extent will only be freed by + * btrfs_finish_ordered_io(). + */ + return ret; +} + /* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. @@ -2099,15 +2158,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; - struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct extent_state *cached_state = NULL; u64 extent_end; - u64 nocow_end; int extent_type; - bool is_prealloc; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); @@ -2242,67 +2298,13 @@ must_cow: } } - nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; - lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); - - is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; - if (is_prealloc) { - struct extent_map *em; - - em = btrfs_create_io_em(inode, cur_offset, - &nocow_args.file_extent, - BTRFS_ORDERED_PREALLOC); - if (IS_ERR(em)) { - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - btrfs_dec_nocow_writers(nocow_bg); - ret = PTR_ERR(em); - goto error; - } - free_extent_map(em); - } - - ordered = btrfs_alloc_ordered_extent(inode, cur_offset, - &nocow_args.file_extent, - is_prealloc - ? (1 << BTRFS_ORDERED_PREALLOC) - : (1 << BTRFS_ORDERED_NOCOW)); + ret = nocow_one_range(inode, locked_folio, &cached_state, + &nocow_args, cur_offset, + extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); - if (IS_ERR(ordered)) { - if (is_prealloc) { - btrfs_drop_extent_map_range(inode, cur_offset, - nocow_end, false); - } - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - ret = PTR_ERR(ordered); + if (ret < 0) goto error; - } - - if (btrfs_is_data_reloc_root(root)) - /* - * Error handled later, as we must prevent - * extent_clear_unlock_delalloc() in error handler - * from freeing metadata of created ordered extent. - */ - ret = btrfs_reloc_clone_csums(ordered); - btrfs_put_ordered_extent(ordered); - - extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_folio, &cached_state, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); - cur_offset = extent_end; - - /* - * btrfs_reloc_clone_csums() error, now we're OK to call error - * handler, as metadata for created ordered extent will only - * be freed by btrfs_finish_ordered_io(). - */ - if (ret) - goto error; } btrfs_release_path(path); @@ -4732,7 +4734,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; int ret = 0; struct btrfs_trans_handle *trans; - u64 last_unlink_trans; struct fscrypt_name fname; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) @@ -4758,6 +4759,23 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) goto out_notrans; } + /* + * Propagate the last_unlink_trans value of the deleted dir to its + * parent directory. This is to prevent an unrecoverable log tree in the + * case we do something like this: + * 1) create dir foo + * 2) create snapshot under dir foo + * 3) delete the snapshot + * 4) rmdir foo + * 5) mkdir foo + * 6) fsync foo or some file inside foo + * + * This is because we can't unlink other roots when replaying the dir + * deletes for directory foo. + */ + if (BTRFS_I(inode)->last_unlink_trans >= trans->transid) + btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); + if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; @@ -4767,27 +4785,11 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (ret) goto out; - last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; - /* now the directory is empty */ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); - if (!ret) { + if (!ret) btrfs_i_size_write(BTRFS_I(inode), 0); - /* - * Propagate the last_unlink_trans value of the deleted dir to - * its parent directory. This is to prevent an unrecoverable - * log tree in the case we do something like this: - * 1) create dir foo - * 2) create snapshot under dir foo - * 3) delete the snapshot - * 4) rmdir foo - * 5) mkdir foo - * 6) fsync foo or some file inside foo - */ - if (last_unlink_trans >= trans->transid) - BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; - } out: btrfs_end_transaction(trans); out_notrans: @@ -7997,6 +7999,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + bool logs_pinned = false; struct fscrypt_name old_fname, new_fname; struct fscrypt_str *old_name, *new_name; @@ -8120,6 +8123,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && + new_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not for + * root entries) pin the log early to prevent any concurrent + * task from logging the directory after we removed the old + * entries and before we add the new entries, otherwise that + * task can sync a log without any entry for the inodes we are + * renaming and therefore replaying that log, if a power failure + * happens after syncing the log, would result in deleting the + * inodes. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8177,30 +8205,23 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. + * Do the log updates for all inodes. + * + * If either entry is for a root we don't need to update the logs since + * we've called btrfs_set_log_full_commit() before. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(dest); - - /* Do the log updates for all inodes. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) { btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), old_rename_ctx.index, new_dentry->d_parent); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), new_rename_ctx.index, old_dentry->d_parent); + } - /* Now unpin the logs. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) +out_fail: + if (logs_pinned) { btrfs_end_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_end_log_trans(dest); -out_fail: + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -8250,6 +8271,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; + bool logs_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -8384,6 +8406,29 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not a + * root entry) pin the log to prevent any concurrent task from + * logging the directory after we removed the old entry and + * before we add the new entry, otherwise that task can sync + * a log without any entry for the inode we are renaming and + * therefore replaying that log, if a power failure happens + * after syncing the log, would result in deleting the inode. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8432,7 +8477,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), rename_ctx.index, new_dentry->d_parent); @@ -8448,6 +8493,10 @@ static int btrfs_rename(struct mnt_idmap *idmap, } } out_fail: + if (logs_pinned) { + btrfs_end_log_trans(root); + btrfs_end_log_trans(dest); + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -9683,8 +9732,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - (1 << BTRFS_ORDERED_ENCODED) | - (1 << BTRFS_ORDERED_COMPRESSED)); + (1U << BTRFS_ORDERED_ENCODED) | + (1U << BTRFS_ORDERED_COMPRESSED)); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3e3722a73239..1706f6d9b12e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -758,14 +758,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap, goto out; } + btrfs_record_new_subvolume(trans, BTRFS_I(dir)); + ret = btrfs_create_new_inode(trans, &new_inode_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_record_new_subvolume(trans, BTRFS_I(dir)); - d_instantiate_new(dentry, new_inode_args.inode); new_inode_args.inode = NULL; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 4ed11b089ea9..880f9553d79d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -153,9 +153,10 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( struct btrfs_ordered_extent *entry; int ret; u64 qgroup_rsv = 0; + const bool is_nocow = (flags & + ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))); - if (flags & - ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { + if (is_nocow) { /* For nocow write, we can release the qgroup rsv right now */ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); if (ret < 0) @@ -170,8 +171,13 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( return ERR_PTR(ret); } entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); - if (!entry) + if (!entry) { + if (!is_nocow) + btrfs_qgroup_free_refroot(inode->root->fs_info, + btrfs_root_id(inode->root), + qgroup_rsv, BTRFS_QGROUP_RSV_DATA); return ERR_PTR(-ENOMEM); + } entry->file_offset = file_offset; entry->num_bytes = num_bytes; @@ -253,7 +259,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) * @disk_bytenr: Offset of extent on disk. * @disk_num_bytes: Size of extent on disk. * @offset: Offset into unencoded data where file data starts. - * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*). * @compress_type: Compression algorithm used for data. * * Most of these parameters correspond to &struct btrfs_file_extent_item. The diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 39bec672df0c..8afadf994b8c 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -200,8 +200,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) struct btrfs_stripe_hash_table *x; struct btrfs_stripe_hash *cur; struct btrfs_stripe_hash *h; - int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; - int i; + unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS; if (info->stripe_hash_table) return 0; @@ -222,7 +221,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) h = table->table; - for (i = 0; i < num_entries; i++) { + for (unsigned int i = 0; i < num_entries; i++) { cur = h + i; INIT_LIST_HEAD(&cur->hash_list); spin_lock_init(&cur->lock); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index bcb8def4ade2..6119a06b0569 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -28,7 +28,6 @@ #include <linux/btrfs.h> #include <linux/security.h> #include <linux/fs_parser.h> -#include <linux/swap.h> #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -2399,16 +2398,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); struct btrfs_fs_info *fs_info = btrfs_sb(sb); - /* - * We may be called from any task trying to allocate memory and we don't - * want to slow it down with scanning and dropping extent maps. It would - * also cause heavy lock contention if many tasks concurrently enter - * here. Therefore only allow kswapd tasks to scan and drop extent maps. - */ - if (!current_is_kswapd()) - return 0; + btrfs_free_extent_maps(fs_info, nr_to_scan); - return btrfs_free_extent_maps(fs_info, nr_to_scan); + /* The extent map shrinker runs asynchronously, so always return 0. */ + return 0; } static const struct super_operations btrfs_super_ops = { diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 0a2dbfaaf49e..de226209220f 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -14,9 +14,9 @@ #include "../disk-io.h" #include "../btrfs_inode.h" -#define PROCESS_UNLOCK (1 << 0) -#define PROCESS_RELEASE (1 << 1) -#define PROCESS_TEST_LOCKED (1 << 2) +#define PROCESS_UNLOCK (1U << 0) +#define PROCESS_RELEASE (1U << 1) +#define PROCESS_TEST_LOCKED (1U << 2) static noinline int process_page_range(struct inode *inode, u64 start, u64 end, unsigned long flags) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9637c7cdc0cf..16b4474ded4b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -138,11 +138,14 @@ static void wait_log_commit(struct btrfs_root *root, int transid); * and once to do all the other items. */ -static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) { unsigned int nofs_flag; struct inode *inode; + /* Only meant to be called for subvolume roots and not for log roots. */ + ASSERT(is_fstree(btrfs_root_id(root))); + /* * We're holding a transaction handle whether we are logging or * replaying a log tree, so we must make sure NOFS semantics apply @@ -154,7 +157,10 @@ static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) inode = btrfs_iget(objectid, root); memalloc_nofs_restore(nofs_flag); - return inode; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return BTRFS_I(inode); } /* @@ -610,21 +616,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, return 0; } -/* - * simple helper to read an inode off the disk from a given root - * This can only be called for subvolume roots and not for the log - */ -static noinline struct inode *read_one_inode(struct btrfs_root *root, - u64 objectid) -{ - struct inode *inode; - - inode = btrfs_iget_logging(objectid, root); - if (IS_ERR(inode)) - inode = NULL; - return inode; -} - /* replays a single extent in 'eb' at 'slot' with 'key' into the * subvolume 'root'. path is released on entry and should be released * on exit. @@ -650,7 +641,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, u64 start = key->offset; u64 nbytes = 0; struct btrfs_file_extent_item *item; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; unsigned long size; int ret = 0; @@ -674,23 +665,19 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = ALIGN(start + size, fs_info->sectorsize); } else { - ret = 0; - goto out; + return 0; } - inode = read_one_inode(root, key->objectid); - if (!inode) { - ret = -EIO; - goto out; - } + inode = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); /* * first check to see if we already have this extent in the * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. */ - ret = btrfs_lookup_file_extent(trans, root, path, - btrfs_ino(BTRFS_I(inode)), start, 0); + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0); if (ret == 0 && (found_type == BTRFS_FILE_EXTENT_REG || @@ -724,7 +711,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, drop_args.start = start; drop_args.end = extent_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; @@ -902,16 +889,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, - extent_end - start); + ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); if (ret) goto out; update_inode: - btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); + ret = btrfs_update_inode(trans, inode); out: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -948,7 +934,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di) { struct btrfs_root *root = dir->root; - struct inode *inode; + struct btrfs_inode *inode; struct fscrypt_str name; struct extent_buffer *leaf; struct btrfs_key location; @@ -963,9 +949,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -973,10 +960,11 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); out: kfree(name.name); - iput(inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1087,7 +1075,9 @@ again: search_key.type = BTRFS_INODE_REF_KEY; search_key.offset = parent_objectid; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret == 0) { + if (ret < 0) { + return ret; + } else if (ret == 0) { struct btrfs_inode_ref *victim_ref; unsigned long ptr; unsigned long ptr_end; @@ -1149,7 +1139,7 @@ again: u32 item_size; u32 cur_offset = 0; unsigned long base; - struct inode *victim_parent; + struct btrfs_inode *victim_parent; leaf = path->nodes[0]; @@ -1160,13 +1150,13 @@ again: struct fscrypt_str victim_name; extref = (struct btrfs_inode_extref *)(base + cur_offset); + victim_name.len = btrfs_inode_extref_name_len(leaf, extref); if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) goto next; ret = read_alloc_one_name(leaf, &extref->name, - btrfs_inode_extref_name_len(leaf, extref), - &victim_name); + victim_name.len, &victim_name); if (ret) return ret; @@ -1181,18 +1171,18 @@ again: kfree(victim_name.name); return ret; } else if (!ret) { - ret = -ENOENT; - victim_parent = read_one_inode(root, - parent_objectid); - if (victim_parent) { + victim_parent = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(victim_parent)) { + ret = PTR_ERR(victim_parent); + } else { inc_nlink(&inode->vfs_inode); btrfs_release_path(path); ret = unlink_inode_for_log_replay(trans, - BTRFS_I(victim_parent), + victim_parent, inode, &victim_name); + iput(&victim_parent->vfs_inode); } - iput(victim_parent); kfree(victim_name.name); if (ret) return ret; @@ -1326,19 +1316,18 @@ again: ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); if (!ret) { - struct inode *dir; + struct btrfs_inode *dir; btrfs_release_path(path); - dir = read_one_inode(root, parent_id); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_id, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); kfree(name.name); goto out; } - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), - inode, &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (ret) goto out; goto again; @@ -1370,8 +1359,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir = NULL; - struct inode *inode = NULL; + struct btrfs_inode *dir = NULL; + struct btrfs_inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; struct fscrypt_str name = { 0 }; @@ -1404,15 +1393,17 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * copy the back ref in. The link count fixup code will take * care of the rest */ - dir = read_one_inode(root, parent_objectid); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + dir = NULL; goto out; } - inode = read_one_inode(root, inode_objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(inode_objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -1424,11 +1415,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * parent object can change from one array * item to another. */ - if (!dir) - dir = read_one_inode(root, parent_objectid); if (!dir) { - ret = -ENOENT; - goto out; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + dir = NULL; + goto out; + } } } else { ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); @@ -1436,8 +1429,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, &name); + ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + ref_index, &name); if (ret < 0) { goto out; } else if (ret == 0) { @@ -1448,8 +1441,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - ret = __add_inode_ref(trans, root, path, log, - BTRFS_I(dir), BTRFS_I(inode), + ret = __add_inode_ref(trans, root, path, log, dir, inode, inode_objectid, parent_objectid, ref_index, &name); if (ret) { @@ -1459,12 +1451,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* insert our name */ - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &name, 0, ref_index); + ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); if (ret) goto out; - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } @@ -1474,7 +1465,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, kfree(name.name); name.name = NULL; if (log_ref_ver) { - iput(dir); + iput(&dir->vfs_inode); dir = NULL; } } @@ -1487,8 +1478,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * dir index entries exist for a name but there is no inode reference * item with the same name. */ - ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, - key); + ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); if (ret) goto out; @@ -1497,8 +1487,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); kfree(name.name); - iput(dir); - iput(inode); + if (dir) + iput(&dir->vfs_inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1670,12 +1662,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, { int ret; struct btrfs_key key; - struct inode *inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) break; @@ -1697,14 +1690,14 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; btrfs_release_path(path); - inode = read_one_inode(root, key.offset); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.offset, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } - ret = fixup_inode_link_count(trans, inode); - iput(inode); + ret = fixup_inode_link_count(trans, &inode->vfs_inode); + iput(&inode->vfs_inode); if (ret) break; @@ -1732,12 +1725,14 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, { struct btrfs_key key; int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; + struct inode *vfs_inode; - inode = read_one_inode(root, objectid); - if (!inode) - return -EIO; + inode = btrfs_iget_logging(objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; @@ -1746,15 +1741,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (ret == 0) { - if (!inode->i_nlink) - set_nlink(inode, 1); + if (!vfs_inode->i_nlink) + set_nlink(vfs_inode, 1); else - inc_nlink(inode); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + inc_nlink(vfs_inode); + ret = btrfs_update_inode(trans, inode); } else if (ret == -EEXIST) { ret = 0; } - iput(inode); + iput(vfs_inode); return ret; } @@ -1770,27 +1765,26 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_key *location) { - struct inode *inode; - struct inode *dir; + struct btrfs_inode *inode; + struct btrfs_inode *dir; int ret; - inode = read_one_inode(root, location->objectid); - if (!inode) - return -ENOENT; + inode = btrfs_iget_logging(location->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); - dir = read_one_inode(root, dirid); - if (!dir) { - iput(inode); - return -EIO; + dir = btrfs_iget_logging(dirid, root); + if (IS_ERR(dir)) { + iput(&inode->vfs_inode); + return PTR_ERR(dir); } - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - 1, index); + ret = btrfs_add_link(trans, dir, inode, name, 1, index); /* FIXME, put inode into FIXUP list */ - iput(inode); - iput(dir); + iput(&inode->vfs_inode); + iput(&dir->vfs_inode); return ret; } @@ -1852,16 +1846,16 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool index_dst_matches = false; struct btrfs_key log_key; struct btrfs_key search_key; - struct inode *dir; + struct btrfs_inode *dir; u8 log_flags; bool exists; int ret; bool update_size = true; bool name_added = false; - dir = read_one_inode(root, key->objectid); - if (!dir) - return -EIO; + dir = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(dir)) + return PTR_ERR(dir); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); if (ret) @@ -1882,9 +1876,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir_dst_di); goto out; } else if (dir_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - dir_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; dir_dst_matches = (ret == 1); @@ -1899,9 +1892,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(index_dst_di); goto out; } else if (index_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - index_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; index_dst_matches = (ret == 1); @@ -1956,11 +1948,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); - ret = btrfs_update_inode(trans, BTRFS_I(dir)); + btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2); + ret = btrfs_update_inode(trans, dir); } kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (!ret && name_added) ret = 1; return ret; @@ -2117,16 +2109,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, - struct inode *dir, + struct btrfs_inode *dir, struct btrfs_key *dir_key) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; int ret; struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; struct fscrypt_str name = { 0 }; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; struct btrfs_key location; /* @@ -2163,9 +2155,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -2173,9 +2166,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, if (ret) goto out; - inc_nlink(inode); - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), - &name); + inc_nlink(&inode->vfs_inode); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2185,7 +2177,8 @@ out: btrfs_release_path(path); btrfs_release_path(log_path); kfree(name.name); - iput(inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -2309,7 +2302,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_key dir_key; struct btrfs_key found_key; struct btrfs_path *log_path; - struct inode *dir; + struct btrfs_inode *dir; dir_key.objectid = dirid; dir_key.type = BTRFS_DIR_INDEX_KEY; @@ -2317,14 +2310,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, if (!log_path) return -ENOMEM; - dir = read_one_inode(root, dirid); - /* it isn't an error if the inode isn't there, that can happen - * because we replay the deletes before we copy in the inode item - * from the log + dir = btrfs_iget_logging(dirid, root); + /* + * It isn't an error if the inode isn't there, that can happen because + * we replay the deletes before we copy in the inode item from the log. */ - if (!dir) { + if (IS_ERR(dir)) { btrfs_free_path(log_path); - return 0; + ret = PTR_ERR(dir); + if (ret == -ENOENT) + ret = 0; + return ret; } range_start = 0; @@ -2386,7 +2382,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); btrfs_free_path(log_path); - iput(dir); + iput(&dir->vfs_inode); return ret; } @@ -2480,30 +2476,28 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, */ if (S_ISREG(mode)) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct inode *inode; + struct btrfs_inode *inode; u64 from; - inode = read_one_inode(root, key.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } - from = ALIGN(i_size_read(inode), + from = ALIGN(i_size_read(&inode->vfs_inode), root->fs_info->sectorsize); drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; - ret = btrfs_drop_extents(wc->trans, root, - BTRFS_I(inode), + ret = btrfs_drop_extents(wc->trans, root, inode, &drop_args); if (!ret) { - inode_sub_bytes(inode, + inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. */ - ret = btrfs_update_inode(wc->trans, - BTRFS_I(inode)); + ret = btrfs_update_inode(wc->trans, inode); } - iput(inode); + iput(&inode->vfs_inode); if (ret) break; } @@ -5485,7 +5479,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, ihold(&curr_inode->vfs_inode); while (true) { - struct inode *vfs_inode; struct btrfs_key key; struct btrfs_key found_key; u64 next_index; @@ -5501,7 +5494,7 @@ again: struct extent_buffer *leaf = path->nodes[0]; struct btrfs_dir_item *di; struct btrfs_key di_key; - struct inode *di_inode; + struct btrfs_inode *di_inode; int log_mode = LOG_INODE_EXISTS; int type; @@ -5528,17 +5521,16 @@ again: goto out; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); break; } ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), - log_mode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); + btrfs_add_delayed_iput(di_inode); if (ret) goto out; if (ctx->log_new_dentries) { @@ -5580,14 +5572,13 @@ again: kfree(dir_elem); btrfs_add_delayed_iput(curr_inode); - curr_inode = NULL; - vfs_inode = btrfs_iget_logging(ino, root); - if (IS_ERR(vfs_inode)) { - ret = PTR_ERR(vfs_inode); + curr_inode = btrfs_iget_logging(ino, root); + if (IS_ERR(curr_inode)) { + ret = PTR_ERR(curr_inode); + curr_inode = NULL; break; } - curr_inode = BTRFS_I(vfs_inode); } out: btrfs_free_path(path); @@ -5665,7 +5656,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_ino_list *ino_elem; - struct inode *inode; + struct btrfs_inode *inode; /* * It's rare to have a lot of conflicting inodes, in practice it is not @@ -5756,12 +5747,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * inode in LOG_INODE_EXISTS mode and rename operations update the log, * so that the log ends up with the new name and without the old name. */ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); return 0; } - btrfs_add_delayed_iput(BTRFS_I(inode)); + btrfs_add_delayed_iput(inode); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) @@ -5797,7 +5788,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, */ while (!list_empty(&ctx->conflict_inodes)) { struct btrfs_ino_list *curr; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; u64 parent; @@ -5833,9 +5824,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * dir index key range logged for the directory. So we * must make sure the deletion is recorded. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_ALL, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; continue; @@ -5851,8 +5841,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * it again because if some other task logged the inode after * that, we can avoid doing it again. */ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); continue; } @@ -5863,8 +5853,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; } @@ -6356,7 +6346,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, list_for_each_entry(item, delayed_ins_list, log_list) { struct btrfs_dir_item *dir_item; - struct inode *di_inode; + struct btrfs_inode *di_inode; struct btrfs_key key; int log_mode = LOG_INODE_EXISTS; @@ -6372,8 +6362,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, break; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); continue; } @@ -6381,12 +6371,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, log_mode = LOG_INODE_ALL; ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); + ret = log_new_dir_dentries(trans, di_inode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + btrfs_add_delayed_iput(di_inode); if (ret) break; @@ -6794,7 +6784,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { struct btrfs_key inode_key; - struct inode *dir_inode; + struct btrfs_inode *dir_inode; inode_key.type = BTRFS_INODE_ITEM_KEY; inode_key.offset = 0; @@ -6843,18 +6833,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, goto out; } - if (!need_log_inode(trans, BTRFS_I(dir_inode))) { - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + if (!need_log_inode(trans, dir_inode)) { + btrfs_add_delayed_iput(dir_inode); continue; } ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), - LOG_INODE_ALL, ctx); + ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, - BTRFS_I(dir_inode), ctx); - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + ret = log_new_dir_dentries(trans, dir_inode, ctx); + btrfs_add_delayed_iput(dir_inode); if (ret) goto out; } @@ -6879,7 +6867,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int slot; struct btrfs_key search_key; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; int ret = 0; @@ -6894,11 +6882,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation >= trans->transid && - need_log_inode(trans, BTRFS_I(inode))) - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (inode->generation >= trans->transid && + need_log_inode(trans, inode)) + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -7476,6 +7463,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, * full log sync. * Also we don't need to worry with renames, since btrfs_rename() marks the log * for full commit when renaming a subvolume. + * + * Must be called before creating the subvolume entry in its parent directory. */ void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans, struct btrfs_inode *dir) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8e6501860001..58e0cac5779d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3268,6 +3268,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) device->bytes_used - dev_extent_len); atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); + + if (list_empty(&device->post_commit_list)) { + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); + } + mutex_unlock(&fs_info->chunk_mutex); } } diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 866607fd3e58..c9ea37fabf65 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -24,7 +24,7 @@ #include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 -#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) +#define ZSTD_BTRFS_MAX_INPUT (1U << ZSTD_BTRFS_MAX_WINDOWLOG) #define ZSTD_BTRFS_DEFAULT_LEVEL 3 #define ZSTD_BTRFS_MAX_LEVEL 15 /* 307s to avoid pathologically clashing with transaction commit */ |