Diffstat (limited to 'fs'): 51 files changed, 4731 insertions, 1099 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9f1b1a88e317..b634c42115ea 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,5 +1,21 @@ # SPDX-License-Identifier: GPL-2.0 +# Subset of W=1 warnings +subdir-ccflags-y += -Wextra -Wunused -Wno-unused-parameter +subdir-ccflags-y += -Wmissing-declarations +subdir-ccflags-y += -Wmissing-format-attribute +subdir-ccflags-y += -Wmissing-prototypes +subdir-ccflags-y += -Wold-style-definition +subdir-ccflags-y += -Wmissing-include-dirs +subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable) +subdir-ccflags-y += $(call cc-option, -Wunused-const-variable) +subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned) +subdir-ccflags-y += $(call cc-option, -Wstringop-truncation) +# The following turn off the warnings enabled by -Wextra +subdir-ccflags-y += -Wno-missing-field-initializers +subdir-ccflags-y += -Wno-sign-compare +subdir-ccflags-y += -Wno-type-limits + obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ @@ -11,7 +27,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o block-group.o discard.o reflink.o + block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ + subpage.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9cadacf3ec27..f47c1528eb9a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1501,7 +1501,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, } /** - * btrfs_check_shared - tell us whether an extent is shared + * Check if an extent is shared or not + * + * @root: root inode belongs to + * @inum: inode number of the inode whose extent we are checking + * @bytenr: logical bytenr of the extent we are checking + * @roots: list of roots this extent is shared among + * @tmp: temporary list used for iteration * * btrfs_check_shared uses the backref walking code but will short * circuit as soon as it finds a root or inode that doesn't match the @@ -2541,13 +2547,6 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); - if (RB_EMPTY_NODE(&upper->rb_node)) { - BUG_ON(!list_empty(&node->upper)); - btrfs_backref_drop_node(cache, node); - node = upper; - node->lowest = 1; - continue; - } /* * Add the node to leaf node list if no other child block * cached. 
@@ -2624,7 +2623,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, /* Only reloc backref cache cares about a specific root */ if (cache->is_reloc) { root = find_reloc_root(cache->fs_info, cur->bytenr); - if (WARN_ON(!root)) + if (!root) return -ENOENT; cur->root = root; } else { diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ff705cc564a9..17abde7f794c 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -296,6 +296,9 @@ static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node) { if (node) { + ASSERT(list_empty(&node->list)); + ASSERT(list_empty(&node->lower)); + ASSERT(node->eb == NULL); cache->nr_nodes--; btrfs_put_root(node->root); kfree(node); @@ -340,11 +343,11 @@ static inline void btrfs_backref_drop_node_buffer( static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, struct btrfs_backref_node *node) { - BUG_ON(!list_empty(&node->upper)); + ASSERT(list_empty(&node->upper)); btrfs_backref_drop_node_buffer(node); - list_del(&node->list); - list_del(&node->lower); + list_del_init(&node->list); + list_del_init(&node->lower); if (!RB_EMPTY_NODE(&node->rb_node)) rb_erase(&node->rb_node, &tree->rb_root); btrfs_backref_free_node(tree, node); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 48ebc106a606..5064be59dac5 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -15,6 +15,7 @@ #include "delalloc-space.h" #include "discard.h" #include "raid56.h" +#include "zoned.h" /* * Return target flags in extended format or 0 if restripe for this chunk_type @@ -724,6 +725,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only struct btrfs_caching_control *caching_ctl = NULL; int ret = 0; + /* Allocator for zoned filesystems does not use the cache at all */ + if (btrfs_is_zoned(fs_info)) + return 0; + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); if (!caching_ctl) return -ENOMEM; @@ -896,6 +901,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&cluster->refill_lock); + btrfs_clear_treelog_bg(block_group); + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -1008,12 +1015,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, WARN_ON(block_group->space_info->total_bytes < block_group->length); WARN_ON(block_group->space_info->bytes_readonly - < block_group->length); + < block_group->length - block_group->zone_unusable); + WARN_ON(block_group->space_info->bytes_zone_unusable + < block_group->zone_unusable); WARN_ON(block_group->space_info->disk_total < block_group->length * factor); } block_group->space_info->total_bytes -= block_group->length; - block_group->space_info->bytes_readonly -= block_group->length; + block_group->space_info->bytes_readonly -= + (block_group->length - block_group->zone_unusable); + block_group->space_info->bytes_zone_unusable -= + block_group->zone_unusable; block_group->space_info->disk_total -= block_group->length * factor; spin_unlock(&block_group->space_info->lock); @@ -1157,7 +1169,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) } num_bytes = cache->length - cache->reserved - cache->pinned - - cache->bytes_super - cache->used; + cache->bytes_super - cache->zone_unusable - cache->used; /* * Data never overcommits, even in mixed mode, so do just the straight @@ -1188,6 +1200,12 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, 
int force) if (!ret) { sinfo->bytes_readonly += num_bytes; + if (btrfs_is_zoned(cache->fs_info)) { + /* Migrate zone_unusable bytes to readonly */ + sinfo->bytes_readonly += cache->zone_unusable; + sinfo->bytes_zone_unusable -= cache->zone_unusable; + cache->zone_unusable = 0; + } cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); } @@ -1262,6 +1280,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; + /* + * Long running balances can keep us blocked here for eternity, so + * simply skip deletion if we're unable to get the mutex. + */ + if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex)) + return; + spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { int trimming; @@ -1281,8 +1306,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); - mutex_lock(&fs_info->delete_unused_bgs_mutex); - /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); @@ -1371,9 +1394,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_space_info_update_bytes_pinned(fs_info, space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -block_group->pinned, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned); block_group->pinned = 0; spin_unlock(&block_group->lock); @@ -1389,8 +1410,12 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)) goto flip_async; - /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(fs_info, DISCARD_SYNC); + /* + * DISCARD can flip during remount. On zoned filesystems, we + * need to reset sequential-required zones. + */ + trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_is_zoned(fs_info); /* Implicit trim during transaction commit. */ if (trimming) @@ -1428,11 +1453,11 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) end_trans: btrfs_end_transaction(trans); next: - mutex_unlock(&fs_info->delete_unused_bgs_mutex); btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); return; flip_async: @@ -1561,8 +1586,11 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } /** - * btrfs_rmap_block - Map a physical disk address to a list of logical addresses + * Map a physical disk address to a list of logical addresses + * + * @fs_info: the filesystem * @chunk_start: logical address of block group + * @bdev: physical device to resolve, can be NULL to indicate any device * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical * @naddrs: length of @logical @@ -1572,9 +1600,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * Used primarily to exclude those portions of a block group that contain super * block copies. 
*/ -EXPORT_FOR_TESTS int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len) + struct block_device *bdev, u64 physical, u64 **logical, + int *naddrs, int *stripe_len) { struct extent_map *em; struct map_lookup *map; @@ -1592,6 +1620,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, map = em->map_lookup; data_stripe_length = em->orig_block_len; io_stripe_size = map->stripe_len; + chunk_start = em->start; /* For RAID5/6 adjust to a full IO stripe length */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -1606,14 +1635,18 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, for (i = 0; i < map->num_stripes; i++) { bool already_inserted = false; u64 stripe_nr; + u64 offset; int j; if (!in_range(physical, map->stripes[i].physical, data_stripe_length)) continue; + if (bdev && map->stripes[i].dev->bdev != bdev) + continue; + stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div64_u64(stripe_nr, map->stripe_len); + stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; @@ -1627,7 +1660,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, * instead of map->stripe_len */ - bytenr = chunk_start + stripe_nr * io_stripe_size; + bytenr = chunk_start + stripe_nr * io_stripe_size + offset; /* Ensure we don't add duplicate addresses */ for (j = 0; j < nr; j++) { @@ -1669,7 +1702,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->start, + ret = btrfs_rmap_block(fs_info, cache->start, NULL, bytenr, &logical, &nr, &stripe_len); if (ret) return ret; @@ -1805,24 +1838,8 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } -static void read_block_group_item(struct btrfs_block_group *cache, - struct btrfs_path *path, - const struct btrfs_key *key) -{ - struct extent_buffer *leaf = path->nodes[0]; - struct btrfs_block_group_item bgi; - int slot = path->slots[0]; - - cache->length = key->offset; - - read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), - sizeof(bgi)); - cache->used = btrfs_stack_block_group_used(&bgi); - cache->flags = btrfs_stack_block_group_flags(&bgi); -} - static int read_one_block_group(struct btrfs_fs_info *info, - struct btrfs_path *path, + struct btrfs_block_group_item *bgi, const struct btrfs_key *key, int need_clear) { @@ -1837,7 +1854,9 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (!cache) return -ENOMEM; - read_block_group_item(cache, path, key); + cache->length = key->offset; + cache->used = btrfs_stack_block_group_used(bgi); + cache->flags = btrfs_stack_block_group_flags(bgi); set_free_space_tree_thresholds(cache); @@ -1864,6 +1883,13 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } + ret = btrfs_load_block_group_zone_info(cache, false); + if (ret) { + btrfs_err(info, "zoned: failed to load zone info of bg %llu", + cache->start); + goto error; + } + /* * We need to exclude the super stripes now so that the space info has * super bytes accounted for, otherwise we'll think we have more space @@ -1877,12 +1903,20 @@ static int read_one_block_group(struct btrfs_fs_info *info, } /* - * Check for two cases, either we are full, and therefore don't need - * to bother with the caching work since we won't find any 
space, or we - * are empty, and we can just add all the space in and be done with it. - * This saves us _a_lot_ of time, particularly in the full case. + * For zoned filesystem, space after the allocation offset is the only + * free space for a block group. So, we don't need any caching work. + * btrfs_calc_zone_unusable() will set the amount of free space and + * zone_unusable space. + * + * For regular filesystem, check for two cases, either we are full, and + * therefore don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all the space + * in and be done with it. This saves us _a_lot_ of time, particularly + * in the full case. */ - if (cache->length == cache->used) { + if (btrfs_is_zoned(info)) { + btrfs_calc_zone_unusable(cache); + } else if (cache->length == cache->used) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); @@ -1901,7 +1935,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, } trace_btrfs_add_block_group(info, cache, 0); btrfs_update_space_info(info, cache->flags, cache->length, - cache->used, cache->bytes_super, &space_info); + cache->used, cache->bytes_super, + cache->zone_unusable, &space_info); cache->space_info = space_info; @@ -1957,7 +1992,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) break; } btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, - 0, &space_info); + 0, 0, &space_info); bg->space_info = space_info; link_block_group(bg); @@ -1996,19 +2031,29 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) need_clear = 1; while (1) { + struct btrfs_block_group_item bgi; + struct extent_buffer *leaf; + int slot; + ret = find_first_block_group(info, path, &key); if (ret > 0) break; if (ret != 0) goto error; - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - ret = read_one_block_group(info, path, &key, need_clear); + leaf = path->nodes[0]; + slot = path->slots[0]; + + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), + sizeof(bgi)); + + btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_release_path(path); + ret = read_one_block_group(info, &bgi, &key, need_clear); if (ret < 0) goto error; key.objectid += key.offset; key.offset = 0; - btrfs_release_path(path); } btrfs_release_path(path); @@ -2140,6 +2185,13 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, cache->cached = BTRFS_CACHE_FINISHED; if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) cache->needs_free_space = 1; + + ret = btrfs_load_block_group_zone_info(cache, true); + if (ret) { + btrfs_put_block_group(cache); + return ret; + } + ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ @@ -2181,7 +2233,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, */ trace_btrfs_add_block_group(fs_info, cache, 1); btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, &cache->space_info); + cache->bytes_super, 0, &cache->space_info); btrfs_update_global_block_rsv(fs_info); link_block_group(cache); @@ -2289,8 +2341,15 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) spin_lock(&cache->lock); if (!--cache->ro) { num_bytes = cache->length - cache->reserved - - cache->pinned - cache->bytes_super - cache->used; + cache->pinned - cache->bytes_super - + cache->zone_unusable - cache->used; sinfo->bytes_readonly -= num_bytes; + if (btrfs_is_zoned(cache->fs_info)) { + /* 
Migrate zone_unusable bytes back */ + cache->zone_unusable = cache->alloc_offset - cache->used; + sinfo->bytes_zone_unusable += cache->zone_unusable; + sinfo->bytes_readonly -= cache->zone_unusable; + } list_del_init(&cache->ro_list); } spin_unlock(&cache->lock); @@ -2564,8 +2623,10 @@ again: if (!path) { path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } } /* @@ -2659,16 +2720,14 @@ again: btrfs_put_block_group(cache); if (drop_reserve) btrfs_delayed_refs_rsv_release(fs_info, 1); - - if (ret) - break; - /* * Avoid blocking other tasks for too long. It might even save * us from writing caches for block groups that are going to be * removed. */ mutex_unlock(&trans->transaction->cache_write_mutex); + if (ret) + goto out; mutex_lock(&trans->transaction->cache_write_mutex); } mutex_unlock(&trans->transaction->cache_write_mutex); @@ -2692,7 +2751,12 @@ again: goto again; } spin_unlock(&cur_trans->dirty_bgs_lock); - } else if (ret < 0) { + } +out: + if (ret < 0) { + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&dirty, &cur_trans->dirty_bgs); + spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_cleanup_dirty_bgs(cur_trans, fs_info); } @@ -2896,10 +2960,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - percpu_counter_add_batch( - &cache->space_info->total_bytes_pinned, - num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(cache->space_info, + num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 8f74a96074f7..29678426247d 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -95,6 +95,8 @@ struct btrfs_block_group { unsigned int iref:1; unsigned int has_caching_ctl:1; unsigned int removed:1; + unsigned int to_copy:1; + unsigned int relocating_repair:1; int disk_cache_state; @@ -181,8 +183,19 @@ struct btrfs_block_group { */ int needs_free_space; + /* Flag indicating this block group is placed on a sequential zone */ + bool seq_zone; + /* Record locked full stripes for RAID5/6 block group */ struct btrfs_full_stripe_locks_tree full_stripe_locks_root; + + /* + * Allocation offset for the block group to implement sequential + * allocation. This is used only on a zoned filesystem. 
+ */ + u64 alloc_offset; + u64 zone_unusable; + u64 meta_write_pointer; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -270,6 +283,9 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache, struct btrfs_caching_control *caching_ctl); +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + struct block_device *bdev, u64 physical, u64 **logical, + int *naddrs, int *stripe_len); static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) { @@ -296,9 +312,4 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache) void btrfs_freeze_block_group(struct btrfs_block_group *cache); void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len); -#endif - #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d9bf53d9ff90..28e202e89660 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -325,7 +325,8 @@ struct btrfs_dio_private { struct inode *inode; u64 logical_offset; u64 disk_bytenr; - u64 bytes; + /* Used for bio::bi_size */ + u32 bytes; /* * References to this structure. There is one reference per in-flight diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5ae3fa0386b7..6d203acfdeb3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -542,13 +542,19 @@ static noinline int add_ra_bio_pages(struct inode *inode, goto next; } - end = last_offset + PAGE_SIZE - 1; /* * at this point, we have a locked page in the page cache * for these bytes in the file. But, we have to make * sure they map to this compressed extent on disk. */ - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + break; + } + + end = last_offset + PAGE_SIZE - 1; lock_extent(tree, last_offset, end); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cc89b63d65a4..d56730a67885 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -221,9 +221,12 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); - - if (ret) + if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); return ret; + } btrfs_mark_buffer_dirty(cow); *cow_ret = cow; @@ -1494,6 +1497,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } +ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); /* * helper function for defrag to decide if two blocks pointed to by a @@ -2821,6 +2825,7 @@ done: btrfs_release_path(p); return ret; } +ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); /* * Like btrfs_search_slot, this looks for a key in the given tree. 
It uses the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4debdbdde2ab..3bc00aed13b2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -298,7 +298,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34) + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED) #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -797,7 +798,7 @@ struct btrfs_fs_info { /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; - struct percpu_counter dio_bytes; + struct percpu_counter ordered_bytes; s32 dirty_metadata_batch; s32 delalloc_batch; @@ -933,6 +934,7 @@ struct btrfs_fs_info { /* Used to reclaim the metadata space in the background. */ struct work_struct async_reclaim_work; struct work_struct async_data_reclaim_work; + struct work_struct preempt_reclaim_work; spinlock_t unused_bgs_lock; struct list_head unused_bgs; @@ -974,6 +976,9 @@ struct btrfs_fs_info { /* Max size to emit ZONE_APPEND write command */ u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; + spinlock_t treelog_bg_lock; + u64 treelog_bg; #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; @@ -1104,7 +1109,7 @@ struct btrfs_root { u32 type; - u64 highest_objectid; + u64 free_objectid; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; @@ -2740,6 +2745,7 @@ enum btrfs_flush_state { ALLOC_CHUNK_FORCE = 8, RUN_DELAYED_IPUTS = 9, COMMIT_TRANS = 10, + FORCE_COMMIT_TRANS = 11, }; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, @@ -3100,15 +3106,14 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - struct btrfs_root *parent_root, - u64 new_dirid); + struct btrfs_root *parent_root); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, unsigned *bits); void btrfs_clear_delalloc_extent(struct inode *inode, @@ -3119,6 +3124,8 @@ void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, unsigned long bio_flags); +bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, + unsigned int size); void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index bacee09b7bfd..56642ca7af10 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -191,12 +191,14 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, } /** - * btrfs_inode_rsv_release - release any excessive reservation. - * @inode - the inode we need to release from. - * @qgroup_free - free or convert qgroup meta. 
- * Unlike normal operation, qgroup meta reservation needs to know if we are - * freeing qgroup reservation or just converting it into per-trans. Normally - * @qgroup_free is true for error handling, and false for normal release. + * Release any excessive reservation + * + * @inode: the inode we need to release from + * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup + * meta reservation needs to know if we are freeing qgroup + * reservation or just converting it into per-trans. Normally + * @qgroup_free is true for error handling, and false for normal + * release. * * This is the same as btrfs_block_rsv_release, except that it handles the * tracepoint for the reservation. @@ -361,7 +363,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) } /** - * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * Release a metadata reservation for an inode + * * @inode: the inode to release the reservation for. * @num_bytes: the number of bytes we are releasing. * @qgroup_free: free qgroup reservation or convert it to per-trans reservation @@ -455,11 +458,13 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, } /** - * btrfs_delalloc_release_space - release data and metadata space for delalloc - * @inode: inode we're releasing space for - * @start: start position of the space already reserved - * @len: the len of the space already reserved - * @release_bytes: the len of the space we consumed or didn't use + * Release data and metadata space for delalloc + * + * @inode: inode we're releasing space for + * @reserved: list of changed/reserved ranges + * @start: start position of the space already reserved + * @len: length of the space already reserved + * @qgroup_free: should qgroup reserved-space also be freed * * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 70c0340d839c..ec0b50b8c5d6 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1154,7 +1154,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) delayed_root = fs_info->delayed_root; curr_node = btrfs_first_delayed_node(delayed_root); - while (curr_node && (!count || (count && nr--))) { + while (curr_node && (!count || nr--)) { ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); if (ret) { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 353cc2994d10..63be7d01a9a3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -69,9 +69,10 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) } /** - * btrfs_delayed_refs_rsv_release - release a ref head's reservation. - * @fs_info - the fs_info for our fs. - * @nr - the number of items to drop. + * Release a ref head's reservation + * + * @fs_info: the filesystem + * @nr: number of items to drop * * This drops the delayed ref head's count from the delayed refs rsv and frees * any excess reservation we had. @@ -114,10 +115,11 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) } /** - * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. - * @fs_info - the fs info for our fs. - * @src - the source block rsv to transfer from. - * @num_bytes - the number of bytes to transfer. 
+ * Transfer bytes to our delayed refs rsv + * + * @fs_info: the filesystem + * @src: source block rsv to transfer from + * @num_bytes: number of bytes to transfer * * This transfers up to the num_bytes amount from the src rsv to the * delayed_refs_rsv. Any extra bytes are returned to the space info. @@ -162,9 +164,10 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, } /** - * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. - * @fs_info - the fs_info for our fs. - * @flush - control how we can flush for this reservation. + * Refill based on our delayed refs usage + * + * @fs_info: the filesystem + * @flush: control how we can flush for this reservation. * * This will refill the delayed block_rsv up to 1 items size worth of space and * will return -ENOSPC if we can't make the reservation. @@ -648,12 +651,12 @@ inserted: */ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing, - struct btrfs_delayed_ref_head *update, - int *old_ref_mod_ret) + struct btrfs_delayed_ref_head *update) { struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; + u64 flags = btrfs_ref_head_to_space_flags(existing); int old_ref_mod; BUG_ON(existing->is_data != update->is_data); @@ -701,8 +704,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, * currently, for refs we just added we know we're a-ok. */ old_ref_mod = existing->total_ref_mod; - if (old_ref_mod_ret) - *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; existing->total_ref_mod += update->ref_mod; @@ -724,6 +725,27 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates += csum_leaves; } } + + /* + * This handles the following conditions: + * + * 1. We had a ref mod of 0 or more and went negative, indicating that + * we may be freeing space, so add our space to the + * total_bytes_pinned counter. + * 2. We were negative and went to 0 or positive, so no longer can say + * that the space would be pinned, decrement our counter from the + * total_bytes_pinned counter. + * 3. We are now at 0 and have ->must_insert_reserved set, which means + * this was a new allocation and then we dropped it, and thus must + * add our space to the total_bytes_pinned counter. 
+ */ + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) + btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); + else if (existing->total_ref_mod >= 0 && old_ref_mod < 0) + btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes); + else if (existing->total_ref_mod == 0 && existing->must_insert_reserved) + btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); + spin_unlock(&existing->lock); } @@ -798,8 +820,7 @@ static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, - int action, int *qrecord_inserted_ret, - int *old_ref_mod, int *new_ref_mod) + int action, int *qrecord_inserted_ret) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; @@ -821,8 +842,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { - update_existing_head_ref(trans, existing, head_ref, - old_ref_mod); + update_existing_head_ref(trans, existing, head_ref); /* * we've updated the existing ref, free the newly * allocated ref @@ -830,14 +850,17 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { - if (old_ref_mod) - *old_ref_mod = 0; + u64 flags = btrfs_ref_head_to_space_flags(head_ref); + if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_updates += btrfs_csum_bytes_to_leaves(trans->fs_info, head_ref->num_bytes); } + if (head_ref->ref_mod < 0) + btrfs_mod_total_bytes_pinned(trans->fs_info, flags, + head_ref->num_bytes); delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); @@ -845,8 +868,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; - if (new_ref_mod) - *new_ref_mod = head_ref->total_ref_mod; return head_ref; } @@ -909,8 +930,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, */ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - struct btrfs_delayed_extent_op *extent_op, - int *old_ref_mod, int *new_ref_mod) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_tree_ref *ref; @@ -977,8 +997,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, * the spin lock */ head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted, - old_ref_mod, new_ref_mod); + action, &qrecord_inserted); ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); @@ -1006,8 +1025,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, */ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - u64 reserved, int *old_ref_mod, - int *new_ref_mod) + u64 reserved) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_data_ref *ref; @@ -1073,8 +1091,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, * the spin lock */ head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted, - old_ref_mod, new_ref_mod); + action, &qrecord_inserted); ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); @@ 
-1117,7 +1134,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, spin_lock(&delayed_refs->lock); add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, - NULL, NULL, NULL); + NULL); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 1c977e6d45dc..e22fba272e4f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -135,6 +135,11 @@ struct btrfs_delayed_data_ref { u64 offset; }; +enum btrfs_delayed_ref_flags { + /* Indicate that we are flushing delayed refs for the commit */ + BTRFS_DELAYED_REFS_FLUSHING, +}; + struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root_cached href_root; @@ -158,12 +163,7 @@ struct btrfs_delayed_ref_root { u64 pending_csums; - /* - * set when the tree is flushing before a transaction commit, - * used by the throttling code to decide if new updates need - * to be run right away - */ - int flushing; + unsigned long flags; u64 run_delayed_start; @@ -326,6 +326,16 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) } } +static inline u64 btrfs_ref_head_to_space_flags( + struct btrfs_delayed_ref_head *head_ref) +{ + if (head_ref->is_data) + return BTRFS_BLOCK_GROUP_DATA; + else if (head_ref->is_system) + return BTRFS_BLOCK_GROUP_SYSTEM; + return BTRFS_BLOCK_GROUP_METADATA; +} + static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head) { if (refcount_dec_and_test(&head->refs)) @@ -334,12 +344,10 @@ static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *hea int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - struct btrfs_delayed_extent_op *extent_op, - int *old_ref_mod, int *new_ref_mod); + struct btrfs_delayed_extent_op *extent_op); int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - u64 reserved, int *old_ref_mod, - int *new_ref_mod); + u64 reserved); int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 324f646d6e5e..3a9c1e046ebe 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -22,6 +22,7 @@ #include "dev-replace.h" #include "sysfs.h" #include "zoned.h" +#include "block-group.h" /* * Device replace overview @@ -459,6 +460,185 @@ static char* btrfs_dev_name(struct btrfs_device *device) return rcu_str_deref(device->name); } +static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, + struct btrfs_device *src_dev) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_block_group *cache; + struct btrfs_trans_handle *trans; + int ret = 0; + u64 chunk_offset; + + /* Do not use "to_copy" on non zoned filesystem for now */ + if (!btrfs_is_zoned(fs_info)) + return 0; + + mutex_lock(&fs_info->chunk_mutex); + + /* Ensure we don't have pending new block group */ + spin_lock(&fs_info->trans_lock); + while (fs_info->running_transaction && + !list_empty(&fs_info->running_transaction->dev_update_list)) { + spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->chunk_mutex); + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + mutex_lock(&fs_info->chunk_mutex); + if (ret == -ENOENT) { + spin_lock(&fs_info->trans_lock); + continue; + } else { + goto unlock; 
+ } + } + + ret = btrfs_commit_transaction(trans); + mutex_lock(&fs_info->chunk_mutex); + if (ret) + goto unlock; + + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto unlock; + } + + path->reada = READA_FORWARD; + path->search_commit_root = 1; + path->skip_locking = 1; + + key.objectid = src_dev->devid; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto free_path; + if (ret > 0) { + if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto free_path; + if (ret > 0) { + ret = 0; + goto free_path; + } + } else { + ret = 0; + } + } + + while (1) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != src_dev->devid) + break; + + if (found_key.type != BTRFS_DEV_EXTENT_KEY) + break; + + if (found_key.offset < key.offset) + break; + + dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + + chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent); + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + if (!cache) + goto skip; + + spin_lock(&cache->lock); + cache->to_copy = 1; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); + +skip: + ret = btrfs_next_item(root, path); + if (ret != 0) { + if (ret > 0) + ret = 0; + break; + } + } + +free_path: + btrfs_free_path(path); +unlock: + mutex_unlock(&fs_info->chunk_mutex); + + return ret; +} + +bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, + struct btrfs_block_group *cache, + u64 physical) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct extent_map *em; + struct map_lookup *map; + u64 chunk_offset = cache->start; + int num_extents, cur_extent; + int i; + + /* Do not use "to_copy" on non zoned filesystem for now */ + if (!btrfs_is_zoned(fs_info)) + return true; + + spin_lock(&cache->lock); + if (cache->removed) { + spin_unlock(&cache->lock); + return true; + } + spin_unlock(&cache->lock); + + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + ASSERT(!IS_ERR(em)); + map = em->map_lookup; + + num_extents = cur_extent = 0; + for (i = 0; i < map->num_stripes; i++) { + /* We have more device extent to copy */ + if (srcdev != map->stripes[i].dev) + continue; + + num_extents++; + if (physical == map->stripes[i].physical) + cur_extent = i; + } + + free_extent_map(em); + + if (num_extents > 1 && cur_extent < num_extents - 1) { + /* + * Has more stripes on this device. Keep this block group + * readonly until we finish all the stripes. 
+ */ + return false; + } + + /* Last stripe on this device */ + spin_lock(&cache->lock); + cache->to_copy = 0; + spin_unlock(&cache->lock); + + return true; +} + static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, int read_src) @@ -500,6 +680,10 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (ret) return ret; + ret = mark_block_group_to_copy(fs_info, src_device); + if (ret) + return ret; + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: @@ -715,7 +899,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false); + ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 60b70dacc299..3911049a5f23 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -18,5 +18,8 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, + struct btrfs_block_group *cache, + u64 physical); #endif diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 2b8383d41144..306ff20af70f 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -185,10 +185,12 @@ static struct btrfs_block_group *find_next_block_group( } /** - * peek_discard_list - wrap find_next_block_group() - * @discard_ctl: discard control + * Wrap find_next_block_group() + * + * @discard_ctl: discard control * @discard_state: the discard_state of the block_group after state management * @discard_index: the discard_index of the block_group after state management + * @now: time when discard was invoked, in ns * * This wraps find_next_block_group() and sets the block_group to be in use. * discard_state's control flow is managed here. Variables related to diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07a2b4f69b10..41b718cfea40 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -459,6 +459,12 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec return 0; found_start = btrfs_header_bytenr(eb); + + if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { + WARN_ON(found_start != 0); + return 0; + } + /* * Please do not consolidate these warnings into a single if. * It is useful to know what went wrong. @@ -591,6 +597,59 @@ out: return ret; } +static int validate_subpage_buffer(struct page *page, u64 start, u64 end, + int mirror) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct extent_buffer *eb; + bool reads_done; + int ret = 0; + + /* + * We don't allow bio merge for subpage metadata read, so we should + * only get one eb for each endio hook. + */ + ASSERT(end == start + fs_info->nodesize - 1); + ASSERT(PagePrivate(page)); + + eb = find_extent_buffer(fs_info, start); + /* + * When we are reading one tree block, eb must have been inserted into + * the radix tree. If not, something is wrong. 
+ */ + ASSERT(eb); + + reads_done = atomic_dec_and_test(&eb->io_pages); + /* Subpage read must finish in page read */ + ASSERT(reads_done); + + eb->read_mirror = mirror; + if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { + ret = -EIO; + goto err; + } + ret = validate_extent_buffer(eb); + if (ret < 0) + goto err; + + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) + btree_readahead_hook(eb, ret); + + set_extent_buffer_uptodate(eb); + + free_extent_buffer(eb); + return ret; +err: + /* + * end_bio_extent_readpage decrements io_pages in case of error, + * make sure it has something to decrement. + */ + atomic_inc(&eb->io_pages); + clear_extent_buffer_uptodate(eb); + free_extent_buffer(eb); + return ret; +} + int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, struct page *page, u64 start, u64 end, int mirror) @@ -600,6 +659,10 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, int reads_done; ASSERT(page->private); + + if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + return validate_subpage_buffer(page, start, end, mirror); + eb = (struct extent_buffer *)page->private; /* @@ -646,7 +709,7 @@ static void end_workqueue_bio(struct bio *bio) fs_info = end_io_wq->info; end_io_wq->status = bio->bi_status; - if (bio_op(bio) == REQ_OP_WRITE) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) wq = fs_info->endio_meta_write_workers; else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) @@ -808,6 +871,8 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, static int check_async_write(struct btrfs_fs_info *fs_info, struct btrfs_inode *bi) { + if (btrfs_is_zoned(fs_info)) + return 0; if (atomic_read(&bi->sync_writers)) return 0; if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) @@ -822,7 +887,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int async = check_async_write(fs_info, BTRFS_I(inode)); blk_status_t ret; - if (bio_op(bio) != REQ_OP_WRITE) { + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -1016,7 +1081,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->orphan_cleanup_state = 0; root->last_trans = 0; - root->highest_objectid = 0; + root->free_objectid = 0; root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; @@ -1189,7 +1254,6 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct extent_buffer *leaf; root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS); if (!root) @@ -1199,6 +1263,14 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + return root; +} + +int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct extent_buffer *leaf; + /* * DON'T set SHAREABLE bit for log trees. 
* @@ -1211,16 +1283,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); - if (IS_ERR(leaf)) { - btrfs_put_root(root); - return ERR_CAST(leaf); - } + if (IS_ERR(leaf)) + return PTR_ERR(leaf); root->node = leaf; btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); - return root; + + return 0; } int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, @@ -1231,6 +1302,16 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); + + if (!btrfs_is_zoned(fs_info)) { + int ret = btrfs_alloc_log_tree_node(trans, log_root); + + if (ret) { + btrfs_put_root(log_root); + return ret; + } + } + WARN_ON(fs_info->log_root_tree); fs_info->log_root_tree = log_root; return 0; @@ -1242,11 +1323,18 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log_root; struct btrfs_inode_item *inode_item; + int ret; log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); + ret = btrfs_alloc_log_tree_node(trans, log_root); + if (ret) { + btrfs_put_root(log_root); + return ret; + } + log_root->last_trans = trans->transid; log_root->root_key.offset = root->root_key.objectid; @@ -1367,14 +1455,13 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) } mutex_lock(&root->objectid_mutex); - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); + ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); goto fail; } - ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); mutex_unlock(&root->objectid_mutex); @@ -1470,7 +1557,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); - percpu_counter_destroy(&fs_info->dio_bytes); + percpu_counter_destroy(&fs_info->ordered_bytes); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); @@ -2427,13 +2514,21 @@ static int validate_super(struct btrfs_fs_info *fs_info, btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } - /* Only PAGE SIZE is supported yet */ - if (sectorsize != PAGE_SIZE) { + + /* + * For 4K page size, we only support 4K sector size. + * For 64K page size, we support read-write for 64K sector size, and + * read-only for 4K sector size. 
+ */ + if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || + (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && + sectorsize != SZ_64K))) { btrfs_err(fs_info, - "sectorsize %llu not supported yet, only support %lu", + "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); ret = -EINVAL; } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid nodesize %llu", nodesize); @@ -2646,14 +2741,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) * No need to hold btrfs_root::objectid_mutex since the fs * hasn't been fully initialised and we are the only user */ - ret = btrfs_find_highest_objectid(tree_root, - &tree_root->highest_objectid); + ret = btrfs_init_root_free_objectid(tree_root); if (ret < 0) { handle_error = true; continue; } - ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); ret = btrfs_read_roots(fs_info); if (ret < 0) { @@ -2695,11 +2789,13 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); + spin_lock_init(&fs_info->treelog_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); + mutex_init(&fs_info->zoned_meta_io_lock); seqlock_init(&fs_info->profiles_lock); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -2804,7 +2900,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); + ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL); if (ret) return ret; @@ -3140,8 +3236,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); - fs_info->zoned = (features & BTRFS_FEATURE_INCOMPAT_ZONED); - /* * flag our filesystem as having big metadata blocks if * they are bigger than the page size @@ -3194,6 +3288,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } + /* For 4K sector size support, it's only read-only */ + if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) { + if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) { + btrfs_err(fs_info, + "subpage sectorsize %u only supported read-only for page size %lu", + sectorsize, PAGE_SIZE); + err = -EINVAL; + goto fail_alloc; + } + } + ret = btrfs_init_workqueues(fs_info, fs_devices); if (ret) { err = ret; @@ -3261,6 +3366,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_tree_roots; /* + * Get zone type information of zoned block devices. This will also + * handle emulation of a zoned filesystem if a regular device has the + * zoned incompat feature flag set. + */ + ret = btrfs_get_dev_zone_info_all_devices(fs_info); + if (ret) { + btrfs_err(fs_info, + "zoned: failed to read device zone info: %d", + ret); + goto fail_block_groups; + } + + /* * If we have a uuid root and we're not being told to rescan we need to * check the generation here so we can set the * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. 
Otherwise we could commit the @@ -4114,6 +4232,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); + cancel_work_sync(&fs_info->preempt_reclaim_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4166,9 +4285,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) percpu_counter_sum(&fs_info->delalloc_bytes)); } - if (percpu_counter_sum(&fs_info->dio_bytes)) + if (percpu_counter_sum(&fs_info->ordered_bytes)) btrfs_info(fs_info, "at unmount dio bytes count %lld", - percpu_counter_sum(&fs_info->dio_bytes)); + percpu_counter_sum(&fs_info->ordered_bytes)); btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); @@ -4689,6 +4808,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); + btrfs_free_redirty_list(cur_trans); + cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } @@ -4746,7 +4867,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_init_root_free_objectid(struct btrfs_root *root) { struct btrfs_path *path; int ret; @@ -4770,10 +4891,10 @@ int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) slot = path->slots[0] - 1; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); - *objectid = max_t(u64, found_key.objectid, - BTRFS_FIRST_FREE_OBJECTID - 1); + root->free_objectid = max_t(u64, found_key.objectid + 1, + BTRFS_FIRST_FREE_OBJECTID); } else { - *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; } ret = 0; error: @@ -4781,12 +4902,12 @@ error: return ret; } -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) { int ret; mutex_lock(&root->objectid_mutex); - if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { btrfs_warn(root->fs_info, "the objectid of root %llu reaches its highest value", root->root_key.objectid); @@ -4794,7 +4915,7 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) goto out; } - *objectid = ++root->highest_objectid; + *objectid = root->free_objectid++; ret = 0; out: mutex_unlock(&root->objectid_mutex); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index e45057c0c016..0e7e9526b6a8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -120,6 +120,8 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); +int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, @@ -133,8 +135,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); -int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid); +int 
btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_init_root_free_objectid(struct btrfs_root *root); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0c335dae5af7..78ad31a59e59 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -34,6 +34,8 @@ #include "block-group.h" #include "discard.h" #include "rcu-string.h" +#include "zoned.h" +#include "dev-replace.h" #undef SCRAMBLE_DELAYED_REFS @@ -82,41 +84,6 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache) EXTENT_UPTODATE); } -static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) -{ - if (ref->type == BTRFS_REF_METADATA) { - if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) - return BTRFS_BLOCK_GROUP_SYSTEM; - else - return BTRFS_BLOCK_GROUP_METADATA; - } - return BTRFS_BLOCK_GROUP_DATA; -} - -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref) -{ - struct btrfs_space_info *space_info; - u64 flags = generic_ref_to_space_flags(ref); - - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, - BTRFS_TOTAL_BYTES_PINNED_BATCH); -} - -static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref) -{ - struct btrfs_space_info *space_info; - u64 flags = generic_ref_to_space_flags(ref); - - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, - BTRFS_TOTAL_BYTES_PINNED_BATCH); -} - /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { @@ -1299,6 +1266,46 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, return ret; } +static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes) +{ + struct btrfs_device *dev = stripe->dev; + struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + u64 phys = stripe->physical; + u64 len = stripe->length; + u64 discarded = 0; + int ret = 0; + + /* Zone reset on a zoned filesystem */ + if (btrfs_can_zone_reset(dev, phys, len)) { + u64 src_disc; + + ret = btrfs_reset_device_zone(dev, phys, len, &discarded); + if (ret) + goto out; + + if (!btrfs_dev_replace_is_ongoing(dev_replace) || + dev != dev_replace->srcdev) + goto out; + + src_disc = discarded; + + /* Send to replace target as well */ + ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, + &discarded); + discarded += src_disc; + } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) { + ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); + } else { + ret = 0; + *bytes = 0; + } + +out: + *bytes = discarded; + return ret; +} + int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes) { @@ -1333,20 +1340,13 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) { u64 bytes; - struct request_queue *req_q; if (!stripe->dev->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } - req_q = bdev_get_queue(stripe->dev->bdev); - if (!blk_queue_discard(req_q)) - continue; - ret = btrfs_issue_discard(stripe->dev->bdev, - stripe->physical, - stripe->length, - &bytes); + ret = do_discard_extent(stripe, &bytes); if (!ret) 
{ discarded_bytes += bytes; } else if (ret != -EOPNOTSUPP) { @@ -1388,7 +1388,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - int old_ref_mod, new_ref_mod; int ret; ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && @@ -1397,17 +1396,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) - ret = btrfs_add_delayed_tree_ref(trans, generic_ref, - NULL, &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); else - ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0); btrfs_ref_tree_mod(fs_info, generic_ref); - if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) - sub_pinned_bytes(fs_info, generic_ref); - return ret; } @@ -1795,34 +1789,28 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, { int nr_items = 1; /* Dropping this ref head update. */ - if (head->total_ref_mod < 0) { - struct btrfs_space_info *space_info; - u64 flags; + /* + * We had csum deletions accounted for in our delayed refs rsv, we need + * to drop the csum leaves for this update from our delayed_refs_rsv. + */ + if (head->total_ref_mod < 0 && head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + } - if (head->is_data) - flags = BTRFS_BLOCK_GROUP_DATA; - else if (head->is_system) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -head->num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + /* + * We were dropping refs, or had a new ref and dropped it, and thus must + * adjust down our total_bytes_pinned, the space may or may not have + * been pinned and so is accounted for properly in the pinned space by + * now. + */ + if (head->total_ref_mod < 0 || + (head->total_ref_mod == 0 && head->must_insert_reserved)) { + u64 flags = btrfs_ref_head_to_space_flags(head); - /* - * We had csum deletions accounted for in our delayed refs rsv, - * we need to drop the csum leaves for this update from our - * delayed_refs_rsv. 
- */ - if (head->is_data) { - spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= head->num_bytes; - spin_unlock(&delayed_refs->lock); - nr_items += btrfs_csum_bytes_to_leaves(fs_info, - head->num_bytes); - } + btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes); } btrfs_delayed_refs_rsv_release(fs_info, nr_items); @@ -2160,7 +2148,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; if (count == 0) - count = atomic_read(&delayed_refs->num_entries) * 2; + count = delayed_refs->num_heads_ready; again: #ifdef SCRAMBLE_DELAYED_REFS @@ -2572,8 +2560,7 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, - num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; @@ -2784,11 +2771,14 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, cache->pinned -= len; btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); space_info->max_extent_size = 0; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(space_info, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; + } else if (btrfs_is_zoned(fs_info)) { + /* Need reset before reusing in a zoned block group */ + space_info->bytes_zone_unusable += len; + readonly = true; } spin_unlock(&cache->lock); if (!readonly && return_free_space && @@ -3318,7 +3308,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ref generic_ref = { 0 }; - int pin = 1; int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, @@ -3327,13 +3316,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, root->root_key.objectid); if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - int old_ref_mod, new_ref_mod; - btrfs_ref_tree_mod(fs_info, &generic_ref); - ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); BUG_ON(ret); /* -ENOMEM */ - pin = old_ref_mod >= 0 && new_ref_mod < 0; } if (last_ref && btrfs_header_generation(buf) == trans->transid) { @@ -3341,11 +3326,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); - if (!ret) + if (!ret) { + btrfs_redirty_list_add(trans->transaction, buf); goto out; + } } - pin = 0; cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { @@ -3354,6 +3340,13 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, goto out; } + if (btrfs_is_zoned(fs_info)) { + btrfs_redirty_list_add(trans->transaction, buf); + pin_down_extent(trans, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); + goto out; + } + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); @@ -3362,9 +3355,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); } out: - if (pin) - add_pinned_bytes(fs_info, &generic_ref); - if (last_ref) { /* * Deleting 
the buffer, clear the corrupt flag since it doesn't @@ -3378,7 +3368,6 @@ out: int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) @@ -3394,14 +3383,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); - old_ref_mod = new_ref_mod = 0; ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { - ret = btrfs_add_delayed_tree_ref(trans, ref, NULL, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); } else { - ret = btrfs_add_delayed_data_ref(trans, ref, 0, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_data_ref(trans, ref, 0); } if (!((ref->type == BTRFS_REF_METADATA && @@ -3410,9 +3396,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) btrfs_ref_tree_mod(fs_info, ref); - if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) - add_pinned_bytes(fs_info, ref); - return ret; } @@ -3489,6 +3472,7 @@ btrfs_release_block_group(struct btrfs_block_group *cache, enum btrfs_extent_allocation_policy { BTRFS_EXTENT_ALLOC_CLUSTERED, + BTRFS_EXTENT_ALLOC_ZONED, }; /* @@ -3513,6 +3497,9 @@ struct find_free_extent_ctl { bool have_caching_bg; bool orig_have_caching_bg; + /* Allocation is called for tree-log */ + bool for_treelog; + /* RAID index, converted from flags */ int index; @@ -3741,6 +3728,118 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, return find_free_extent_unclustered(block_group, ffe_ctl); } +/* + * Tree-log block group locking + * ============================ + * + * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which + * indicates the starting address of a block group, which is reserved only + * for tree-log metadata. + * + * Lock nesting + * ============ + * + * space_info::lock + * block_group::lock + * fs_info::treelog_bg_lock + */ + +/* + * Simple allocator for sequential-only block group. It only allows sequential + * allocation. No need to play with trees. This function also reserves the + * bytes as in btrfs_add_reserved_bytes. + */ +static int do_allocation_zoned(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *space_info = block_group->space_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + u64 start = block_group->start; + u64 num_bytes = ffe_ctl->num_bytes; + u64 avail; + u64 bytenr = block_group->start; + u64 log_bytenr; + int ret = 0; + bool skip; + + ASSERT(btrfs_is_zoned(block_group->fs_info)); + + /* + * Do not allow non-tree-log blocks in the dedicated tree-log block + * group, and vice versa. 
+ */ + spin_lock(&fs_info->treelog_bg_lock); + log_bytenr = fs_info->treelog_bg; + skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || + (!ffe_ctl->for_treelog && bytenr == log_bytenr)); + spin_unlock(&fs_info->treelog_bg_lock); + if (skip) + return 1; + + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + spin_lock(&fs_info->treelog_bg_lock); + + ASSERT(!ffe_ctl->for_treelog || + block_group->start == fs_info->treelog_bg || + fs_info->treelog_bg == 0); + + if (block_group->ro) { + ret = 1; + goto out; + } + + /* + * Do not allow currently using block group to be tree-log dedicated + * block group. + */ + if (ffe_ctl->for_treelog && !fs_info->treelog_bg && + (block_group->used || block_group->reserved)) { + ret = 1; + goto out; + } + + avail = block_group->length - block_group->alloc_offset; + if (avail < num_bytes) { + if (ffe_ctl->max_extent_size < avail) { + /* + * With sequential allocator, free space is always + * contiguous + */ + ffe_ctl->max_extent_size = avail; + ffe_ctl->total_free_space = avail; + } + ret = 1; + goto out; + } + + if (ffe_ctl->for_treelog && !fs_info->treelog_bg) + fs_info->treelog_bg = block_group->start; + + ffe_ctl->found_offset = start + block_group->alloc_offset; + block_group->alloc_offset += num_bytes; + spin_lock(&ctl->tree_lock); + ctl->free_space -= num_bytes; + spin_unlock(&ctl->tree_lock); + + /* + * We do not check if found_offset is aligned to stripesize. The + * address is anyway rewritten when using zone append writing. + */ + + ffe_ctl->search_start = ffe_ctl->found_offset; + +out: + if (ret && ffe_ctl->for_treelog) + fs_info->treelog_bg = 0; + spin_unlock(&fs_info->treelog_bg_lock); + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + return ret; +} + static int do_allocation(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) @@ -3748,6 +3847,8 @@ static int do_allocation(struct btrfs_block_group *block_group, switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: return do_allocation_clustered(block_group, ffe_ctl, bg_ret); + case BTRFS_EXTENT_ALLOC_ZONED: + return do_allocation_zoned(block_group, ffe_ctl, bg_ret); default: BUG(); } @@ -3762,6 +3863,9 @@ static void release_block_group(struct btrfs_block_group *block_group, ffe_ctl->retry_clustered = false; ffe_ctl->retry_unclustered = false; break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + break; default: BUG(); } @@ -3790,6 +3894,9 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, case BTRFS_EXTENT_ALLOC_CLUSTERED: found_extent_clustered(ffe_ctl, ins); break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + break; default: BUG(); } @@ -3805,6 +3912,9 @@ static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) */ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; return 0; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Give up here */ + return -ENOSPC; default: BUG(); } @@ -3973,6 +4083,14 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, case BTRFS_EXTENT_ALLOC_CLUSTERED: return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); + case BTRFS_EXTENT_ALLOC_ZONED: + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } + return 0; default: BUG(); } @@ -4015,6 +4133,7 @@ static noinline int find_free_extent(struct btrfs_root *root, struct find_free_extent_ctl ffe_ctl = {0}; struct 
btrfs_space_info *space_info; bool full_search = false; + bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); WARN_ON(num_bytes < fs_info->sectorsize); @@ -4028,6 +4147,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl.orig_have_caching_bg = false; ffe_ctl.found_offset = 0; ffe_ctl.hint_byte = hint_byte_orig; + ffe_ctl.for_treelog = for_treelog; ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; /* For clustered allocation */ @@ -4036,6 +4156,9 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl.last_ptr = NULL; ffe_ctl.use_cluster = true; + if (btrfs_is_zoned(fs_info)) + ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED; + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -4099,8 +4222,11 @@ search: struct btrfs_block_group *bg_ret; /* If the block group is read-only, we can skip it entirely. */ - if (unlikely(block_group->ro)) + if (unlikely(block_group->ro)) { + if (for_treelog) + btrfs_clear_treelog_bg(block_group); continue; + } btrfs_grab_block_group(block_group, delalloc); ffe_ctl.search_start = block_group->start; @@ -4178,20 +4304,21 @@ have_block_group: /* move on to the next group */ if (ffe_ctl.search_start + num_bytes > block_group->start + block_group->length) { - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - num_bytes); + btrfs_add_free_space_unused(block_group, + ffe_ctl.found_offset, num_bytes); goto loop; } if (ffe_ctl.found_offset < ffe_ctl.search_start) - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - ffe_ctl.search_start - ffe_ctl.found_offset); + btrfs_add_free_space_unused(block_group, + ffe_ctl.found_offset, + ffe_ctl.search_start - ffe_ctl.found_offset); ret = btrfs_add_reserved_bytes(block_group, ram_bytes, num_bytes, delalloc); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - num_bytes); + btrfs_add_free_space_unused(block_group, + ffe_ctl.found_offset, num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); @@ -4285,6 +4412,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; + bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); flags = get_alloc_profile_by_root(root, is_data); again: @@ -4308,8 +4436,8 @@ again: sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, - "allocation failed flags %llu, wanted %llu", - flags, num_bytes); + "allocation failed flags %llu, wanted %llu tree-log %d", + flags, num_bytes, for_treelog); if (sinfo) btrfs_dump_space_info(fs_info, sinfo, num_bytes, 1); @@ -4491,7 +4619,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); @@ -4528,7 +4655,6 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins) { struct btrfs_ref generic_ref = { 0 }; - int ret; BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); @@ -4536,9 +4662,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ins->objectid, ins->offset, 0); btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); btrfs_ref_tree_mod(root->fs_info, &generic_ref); - ret = btrfs_add_delayed_data_ref(trans, &generic_ref, - ram_bytes, NULL, NULL); - return ret; + + return 
btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); } /* @@ -4620,6 +4745,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); + clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); set_extent_buffer_uptodate(buf); @@ -4730,8 +4856,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, generic_ref.real_root = root->root_key.objectid; btrfs_init_tree_ref(&generic_ref, level, root_objectid); btrfs_ref_tree_mod(fs_info, &generic_ref); - ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, - extent_op, NULL, NULL); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); if (ret) goto out_free_delayed; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c9cee458e001..4dfb3ead1175 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -24,6 +24,9 @@ #include "rcu-string.h" #include "backref.h" #include "disk-io.h" +#include "subpage.h" +#include "zoned.h" +#include "block-group.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -389,16 +392,16 @@ do_insert: } /** - * __etree_search - searche @tree for an entry that contains @offset. Such - * entry would have entry->start <= offset && entry->end >= offset. + * Search @tree for an entry that contains @offset. Such entry would have + * entry->start <= offset && entry->end >= offset. * - * @tree - the tree to search - * @offset - offset that should fall within an entry in @tree - * @next_ret - pointer to the first entry whose range ends after @offset - * @prev - pointer to the first entry whose range begins before @offset - * @p_ret - pointer where new node should be anchored (used when inserting an - * entry in the tree) - * @parent_ret - points to entry which would have been the parent of the entry, + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @next_ret: pointer to the first entry whose range ends after @offset + * @prev_ret: pointer to the first entry whose range begins before @offset + * @p_ret: pointer where new node should be anchored (used when inserting an + * entry in the tree) + * @parent_ret: points to entry which would have been the parent of the entry, * containing @offset * * This function returns a pointer to the entry that contains @offset byte @@ -1588,12 +1591,13 @@ out: } /** - * find_contiguous_extent_bit: find a contiguous area of bits - * @tree - io tree to check - * @start - offset to start the search from - * @start_ret - the first offset we found with the bits set - * @end_ret - the final contiguous range of the bits that were set - * @bits - bits to look for + * Find a contiguous area of bits + * + * @tree: io tree to check + * @start: offset to start the search from + * @start_ret: the first offset we found with the bits set + * @end_ret: the final contiguous range of the bits that were set + * @bits: bits to look for * * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges * to set bits appropriately, and then merge them again. During this time it @@ -1625,14 +1629,14 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, } /** - * find_first_clear_extent_bit - find the first range that has @bits not set. - * This range could start before @start. + * Find the first range that has @bits not set. This range could start before + * @start. 
* - * @tree - the tree to search - * @start - the offset at/after which the found extent should start - * @start_ret - records the beginning of the range - * @end_ret - records the end of the range (inclusive) - * @bits - the set of bits which must be unset + * @tree: the tree to search + * @start: offset at/after which the found extent should start + * @start_ret: records the beginning of the range + * @end_ret: records the end of the range (inclusive) + * @bits: the set of bits which must be unset * * Since unallocated range is also considered one which doesn't have the bits * set it's possible that @end_ret contains -1, this happens in case the range @@ -1975,10 +1979,10 @@ static int __process_pages_contig(struct address_space *mapping, pages_processed++; continue; } - if (page_ops & PAGE_CLEAR_DIRTY) + if (page_ops & PAGE_START_WRITEBACK) { clear_page_dirty_for_io(pages[i]); - if (page_ops & PAGE_SET_WRITEBACK) set_page_writeback(pages[i]); + } if (page_ops & PAGE_SET_ERROR) SetPageError(pages[i]); if (page_ops & PAGE_END_WRITEBACK) @@ -2256,6 +2260,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); + if (btrfs_is_zoned(fs_info)) + return btrfs_repair_one_zone(fs_info, logical); + bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; map_length = length; @@ -2732,6 +2739,7 @@ static void end_bio_extent_writepage(struct bio *bio) u64 start; u64 end; struct bvec_iter_all iter_all; + bool first_bvec = true; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -2758,6 +2766,11 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; + if (first_bvec) { + btrfs_record_physical_zoned(inode, start, bio); + first_bvec = false; + } + end_extent_writepage(page, error, start, end); end_page_writeback(page); } @@ -2775,7 +2788,7 @@ struct processed_extent { struct btrfs_inode *inode; /* Start of the range in @inode */ u64 start; - /* End of the range in in @inode */ + /* End of the range in @inode */ u64 end; bool uptodate; }; @@ -2838,15 +2851,38 @@ update: processed->uptodate = uptodate; } -static void endio_readpage_update_page_status(struct page *page, bool uptodate) +static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) { + ASSERT(PageLocked(page)); + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page)); + btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); +} + +static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); + if (uptodate) { - SetPageUptodate(page); + btrfs_page_set_uptodate(fs_info, page, start, len); } else { - ClearPageUptodate(page); - SetPageError(page); + btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_page_set_error(fs_info, page, start, len); } - unlock_page(page); + + if (fs_info->sectorsize == PAGE_SIZE) + unlock_page(page); + else if (is_data_inode(page->mapping->host)) + /* + * For subpage data, unlock the page if we're the last reader. + * For subpage metadata, page lock is not utilized for read. 
+ */ + btrfs_subpage_end_reader(fs_info, page, start, len); } /* @@ -2983,7 +3019,7 @@ readpage_ok: bio_offset += len; /* Update page status and unlock */ - endio_readpage_update_page_status(page, uptodate); + end_page_read(page, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); } @@ -3058,14 +3094,67 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) return bio; } +/** + * Attempt to add a page to bio + * + * @bio: destination bio + * @page: page to add to the bio + * @disk_bytenr: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one + * @pg_offset: starting offset in the page + * @size: portion of page that we want to write + * @prev_bio_flags: flags of previous bio to see if we can merge the current one + * @bio_flags: flags of the current bio to see if we can merge them + * @return: true if page was added, false otherwise + * + * Attempt to add a page to bio considering stripe alignment etc. + * + * Return true if successfully page added. Otherwise, return false. + */ +static bool btrfs_bio_add_page(struct bio *bio, struct page *page, + u64 disk_bytenr, unsigned int size, + unsigned int pg_offset, + unsigned long prev_bio_flags, + unsigned long bio_flags) +{ + const sector_t sector = disk_bytenr >> SECTOR_SHIFT; + bool contig; + int ret; + + if (prev_bio_flags != bio_flags) + return false; + + if (prev_bio_flags & EXTENT_BIO_COMPRESSED) + contig = bio->bi_iter.bi_sector == sector; + else + contig = bio_end_sector(bio) == sector; + if (!contig) + return false; + + if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags)) + return false; + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct page *first_page = bio_first_bvec_all(bio)->bv_page; + + if (!btrfs_bio_fits_in_ordered_extent(first_page, bio, size)) + return false; + ret = bio_add_zone_append_page(bio, page, size, pg_offset); + } else { + ret = bio_add_page(bio, page, size, pg_offset); + } + + return ret == size; +} + /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting * @page: page to add to the bio + * @disk_bytenr: logical bytenr where the write will be + * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @size: portion of page that we want to write - * @offset: starting offset in the page * @bio_ret: must be valid pointer, newly allocated bio will be stored there * @end_io_func: end_io callback for new bio * @mirror_num: desired mirror to read/write @@ -3074,7 +3163,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) */ static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, - struct page *page, u64 offset, + struct page *page, u64 disk_bytenr, size_t size, unsigned long pg_offset, struct bio **bio_ret, bio_end_io_t end_io_func, @@ -3086,27 +3175,17 @@ static int submit_extent_page(unsigned int opf, int ret = 0; struct bio *bio; size_t io_size = min_t(size_t, size, PAGE_SIZE); - sector_t sector = offset >> 9; - struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &inode->io_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; ASSERT(bio_ret); if (*bio_ret) { - bool contig; - bool can_merge = true; - bio = *bio_ret; - if (prev_bio_flags & EXTENT_BIO_COMPRESSED) - contig = 
bio->bi_iter.bi_sector == sector; - else - contig = bio_end_sector(bio) == sector; - - if (btrfs_bio_fits_in_stripe(page, io_size, bio, bio_flags)) - can_merge = false; - - if (prev_bio_flags != bio_flags || !contig || !can_merge || - force_bio_submit || - bio_add_page(bio, page, io_size, pg_offset) < io_size) { + if (force_bio_submit || + !btrfs_bio_add_page(bio, page, disk_bytenr, io_size, + pg_offset, prev_bio_flags, bio_flags)) { ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; @@ -3120,7 +3199,7 @@ static int submit_extent_page(unsigned int opf, } } - bio = btrfs_bio_alloc(offset); + bio = btrfs_bio_alloc(disk_bytenr); bio_add_page(bio, page, io_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; @@ -3129,20 +3208,39 @@ static int submit_extent_page(unsigned int opf, if (wbc) { struct block_device *bdev; - bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev; + bdev = fs_info->fs_devices->latest_bdev; bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); wbc_account_cgroup_owner(wbc, page, io_size); } + if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct extent_map *em; + struct map_lookup *map; + + em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* We only support single profile for now */ + ASSERT(map->num_stripes == 1); + btrfs_io_bio(bio)->device = map->stripes[0].dev; + + free_extent_map(em); + } *bio_ret = bio; return ret; } -static void attach_extent_buffer_page(struct extent_buffer *eb, - struct page *page) +static int attach_extent_buffer_page(struct extent_buffer *eb, + struct page *page, + struct btrfs_subpage *prealloc) { + struct btrfs_fs_info *fs_info = eb->fs_info; + int ret = 0; + /* * If the page is mapped to btree inode, we should hold the private * lock to prevent race. 
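The extent_io.c hunks above replace whole-page state calls (SetPageUptodate(), ClearPageUptodate(), unlock_page()) with per-range helpers such as btrfs_page_set_uptodate() and btrfs_subpage_start_reader(), so that a 64K page can carry several 4K sectors whose read state is tracked independently. Below is a minimal userspace sketch of that per-sector bookkeeping, assuming a 64K page and 4K sectorsize; struct subpage_state and its helpers are invented names for illustration, not the kernel's struct btrfs_subpage API.

/*
 * Minimal userspace sketch of per-sector "uptodate" tracking inside one
 * page, in the spirit of the subpage helpers used above.  The struct and
 * function names are illustrative only, not the kernel's API.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ		65536u	/* assume a 64K page */
#define SECTOR_SZ	 4096u	/* assume a 4K sectorsize */
#define SECTORS_PER_PAGE (PAGE_SZ / SECTOR_SZ)

struct subpage_state {
	uint16_t uptodate_bitmap;	/* one bit per 4K sector, 16 bits total */
};

/* Mark [start, start + len) inside the page as uptodate. */
static void subpage_set_uptodate(struct subpage_state *sp,
				 unsigned int start, unsigned int len)
{
	unsigned int first = start / SECTOR_SZ;
	unsigned int nr = len / SECTOR_SZ;

	assert(start % SECTOR_SZ == 0 && len % SECTOR_SZ == 0);
	assert(start + len <= PAGE_SZ);
	sp->uptodate_bitmap |= ((1u << nr) - 1) << first;
}

/* The whole page counts as uptodate only once every sector is. */
static bool subpage_page_uptodate(const struct subpage_state *sp)
{
	return sp->uptodate_bitmap == (uint16_t)((1u << SECTORS_PER_PAGE) - 1);
}

int main(void)
{
	struct subpage_state sp = { 0 };

	subpage_set_uptodate(&sp, 0, 4 * 4096);		/* first 16K read in */
	printf("page uptodate: %d\n", subpage_page_uptodate(&sp));	/* 0 */

	subpage_set_uptodate(&sp, 4 * 4096, 12 * 4096);	/* rest of the page */
	printf("page uptodate: %d\n", subpage_page_uptodate(&sp));	/* 1 */
	return 0;
}

The same bitmap idea extends to error and reader tracking, which is why the hunks in this series route page state updates through btrfs_page_*/btrfs_subpage_* helpers instead of the bare page flag calls.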
@@ -3152,16 +3250,62 @@ static void attach_extent_buffer_page(struct extent_buffer *eb, if (page->mapping) lockdep_assert_held(&page->mapping->private_lock); - if (!PagePrivate(page)) - attach_page_private(page, eb); + if (fs_info->sectorsize == PAGE_SIZE) { + if (!PagePrivate(page)) + attach_page_private(page, eb); + else + WARN_ON(page->private != (unsigned long)eb); + return 0; + } + + /* Already mapped, just free prealloc */ + if (PagePrivate(page)) { + btrfs_free_subpage(prealloc); + return 0; + } + + if (prealloc) + /* Has preallocated memory for subpage */ + attach_page_private(page, prealloc); else - WARN_ON(page->private != (unsigned long)eb); + /* Do new allocation to attach subpage */ + ret = btrfs_attach_subpage(fs_info, page, + BTRFS_SUBPAGE_METADATA); + return ret; +} + +int set_page_extent_mapped(struct page *page) +{ + struct btrfs_fs_info *fs_info; + + ASSERT(page->mapping); + + if (PagePrivate(page)) + return 0; + + fs_info = btrfs_sb(page->mapping->host->i_sb); + + if (fs_info->sectorsize < PAGE_SIZE) + return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); + + attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); + return 0; } -void set_page_extent_mapped(struct page *page) +void clear_page_extent_mapped(struct page *page) { + struct btrfs_fs_info *fs_info; + + ASSERT(page->mapping); + if (!PagePrivate(page)) - attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); + return; + + fs_info = btrfs_sb(page->mapping->host->i_sb); + if (fs_info->sectorsize < PAGE_SIZE) + return btrfs_detach_subpage(fs_info, page); + + detach_page_private(page); } static struct extent_map * @@ -3202,6 +3346,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 start = page_offset(page); const u64 end = start + PAGE_SIZE - 1; u64 cur = start; @@ -3218,12 +3363,19 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, unsigned long this_bio_flag = 0; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_extent(tree, start, end); + btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); + unlock_page(page); + goto out; + } if (!PageUptodate(page)) { if (cleancache_get_page(page) == 0) { BUG_ON(blocksize != PAGE_SIZE); unlock_extent(tree, start, end); + unlock_page(page); goto out; } } @@ -3240,9 +3392,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, kunmap_atomic(userpage); } } + begin_page_read(fs_info, page); while (cur <= end) { bool force_bio_submit = false; - u64 offset; + u64 disk_bytenr; if (cur >= last_byte) { char *userpage; @@ -3257,13 +3410,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, cur + iosize - 1, &cached); + end_page_read(page, true, cur, iosize); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, em_cached); if (IS_ERR_OR_NULL(em)) { - SetPageError(page); unlock_extent(tree, cur, end); + end_page_read(page, false, cur, end + 1 - cur); break; } extent_offset = cur - em->start; @@ -3280,9 +3434,9 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) - offset = em->block_start; + disk_bytenr = 
em->block_start; else - offset = em->block_start + extent_offset; + disk_bytenr = em->block_start + extent_offset; block_start = em->block_start; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) block_start = EXTENT_MAP_HOLE; @@ -3346,6 +3500,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, cur + iosize - 1, &cached); + end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -3355,6 +3510,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); unlock_extent(tree, cur, cur + iosize - 1); + end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -3363,15 +3519,15 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * to date. Error out */ if (block_start == EXTENT_MAP_INLINE) { - SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); + end_page_read(page, false, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - page, offset, iosize, + page, disk_bytenr, iosize, pg_offset, bio, end_bio_extent_readpage, 0, *bio_flags, @@ -3381,19 +3537,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, nr++; *bio_flags = this_bio_flag; } else { - SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); + end_page_read(page, false, cur, iosize); goto out; } cur = cur + iosize; pg_offset += iosize; } out: - if (!nr) { - if (!PageError(page)) - SetPageUptodate(page); - unlock_page(page); - } return ret; } @@ -3513,23 +3664,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, unsigned long nr_written, int *nr_ret) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; u64 start = page_offset(page); - u64 page_end = start + PAGE_SIZE - 1; - u64 end; + u64 end = start + PAGE_SIZE - 1; u64 cur = start; u64 extent_offset; u64 block_start; - u64 iosize; struct extent_map *em; - size_t pg_offset = 0; - size_t blocksize; int ret = 0; int nr = 0; + u32 opf = REQ_OP_WRITE; const unsigned int write_flags = wbc_to_write_flags(wbc); bool compressed; - ret = btrfs_writepage_cow_fixup(page, start, page_end); + ret = btrfs_writepage_cow_fixup(page, start, end); if (ret) { /* Fixup worker will requeue */ redirty_page_for_writepage(wbc, page); @@ -3544,16 +3693,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ update_nr_written(wbc, nr_written + 1); - end = page_end; - blocksize = inode->vfs_inode.i_sb->s_blocksize; - while (cur <= end) { + u64 disk_bytenr; u64 em_end; - u64 offset; + u32 iosize; if (cur >= i_size) { - btrfs_writepage_endio_finish_ordered(page, cur, - page_end, 1); + btrfs_writepage_endio_finish_ordered(page, cur, end, 1); break; } em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); @@ -3565,13 +3711,20 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, extent_offset = cur - em->start; em_end = extent_map_end(em); - BUG_ON(em_end <= cur); - BUG_ON(end < cur); - iosize = min(em_end - cur, end - cur + 1); - iosize = ALIGN(iosize, blocksize); - offset = em->block_start + extent_offset; + ASSERT(cur <= em_end); + ASSERT(cur < end); + ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); block_start = em->block_start; compressed = 
test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + disk_bytenr = em->block_start + extent_offset; + + /* Note that em_end from extent_map_end() is exclusive */ + iosize = min(em_end, end + 1) - cur; + + if (btrfs_use_zone_append(inode, em)) + opf = REQ_OP_ZONE_APPEND; + free_extent_map(em); em = NULL; @@ -3587,7 +3740,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, btrfs_writepage_endio_finish_ordered(page, cur, cur + iosize - 1, 1); cur += iosize; - pg_offset += iosize; continue; } @@ -3598,9 +3750,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, page->index, cur, end); } - ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - page, offset, iosize, pg_offset, - &epd->bio, + ret = submit_extent_page(opf | write_flags, wbc, page, + disk_bytenr, iosize, + cur - page_offset(page), &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); if (ret) { @@ -3609,8 +3761,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, end_page_writeback(page); } - cur = cur + iosize; - pg_offset += iosize; + cur += iosize; nr++; } *nr_ret = nr; @@ -3663,7 +3814,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, flush_dcache_page(page); } - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) { + SetPageError(page); + goto done; + } if (!epd->extent_locked) { ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start, @@ -3923,7 +4078,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, struct extent_page_data *epd) { - u64 offset = eb->start; + u64 disk_bytenr = eb->start; u32 nritems; int i, num_pages; unsigned long start, end; @@ -3956,7 +4111,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - p, offset, PAGE_SIZE, 0, + p, disk_bytenr, PAGE_SIZE, 0, &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, false); @@ -3969,7 +4124,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, ret = -EIO; break; } - offset += PAGE_SIZE; + disk_bytenr += PAGE_SIZE; update_nr_written(wbc, 1); unlock_page(p); } @@ -4010,6 +4165,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; + struct btrfs_block_group *cache = NULL; struct extent_buffer *eb; int ret; @@ -4042,13 +4198,31 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!ret) return 0; + if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) { + /* + * If for_sync, this hole will be filled with + * trasnsaction commit. 
+ */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + ret = -EAGAIN; + else + ret = 0; + free_extent_buffer(eb); + return ret; + } + *eb_context = eb; ret = lock_extent_buffer_for_io(eb, epd); if (ret <= 0) { + btrfs_revert_meta_write_pointer(cache, eb); + if (cache) + btrfs_put_block_group(cache); free_extent_buffer(eb); return ret; } + if (cache) + btrfs_put_block_group(cache); ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); if (ret < 0) @@ -4094,6 +4268,7 @@ int btree_write_cache_pages(struct address_space *mapping, tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; + btrfs_zoned_meta_io_lock(fs_info); retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); @@ -4134,7 +4309,7 @@ retry: } if (ret < 0) { end_write_bio(&epd, ret); - return ret; + goto out; } /* * If something went wrong, don't allow any metadata write bio to be @@ -4169,14 +4344,17 @@ retry: ret = -EROFS; end_write_bio(&epd, ret); } +out: + btrfs_zoned_meta_io_unlock(fs_info); return ret; } /** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * Walk the list of dirty pages of the given address space and write all of them. + * * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @data: data passed to __extent_writepage function + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @epd: holds context for the write, namely the bio * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, @@ -4975,25 +5153,39 @@ int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -/* - * Release all pages attached to the extent buffer. - */ -static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) +static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) { - int i; - int num_pages; - int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); + struct btrfs_subpage *subpage; - BUG_ON(extent_buffer_under_io(eb)); + lockdep_assert_held(&page->mapping->private_lock); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *page = eb->pages[i]; + if (PagePrivate(page)) { + subpage = (struct btrfs_subpage *)page->private; + if (atomic_read(&subpage->eb_refs)) + return true; + } + return false; +} - if (!page) - continue; +static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); + + /* + * For mapped eb, we're going to change the page private, which should + * be done under the private_lock. + */ + if (mapped) + spin_lock(&page->mapping->private_lock); + + if (!PagePrivate(page)) { if (mapped) - spin_lock(&page->mapping->private_lock); + spin_unlock(&page->mapping->private_lock); + return; + } + + if (fs_info->sectorsize == PAGE_SIZE) { /* * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race @@ -5012,9 +5204,49 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) */ detach_page_private(page); } - if (mapped) spin_unlock(&page->mapping->private_lock); + return; + } + + /* + * For subpage, we can have dummy eb with page private. 
In this case, + * we can directly detach the private as such page is only attached to + * one dummy eb, no sharing. + */ + if (!mapped) { + btrfs_detach_subpage(fs_info, page); + return; + } + + btrfs_page_dec_eb_refs(fs_info, page); + + /* + * We can only detach the page private if there are no other ebs in the + * page range. + */ + if (!page_range_has_eb(fs_info, page)) + btrfs_detach_subpage(fs_info, page); + + spin_unlock(&page->mapping->private_lock); +} + +/* Release all pages attached to the extent buffer */ +static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) +{ + int i; + int num_pages; + + ASSERT(!extent_buffer_under_io(eb)); + + num_pages = num_extent_pages(eb); + for (i = 0; i < num_pages; i++) { + struct page *page = eb->pages[i]; + + if (!page) + continue; + + detach_extent_buffer_page(eb, page); /* One for when we allocated the page */ put_page(page); @@ -5046,6 +5278,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, &fs_info->allocated_ebs); + INIT_LIST_HEAD(&eb->release_list); spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); @@ -5067,21 +5300,32 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) if (new == NULL) return NULL; + /* + * Set UNMAPPED before calling btrfs_release_extent_buffer(), as + * btrfs_release_extent_buffer() have different behavior for + * UNMAPPED subpage extent buffer. + */ + set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); + for (i = 0; i < num_pages; i++) { + int ret; + p = alloc_page(GFP_NOFS); if (!p) { btrfs_release_extent_buffer(new); return NULL; } - attach_extent_buffer_page(new, p); + ret = attach_extent_buffer_page(new, p, NULL); + if (ret < 0) { + put_page(p); + btrfs_release_extent_buffer(new); + return NULL; + } WARN_ON(PageDirty(p)); - SetPageUptodate(p); new->pages[i] = p; copy_page(page_address(p), page_address(src->pages[i])); } - - set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); - set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); + set_extent_buffer_uptodate(new); return new; } @@ -5099,9 +5343,14 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { + int ret; + eb->pages[i] = alloc_page(GFP_NOFS); if (!eb->pages[i]) goto err; + ret = attach_extent_buffer_page(eb, eb->pages[i], NULL); + if (ret < 0) + goto err; } set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); @@ -5109,8 +5358,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, return eb; err: - for (; i > 0; i--) + for (; i > 0; i--) { + detach_extent_buffer_page(eb, eb->pages[i - 1]); __free_page(eb->pages[i - 1]); + } __free_extent_buffer(eb); return NULL; } @@ -5252,6 +5503,38 @@ free_eb: } #endif +static struct extent_buffer *grab_extent_buffer( + struct btrfs_fs_info *fs_info, struct page *page) +{ + struct extent_buffer *exists; + + /* + * For subpage case, we completely rely on radix tree to ensure we + * don't try to insert two ebs for the same bytenr. So here we always + * return NULL and just continue. 
+ */ + if (fs_info->sectorsize < PAGE_SIZE) + return NULL; + + /* Page not yet attached to an extent buffer */ + if (!PagePrivate(page)) + return NULL; + + /* + * We could have already allocated an eb for this page and attached one + * so lets see if we can get a ref on the existing eb, and if we can we + * know it's good and we can just return that one, else we know we can + * just overwrite page->private. + */ + exists = (struct extent_buffer *)page->private; + if (atomic_inc_not_zero(&exists->refs)) + return exists; + + WARN_ON(PageDirty(page)); + detach_page_private(page); + return NULL; +} + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { @@ -5290,36 +5573,58 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++, index++) { + struct btrfs_subpage *prealloc = NULL; + p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); if (!p) { exists = ERR_PTR(-ENOMEM); goto free_eb; } - spin_lock(&mapping->private_lock); - if (PagePrivate(p)) { - /* - * We could have already allocated an eb for this page - * and attached one so lets see if we can get a ref on - * the existing eb, and if we can we know it's good and - * we can just return that one, else we know we can just - * overwrite page->private. - */ - exists = (struct extent_buffer *)p->private; - if (atomic_inc_not_zero(&exists->refs)) { - spin_unlock(&mapping->private_lock); - unlock_page(p); - put_page(p); - mark_extent_buffer_accessed(exists, p); - goto free_eb; - } - exists = NULL; + /* + * Preallocate page->private for subpage case, so that we won't + * allocate memory with private_lock hold. The memory will be + * freed by attach_extent_buffer_page() or freed manually if + * we exit earlier. + * + * Although we have ensured one subpage eb can only have one + * page, but it may change in the future for 16K page size + * support, so we still preallocate the memory in the loop. + */ + ret = btrfs_alloc_subpage(fs_info, &prealloc, + BTRFS_SUBPAGE_METADATA); + if (ret < 0) { + unlock_page(p); + put_page(p); + exists = ERR_PTR(ret); + goto free_eb; + } - WARN_ON(PageDirty(p)); - detach_page_private(p); + spin_lock(&mapping->private_lock); + exists = grab_extent_buffer(fs_info, p); + if (exists) { + spin_unlock(&mapping->private_lock); + unlock_page(p); + put_page(p); + mark_extent_buffer_accessed(exists, p); + btrfs_free_subpage(prealloc); + goto free_eb; } - attach_extent_buffer_page(eb, p); + /* Should not fail, as we have preallocated the memory */ + ret = attach_extent_buffer_page(eb, p, prealloc); + ASSERT(!ret); + /* + * To inform we have extra eb under allocation, so that + * detach_extent_buffer_page() won't release the page private + * when the eb hasn't yet been inserted into radix tree. + * + * The ref will be decreased when the eb released the page, in + * detach_extent_buffer_page(). + * Thus needs no special handling in error path. 
+ */ + btrfs_page_inc_eb_refs(fs_info, p); spin_unlock(&mapping->private_lock); + WARN_ON(PageDirty(p)); eb->pages[i] = p; if (!PageUptodate(p)) @@ -5525,31 +5830,101 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) void clear_extent_buffer_uptodate(struct extent_buffer *eb) { - int i; + struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page; int num_pages; + int i; clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (page) - ClearPageUptodate(page); + btrfs_page_clear_uptodate(fs_info, page, + eb->start, eb->len); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { - int i; + struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page; int num_pages; + int i; set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - SetPageUptodate(page); + btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len); + } +} + +static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct extent_io_tree *io_tree; + struct page *page = eb->pages[0]; + struct bio *bio = NULL; + int ret = 0; + + ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); + ASSERT(PagePrivate(page)); + io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; + + if (wait == WAIT_NONE) { + ret = try_lock_extent(io_tree, eb->start, + eb->start + eb->len - 1); + if (ret <= 0) + return ret; + } else { + ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); + if (ret < 0) + return ret; + } + + ret = 0; + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || + PageUptodate(page) || + btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1); + return ret; + } + + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = 0; + atomic_set(&eb->io_pages, 1); + check_buffer_tree_ref(eb); + btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); + + ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start, + eb->len, eb->start - page_offset(page), &bio, + end_bio_extent_readpage, mirror_num, 0, 0, + true); + if (ret) { + /* + * In the endio function, if we hit something wrong we will + * increase the io_pages, so here we need to decrease it for + * error path. + */ + atomic_dec(&eb->io_pages); + } + if (bio) { + int tmp; + + tmp = submit_one_bio(bio, mirror_num, 0); + if (tmp < 0) + return tmp; } + if (ret || wait != WAIT_COMPLETE) + return ret; + + wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + ret = -EIO; + return ret; } int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) @@ -5568,10 +5943,20 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; + if (eb->fs_info->sectorsize < PAGE_SIZE) + return read_extent_buffer_subpage(eb, wait, mirror_num); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (wait == WAIT_NONE) { + /* + * WAIT_NONE is only utilized by readahead. If we can't + * acquire the lock atomically it means either the eb + * is being read out or under modification. + * Either way the eb will be or has been cached, + * readahead can exit safely. 
+ */ if (!trylock_page(page)) goto unlock_exit; } else { @@ -5823,6 +6208,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, char *src = (char *)srcv; unsigned long i = get_eb_page_index(start); + WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); + if (check_eb_range(eb, start, len)) return; @@ -6169,13 +6556,115 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } +static struct extent_buffer *get_next_extent_buffer( + struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) +{ + struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE]; + struct extent_buffer *found = NULL; + u64 page_start = page_offset(page); + int ret; + int i; + + ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); + ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE); + lockdep_assert_held(&fs_info->buffer_lock); + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang, + bytenr >> fs_info->sectorsize_bits, + PAGE_SIZE / fs_info->nodesize); + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + break; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + break; + } + } + return found; +} + +static int try_release_subpage_extent_buffer(struct page *page) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + u64 cur = page_offset(page); + const u64 end = page_offset(page) + PAGE_SIZE; + int ret; + + while (cur < end) { + struct extent_buffer *eb = NULL; + + /* + * Unlike try_release_extent_buffer() which uses page->private + * to grab buffer, for subpage case we rely on radix tree, thus + * we need to ensure radix tree consistency. + * + * We also want an atomic snapshot of the radix tree, thus go + * with spinlock rather than RCU. + */ + spin_lock(&fs_info->buffer_lock); + eb = get_next_extent_buffer(fs_info, page, cur); + if (!eb) { + /* No more eb in the page range after or at cur */ + spin_unlock(&fs_info->buffer_lock); + break; + } + cur = eb->start + eb->len; + + /* + * The same as try_release_extent_buffer(), to ensure the eb + * won't disappear out from under us. + */ + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&fs_info->buffer_lock); + break; + } + spin_unlock(&fs_info->buffer_lock); + + /* + * If tree ref isn't set then we know the ref on this eb is a + * real ref, so just return, this eb will likely be freed soon + * anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + break; + } + + /* + * Here we don't care about the return value, we will always + * check the page private at the end. And + * release_extent_buffer() will release the refs_lock. + */ + release_extent_buffer(eb); + } + /* + * Finally to check if we have cleared page private, as if we have + * released all ebs in the page, the page private should be cleared now. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) + ret = 1; + else + ret = 0; + spin_unlock(&page->mapping->private_lock); + return ret; + +} + int try_release_extent_buffer(struct page *page) { struct extent_buffer *eb; + if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + return try_release_subpage_extent_buffer(page); + /* - * We need to make sure nobody is attaching this page to an eb right - * now. 
+ * We need to make sure nobody is changing page->private, as we rely on + * page->private as the pointer to extent buffer. */ spin_lock(&page->mapping->private_lock); if (!PagePrivate(page)) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 19221095c635..824640cb0ace 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -31,16 +31,17 @@ enum { EXTENT_BUFFER_IN_TREE, /* write IO error */ EXTENT_BUFFER_WRITE_ERR, + EXTENT_BUFFER_NO_CHECK, }; /* these are flags for __process_pages_contig */ #define PAGE_UNLOCK (1 << 0) -#define PAGE_CLEAR_DIRTY (1 << 1) -#define PAGE_SET_WRITEBACK (1 << 2) -#define PAGE_END_WRITEBACK (1 << 3) -#define PAGE_SET_PRIVATE2 (1 << 4) -#define PAGE_SET_ERROR (1 << 5) -#define PAGE_LOCK (1 << 6) +/* Page starts writeback, clear dirty bit and set writeback bit */ +#define PAGE_START_WRITEBACK (1 << 1) +#define PAGE_END_WRITEBACK (1 << 2) +#define PAGE_SET_PRIVATE2 (1 << 3) +#define PAGE_SET_ERROR (1 << 4) +#define PAGE_LOCK (1 << 5) /* * page->private values. Every page that is controlled by the extent @@ -93,6 +94,7 @@ struct extent_buffer { struct rw_semaphore lock; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; + struct list_head release_list; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif @@ -178,7 +180,8 @@ int btree_write_cache_pages(struct address_space *mapping, void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); -void set_page_extent_mapped(struct page *page); +int set_page_extent_mapped(struct page *page); +void clear_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index bd6229fb2b6f..4a8e02f7b6c7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -385,9 +385,12 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) } /** - * add_extent_mapping - add new extent map to the extent tree + * Add new extent map to the extent tree + * * @tree: tree to insert new map in * @em: map to insert + * @modified: indicate whether the given @em should be added to the + * modified list, which indicates the extent needs to be logged * * Insert @em into @tree or perform a simple forward/backward merge with * existing mappings. The extent_map struct passed in will be inserted @@ -574,12 +577,13 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, } /** - * btrfs_add_extent_mapping - add extent mapping into em_tree - * @fs_info - used for tracepoint - * @em_tree - the extent tree into which we want to insert the extent mapping - * @em_in - extent we are inserting - * @start - start of the logical range btrfs_get_extent() is requesting - * @len - length of the logical range btrfs_get_extent() is requesting + * Add extent mapping into em_tree + * + * @fs_info: the filesystem + * @em_tree: extent tree into which we want to insert the extent mapping + * @em_in: extent we are inserting + * @start: start of the logical range btrfs_get_extent() is requesting + * @len: length of the logical range btrfs_get_extent() is requesting * * Note that @em_in's range may be different from [start, start+len), * but they must be overlapped. 
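For reference, the subpage release path added above (try_release_subpage_extent_buffer() with get_next_extent_buffer()) walks every extent buffer that lives inside one page: look up the next buffer at or after a cursor, advance the cursor past it, and stop as soon as one is still in use. A minimal userspace sketch of that walk, with a sorted array standing in for the buffer_radix tree; struct eb_stub, next_eb() and the sizes below are illustrative, not btrfs API:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define PAGE_SZ		65536u	/* e.g. 64K pages */
#define NODESIZE	16384u	/* 16K metadata nodes, four per page */

struct eb_stub {
	uint64_t start;
	uint32_t len;
	int refs;
};

/* Buffers inside one page, kept sorted by start (the radix tree gives this order). */
static struct eb_stub page_ebs[] = {
	{ .start = 0,     .len = NODESIZE, .refs = 1 },
	{ .start = 16384, .len = NODESIZE, .refs = 2 },	/* still referenced */
	{ .start = 49152, .len = NODESIZE, .refs = 1 },
};

/* Return the first buffer whose start is >= bytenr and still inside the page. */
static struct eb_stub *next_eb(uint64_t page_start, uint64_t bytenr)
{
	for (size_t i = 0; i < sizeof(page_ebs) / sizeof(page_ebs[0]); i++) {
		if (page_ebs[i].start >= page_start + PAGE_SZ)
			break;
		if (page_ebs[i].start >= bytenr)
			return &page_ebs[i];
	}
	return NULL;
}

int main(void)
{
	uint64_t page_start = 0, cur = page_start;
	struct eb_stub *eb;

	while ((eb = next_eb(page_start, cur))) {
		cur = eb->start + eb->len;	/* advance past this buffer */
		if (eb->refs != 1) {
			printf("eb %llu busy, stop\n", (unsigned long long)eb->start);
			break;
		}
		printf("release eb %llu\n", (unsigned long long)eb->start);
	}
	return 0;
}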
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6ccfc019ad90..47cd3a6dc635 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -24,8 +24,10 @@ PAGE_SIZE)) /** - * @inode - the inode we want to update the disk_i_size for - * @new_i_size - the i_size we want to set to, 0 if we use i_size + * Set inode's size according to filesystem options + * + * @inode: inode we want to update the disk_i_size for + * @new_i_size: i_size we want to set to, 0 if we use i_size * * With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read() * returns as it is perfectly fine with a file that has holes without hole file @@ -62,9 +64,11 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz } /** - * @inode - the inode we're modifying - * @start - the start file offset of the file extent we've inserted - * @len - the logical length of the file extent item + * Mark range within a file as having a new extent inserted + * + * @inode: inode being modified + * @start: start file offset of the file extent we've inserted + * @len: logical length of the file extent item * * Call when we are inserting a new file extent where there was none before. * Does not need to call this in the case where we're replacing an existing file @@ -88,9 +92,11 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, } /** - * @inode - the inode we're modifying - * @start - the start file offset of the file extent we've inserted - * @len - the logical length of the file extent item + * Marks an inode range as not having a backing extent + * + * @inode: inode being modified + * @start: start file offset of the file extent we've inserted + * @len: logical length of the file extent item * * Called when we drop a file extent, for example when we truncate. Doesn't * need to be called for cases where we're replacing a file extent, like when diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0e41459b8de6..01a72f53fb5d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -453,12 +453,11 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) } /* - * after copy_from_user, pages need to be dirtied and we need to make - * sure holes are created between the current EOF and the start of - * any next extents (if required). - * - * this also makes the decision about creating an inline extent vs - * doing real data extents, marking pages dirty and delalloc as required. + * After btrfs_copy_from_user(), update the following things for delalloc: + * - Mark newly dirtied pages as DELALLOC in the io tree. + * Used to advise which range is to be written back. + * - Mark modified pages as Uptodate/Dirty and not needing COW fixup + * - Update inode size for past EOF write */ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, @@ -1370,6 +1369,12 @@ again: goto fail; } + err = set_page_extent_mapped(pages[i]); + if (err < 0) { + faili = i; + goto fail; + } + if (i == 0) err = prepare_uptodate_page(inode, pages[i], pos, force_uptodate); @@ -1454,23 +1459,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } /* - * It's possible the pages are dirty right now, but we don't want - * to clean them yet because copy_from_user may catch a page fault - * and we might have to fall back to one page at a time. 
If that - * happens, we'll unlock these pages and we'd have a window where - * reclaim could sneak in and drop the once-dirty page on the floor - * without writing it. - * - * We have the pages locked and the extent range locked, so there's - * no way someone can start IO on any dirty pages in this range. - * - * We'll call btrfs_dirty_pages() later on, and that will flip around - * delalloc bits and dirty the pages as required. + * We should be called after prepare_pages() which should have locked + * all pages in the range. */ - for (i = 0; i < num_pages; i++) { - set_page_extent_mapped(pages[i]); + for (i = 0; i < num_pages; i++) WARN_ON(!PageLocked(pages[i])); - } return ret; } @@ -1997,9 +1990,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written = 0; const bool sync = iocb->ki_flags & IOCB_DSYNC; @@ -2008,7 +1999,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * have opened a file as writable, we have to stop this write operation * to ensure consistency. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state)) return -EROFS; if (!(iocb->ki_flags & IOCB_DIRECT) && @@ -2016,7 +2007,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, return -EOPNOTSUPP; if (sync) - atomic_inc(&BTRFS_I(inode)->sync_writers); + atomic_inc(&inode->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) num_written = btrfs_direct_write(iocb, from); @@ -2028,14 +2019,14 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occurred. */ - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->last_sub_trans = root->log_transid; - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + inode->last_sub_trans = inode->root->log_transid; + spin_unlock(&inode->lock); if (num_written > 0) num_written = generic_write_sync(iocb, num_written); if (sync) - atomic_dec(&BTRFS_I(inode)->sync_writers); + atomic_dec(&inode->sync_writers); current->backing_dev_info = NULL; return num_written; @@ -2177,8 +2168,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * commit waits for their completion, to avoid data loss if we fsync, * the current transaction commits before the ordered extents complete * and a power failure happens right after that. + * + * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the + * logical address recorded in the ordered extent may change. We need + * to wait for the IO to stabilize the logical address. 
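A toy model of the constraint described in the comment above: with ZONE_APPEND the device only reports the final location at completion time, so anything that records the logical address (such as the fsync log) has to wait for the ordered extent to finish first. The types and helpers below (ordered_stub, zone_append_endio, log_extent) are made up for illustration and are not the btrfs implementation:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ordered_stub {
	uint64_t disk_bytenr;	/* provisional address at submit time */
	bool completed;		/* completion has rewritten disk_bytenr */
};

/* Completion handler: the device reports where the append actually landed. */
static void zone_append_endio(struct ordered_stub *oe, uint64_t actual_bytenr)
{
	oe->disk_bytenr = actual_bytenr;
	oe->completed = true;
}

/* Logging must only see finalized addresses, hence the wait in fsync. */
static void log_extent(const struct ordered_stub *oe)
{
	assert(oe->completed);
	printf("log extent at %llu\n", (unsigned long long)oe->disk_bytenr);
}

int main(void)
{
	struct ordered_stub oe = { .disk_bytenr = 1048576, .completed = false };

	/* fsync on zoned: wait for the ordered extent ... */
	zone_append_endio(&oe, 1310720);	/* device chose a different sector */
	/* ... and only then log it. */
	log_extent(&oe);
	return 0;
}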
*/ - if (full_sync) { + if (full_sync || btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, start, len); } else { /* @@ -2241,6 +2236,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = PTR_ERR(trans); goto out_release_extents; } + trans->in_fsync = true; ret = btrfs_log_dentry_safe(trans, dentry, &ctx); btrfs_release_log_ctx_extents(&ctx); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4d8897879c9c..5400294bd271 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -198,7 +198,7 @@ int create_free_space_inode(struct btrfs_trans_handle *trans, int ret; u64 ino; - ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino); + ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); if (ret < 0) return ret; @@ -431,11 +431,22 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) int i; for (i = 0; i < io_ctl->num_pages; i++) { + int ret; + page = find_or_create_page(inode->i_mapping, i, mask); if (!page) { io_ctl_drop_pages(io_ctl); return -ENOMEM; } + + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + io_ctl_drop_pages(io_ctl); + return ret; + } + io_ctl->pages[i] = page; if (uptodate && !PageUptodate(page)) { btrfs_readpage(NULL, page); @@ -455,10 +466,8 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) } } - for (i = 0; i < io_ctl->num_pages; i++) { + for (i = 0; i < io_ctl->num_pages; i++) clear_page_dirty_for_io(io_ctl->pages[i]); - set_page_extent_mapped(io_ctl->pages[i]); - } return 0; } @@ -775,8 +784,10 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, while (num_entries) { e = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); - if (!e) + if (!e) { + ret = -ENOMEM; goto free_cache; + } ret = io_ctl_read_entry(&io_ctl, e, &type); if (ret) { @@ -785,6 +796,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } if (!e->bytes) { + ret = -1; kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } @@ -805,6 +817,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, e->bitmap = kmem_cache_zalloc( btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!e->bitmap) { + ret = -ENOMEM; kmem_cache_free( btrfs_free_space_cachep, e); goto free_cache; @@ -1295,11 +1308,14 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, } /** - * __btrfs_write_out_cache - write out cached info to an inode - * @root - the root the inode belongs to - * @ctl - the free space cache we are going to write out - * @block_group - the block_group for this cache if it belongs to a block_group - * @trans - the trans handle + * Write out cached info to an inode + * + * @root: root the inode belongs to + * @inode: freespace inode we are writing out + * @ctl: free space cache we are going to write out + * @block_group: block_group for this cache if it belongs to a block_group + * @io_ctl: holds context for the io + * @trans: the trans handle * * This function writes out a free space cache struct to disk for quick recovery * on mount. 
This will return 0 if it was successful in writing the cache out, @@ -2461,6 +2477,8 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, int ret = 0; u64 filter_bytes = bytes; + ASSERT(!btrfs_is_zoned(fs_info)); + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) return -ENOMEM; @@ -2518,11 +2536,49 @@ out: return ret; } +static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, + u64 bytenr, u64 size, bool used) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + u64 offset = bytenr - block_group->start; + u64 to_free, to_unusable; + + spin_lock(&ctl->tree_lock); + if (!used) + to_free = size; + else if (offset >= block_group->alloc_offset) + to_free = size; + else if (offset + size <= block_group->alloc_offset) + to_free = 0; + else + to_free = offset + size - block_group->alloc_offset; + to_unusable = size - to_free; + + ctl->free_space += to_free; + block_group->zone_unusable += to_unusable; + spin_unlock(&ctl->tree_lock); + if (!used) { + spin_lock(&block_group->lock); + block_group->alloc_offset -= size; + spin_unlock(&block_group->lock); + } + + /* All the region is now unusable. Mark it as unused and reclaim */ + if (block_group->zone_unusable == block_group->length) + btrfs_mark_bg_unused(block_group); + + return 0; +} + int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + true); + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; @@ -2531,6 +2587,16 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group, bytenr, size, trim_state); } +int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + false); + + return btrfs_add_free_space(block_group, bytenr, size); +} + /* * This is a subtle distinction because when adding free space back in general, * we want it to be added as untrimmed for async. But in the case where we add @@ -2541,6 +2607,10 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + true); + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; @@ -2558,6 +2628,23 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group, int ret; bool re_search = false; + if (btrfs_is_zoned(block_group->fs_info)) { + /* + * This can happen with conventional zones when replaying log. + * Since the allocation info of tree-log nodes are not recorded + * to the extent-tree, calculate_alloc_pointer() failed to + * advance the allocation pointer after last allocated tree log + * node blocks. + * + * This function is called from + * btrfs_pin_extent_for_log_replay() when replaying the log. + * Advance the pointer not to overwrite the tree-log nodes. 
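The accounting in __btrfs_add_free_space_zoned() above splits the returned range around the block group's allocation pointer: bytes at or beyond alloc_offset become free space again, while bytes before it can only be reclaimed by a zone reset and are counted as zone_unusable. A standalone sketch of just that split, with made-up sizes (zoned_free_split is a stand-in, not a kernel helper):

#include <stdint.h>
#include <stdio.h>

struct split {
	uint64_t to_free;	/* goes back into ctl->free_space */
	uint64_t to_unusable;	/* accounted as zone_unusable */
};

/*
 * offset is relative to the block group start; used == 0 means the range was
 * never written (e.g. a failed allocation being returned).
 */
static struct split zoned_free_split(uint64_t offset, uint64_t size,
				     uint64_t alloc_offset, int used)
{
	struct split s;

	if (!used || offset >= alloc_offset)
		s.to_free = size;
	else if (offset + size <= alloc_offset)
		s.to_free = 0;
	else
		s.to_free = offset + size - alloc_offset;
	s.to_unusable = size - s.to_free;
	return s;
}

int main(void)
{
	/* Block group with the write pointer at 8 MiB. */
	const uint64_t alloc_offset = 8ull << 20;
	struct split s;

	s = zoned_free_split(6ull << 20, 4ull << 20, alloc_offset, 1);
	printf("straddling range: free=%llu unusable=%llu\n",
	       (unsigned long long)s.to_free, (unsigned long long)s.to_unusable);

	s = zoned_free_split(2ull << 20, 1ull << 20, alloc_offset, 1);
	printf("fully written range: free=%llu unusable=%llu\n",
	       (unsigned long long)s.to_free, (unsigned long long)s.to_unusable);
	return 0;
}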
+ */ + if (block_group->alloc_offset < offset + bytes) + block_group->alloc_offset = offset + bytes; + return 0; + } + spin_lock(&ctl->tree_lock); again: @@ -2652,6 +2739,16 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, struct rb_node *n; int count = 0; + /* + * Zoned btrfs does not use free space tree and cluster. Just print + * out the free space after the allocation offset. + */ + if (btrfs_is_zoned(fs_info)) { + btrfs_info(fs_info, "free space %llu", + block_group->length - block_group->alloc_offset); + return; + } + spin_lock(&ctl->tree_lock); for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { info = rb_entry(n, struct btrfs_free_space, offset_index); @@ -2845,6 +2942,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, block_group->full_stripe_len, max_extent_size); @@ -2976,6 +3075,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct rb_node *node; u64 ret = 0; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; @@ -3752,6 +3853,8 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group, int ret; u64 rem = 0; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + *trimmed = 0; spin_lock(&block_group->lock); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index ecb09a02d544..1f23088d43f9 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -107,6 +107,8 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); +int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, + u64 bytenr, u64 size); int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a8e0a6b038d3..535abf898225 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -50,6 +50,7 @@ #include "delalloc-space.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" struct btrfs_iget_args { u64 ino; @@ -692,8 +693,7 @@ cont: NULL, clear_flags, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | + PAGE_START_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); @@ -917,7 +917,6 @@ retry: ins.objectid, async_extent->ram_size, ins.offset, - BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); if (ret) { btrfs_drop_extent_cache(inode, async_extent->start, @@ -934,8 +933,7 @@ retry: async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, - PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK); + PAGE_UNLOCK | PAGE_START_WRITEBACK); if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, @@ -971,9 +969,8 @@ out_free: NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, - PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | - PAGE_SET_ERROR); + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); kfree(async_extent); goto again; @@ -1071,8 +1068,7 @@ static noinline int 
cow_file_range(struct btrfs_inode *inode, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | - PAGE_END_WRITEBACK); + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -1127,7 +1123,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, free_extent_map(em); ret = btrfs_add_ordered_extent(inode, start, ins.objectid, - ram_size, cur_alloc_size, 0); + ram_size, cur_alloc_size, + BTRFS_ORDERED_REGULAR); if (ret) goto out_drop_extent_cache; @@ -1194,8 +1191,7 @@ out_reserve: out_unlock: clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | - PAGE_END_WRITEBACK; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * If we reserved an extent for our delalloc range (or a subrange) and * failed to create the respective ordered extent, then it means that @@ -1320,9 +1316,8 @@ static int cow_file_range_async(struct btrfs_inode *inode, unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING; - unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | - PAGE_SET_ERROR; + unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK | PAGE_SET_ERROR; extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits, page_ops); @@ -1399,6 +1394,29 @@ static int cow_file_range_async(struct btrfs_inode *inode, return 0; } +static noinline int run_delalloc_zoned(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, int *page_started, + unsigned long *nr_written) +{ + int ret; + + ret = cow_file_range(inode, locked_page, start, end, page_started, + nr_written, 0); + if (ret) + return ret; + + if (*page_started) + return 0; + + __set_page_dirty_nobuffers(locked_page); + account_page_redirty(locked_page); + extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL); + *page_started = 1; + + return 0; +} + static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes) { @@ -1519,8 +1537,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); return -ENOMEM; } @@ -1842,8 +1859,7 @@ error: locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); btrfs_free_path(path); return ret; @@ -1878,17 +1894,24 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page { int ret; int force_cow = need_force_cow(inode, start, end); + const bool zoned = btrfs_is_zoned(inode->root->fs_info); if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) { + ASSERT(!zoned); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) { + ASSERT(!zoned); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { - ret = cow_file_range(inode, locked_page, start, end, - 
page_started, nr_written, 1); + if (zoned) + ret = run_delalloc_zoned(inode, locked_page, start, end, + page_started, nr_written); + else + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); ret = cow_file_range_async(inode, wbc, locked_page, start, end, @@ -2183,9 +2206,10 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 logical = bio->bi_iter.bi_sector << 9; + struct extent_map *em; u64 length = 0; u64 map_length; - int ret; + int ret = 0; struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) @@ -2193,14 +2217,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length, - &geom); + em = btrfs_get_chunk_map(fs_info, logical, map_length); + if (IS_ERR(em)) + return PTR_ERR(em); + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, + map_length, &geom); if (ret < 0) - return ret; + goto out; if (geom.len < length + size) - return 1; - return 0; + ret = 1; +out: + free_extent_map(em); + return ret; } /* @@ -2217,6 +2246,119 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } +bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, + unsigned int size) +{ + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_ordered_extent *ordered; + u64 len = bio->bi_iter.bi_size + size; + bool ret = true; + + ASSERT(btrfs_is_zoned(fs_info)); + ASSERT(fs_info->max_zone_append_size > 0); + ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND); + + /* Ordered extent not yet created, so we're good */ + ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); + if (!ordered) + return ret; + + if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len > + ordered->disk_bytenr + ordered->disk_num_bytes) + ret = false; + + btrfs_put_ordered_extent(ordered); + + return ret; +} + +static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, + struct bio *bio, loff_t file_offset) +{ + struct btrfs_ordered_extent *ordered; + struct extent_map *em = NULL, *em_new = NULL; + struct extent_map_tree *em_tree = &inode->extent_tree; + u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bio->bi_iter.bi_size; + u64 end = start + len; + u64 ordered_end; + u64 pre, post; + int ret = 0; + + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (WARN_ON_ONCE(!ordered)) + return BLK_STS_IOERR; + + /* No need to split */ + if (ordered->disk_num_bytes == len) + goto out; + + /* We cannot split once end_bio'd ordered extent */ + if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) { + ret = -EINVAL; + goto out; + } + + /* We cannot split a compressed ordered extent */ + if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) { + ret = -EINVAL; + goto out; + } + + ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes; + /* bio must be in one ordered extent */ + if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) { + ret = -EINVAL; + goto out; + } + + /* Checksum list should be empty */ + if (WARN_ON_ONCE(!list_empty(&ordered->list))) { + ret = -EINVAL; + goto out; + } + + pre = 
start - ordered->disk_bytenr; + post = ordered_end - end; + + ret = btrfs_split_ordered_extent(ordered, pre, post); + if (ret) + goto out; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, ordered->file_offset, len); + if (!em) { + read_unlock(&em_tree->lock); + ret = -EIO; + goto out; + } + read_unlock(&em_tree->lock); + + ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + /* + * We cannot reuse em_new here but have to create a new one, as + * unpin_extent_cache() expects the start of the extent map to be the + * logical offset of the file, which does not hold true anymore after + * splitting. + */ + em_new = create_io_em(inode, em->start + pre, len, + em->start + pre, em->block_start + pre, len, + len, len, BTRFS_COMPRESS_NONE, + BTRFS_ORDERED_REGULAR); + if (IS_ERR(em_new)) { + ret = PTR_ERR(em_new); + goto out; + } + free_extent_map(em_new); + +out: + free_extent_map(em); + btrfs_put_ordered_extent(ordered); + + return errno_to_blk_status(ret); +} + /* * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read. @@ -2252,7 +2394,16 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; - if (bio_op(bio) != REQ_OP_WRITE) { + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct page *page = bio_first_bvec_all(bio)->bv_page; + loff_t file_offset = page_offset(page); + + ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); + if (ret) + goto out; + } + + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); if (ret) goto out; @@ -2754,6 +2905,9 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + if (ordered_extent->disk) + btrfs_rewrite_logical_zoned(ordered_extent); + btrfs_free_io_failure_record(inode, start, end); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { @@ -3103,14 +3257,16 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) } /** - * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running - * @fs_info - the fs_info for this fs - * @return - EINTR if we were killed, 0 if nothing's pending + * Wait for flushing all delayed iputs + * + * @fs_info: the filesystem * * This will wait on any delayed iputs that are currently running with KILLABLE * set. Once they are all done running we will return, unless we are killed in * which case we return EINTR. This helps in user operations like fallocate etc * that might get blocked on the iputs. 
+ * + * Return EINTR if we were killed, 0 if nothing's pending */ int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) { @@ -4720,6 +4876,9 @@ again: ret = -ENOMEM; goto out; } + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; if (!PageUptodate(page)) { ret = btrfs_readpage(NULL, page); @@ -4737,7 +4896,6 @@ again: wait_on_page_writeback(page); lock_extent_bits(io_tree, block_start, block_end, &cached_state); - set_page_extent_mapped(page); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { @@ -5011,6 +5169,15 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + + if (btrfs_is_zoned(fs_info)) { + ret = btrfs_wait_ordered_range(inode, + ALIGN(newsize, fs_info->sectorsize), + (u64)-1); + if (ret) + return ret; + } /* * We're truncating a file that used to have good data down to @@ -6371,7 +6538,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6435,7 +6602,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6579,7 +6746,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_fail; @@ -7103,9 +7270,6 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, * @strict: if true, omit optimizations that might force us into unnecessary * cow. e.g., don't trust generation number. * - * This function will flush ordered extents in the range to ensure proper - * nocow checks for (nowait == false) case. - * * Return: * >0 and update @len if we can do nocow write * 0 if we can't do nocow write @@ -7613,6 +7777,9 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->bdev = fs_info->fs_devices->latest_bdev; iomap->length = len; + if (write && btrfs_use_zone_append(BTRFS_I(inode), em)) + iomap->flags |= IOMAP_F_ZONE_APPEND; + free_extent_map(em); return 0; @@ -7682,7 +7849,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) if (!refcount_dec_and_test(&dip->refs)) return; - if (bio_op(dip->dio_bio) == REQ_OP_WRITE) { + if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { __endio_write_update_ordered(BTRFS_I(dip->inode), dip->logical_offset, dip->bytes, @@ -7797,10 +7964,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, NULL); btrfs_queue_work(wq, &ordered->work); } - /* - * If btrfs_dec_test_ordered_pending does not find any ordered - * extent in the range, we can exit. 
- */ + + /* No ordered extent found in the range, exit */ if (ordered_offset == last_offset) return; /* @@ -7841,6 +8006,8 @@ static void btrfs_end_dio_bio(struct bio *bio) if (err) dip->dio_bio->bi_status = err; + btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio); + bio_put(bio); btrfs_dio_private_put(dip); } @@ -7850,7 +8017,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; - bool write = bio_op(bio) == REQ_OP_WRITE; + bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; blk_status_t ret; /* Check btrfs_submit_bio_hook() for rules about async submit. */ @@ -7900,7 +8067,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, struct inode *inode, loff_t file_offset) { - const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); size_t dip_size; struct btrfs_dio_private *dip; @@ -7930,7 +8097,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, struct bio *dio_bio, loff_t file_offset) { - const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK); @@ -7941,10 +8108,12 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, u64 submit_len; int clone_offset = 0; int clone_len; + u64 logical; int ret; blk_status_t status; struct btrfs_io_geometry geom; struct btrfs_dio_data *dio_data = iomap->private; + struct extent_map *em = NULL; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); if (!dip) { @@ -7973,12 +8142,18 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, submit_len = dio_bio->bi_iter.bi_size; do { - ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio), - start_sector << 9, submit_len, - &geom); + logical = start_sector << 9; + em = btrfs_get_chunk_map(fs_info, logical, submit_len); + if (IS_ERR(em)) { + status = errno_to_blk_status(PTR_ERR(em)); + em = NULL; + goto out_err_em; + } + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), + logical, submit_len, &geom); if (ret) { status = errno_to_blk_status(ret); - goto out_err; + goto out_err_em; } ASSERT(geom.len <= INT_MAX); @@ -7993,6 +8168,19 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; + WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) && + fs_info->max_zone_append_size && + bio_op(bio) != REQ_OP_ZONE_APPEND); + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + status = extract_ordered_extent(BTRFS_I(inode), bio, + file_offset); + if (status) { + bio_put(bio); + goto out_err; + } + } + ASSERT(submit_len >= clone_len); submit_len -= clone_len; @@ -8023,19 +8211,24 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, bio_put(bio); if (submit_len > 0) refcount_dec(&dip->refs); - goto out_err; + goto out_err_em; } dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; + + free_extent_map(em); } while (submit_len > 0); return BLK_QC_T_NONE; +out_err_em: + free_extent_map(em); 
out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } @@ -8117,7 +8310,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); if (ret == 1) - detach_page_private(page); + clear_page_extent_mapped(page); return ret; } @@ -8186,8 +8379,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (!inode_evicting) lock_extent_bits(tree, page_start, page_end, &cached_state); -again: + start = page_start; +again: ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { found_ordered = true; @@ -8276,7 +8470,7 @@ again: } ClearPageChecked(page); - detach_page_private(page); + clear_page_extent_mapped(page); } /* @@ -8355,7 +8549,12 @@ again: wait_on_page_writeback(page); lock_extent_bits(io_tree, page_start, page_end, &cached_state); - set_page_extent_mapped(page); + ret2 = set_page_extent_mapped(page); + if (ret2 < 0) { + ret = vmf_error(ret2); + unlock_extent_cached(io_tree, page_start, page_end, &cached_state); + goto out_unlock; + } /* * we can't set the delalloc bits if there are pending ordered @@ -8592,15 +8791,18 @@ out: */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - struct btrfs_root *parent_root, - u64 new_dirid) + struct btrfs_root *parent_root) { struct inode *inode; int err; u64 index = 0; + u64 ino; + + err = btrfs_get_free_objectid(new_root, &ino); + if (err < 0) + return err; - inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, - new_dirid, new_dirid, + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino, S_IFDIR | (~current_umask() & S_IRWXUGO), &index); if (IS_ERR(inode)) @@ -9079,7 +9281,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, u64 objectid; u64 index; - ret = btrfs_find_free_objectid(root, &objectid); + ret = btrfs_get_free_objectid(root, &objectid); if (ret) return ret; @@ -9486,11 +9688,11 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root) return start_delalloc_inodes(root, &wbc, true, false); } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context) { struct writeback_control wbc = { - .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr, + .nr_to_write = nr, .sync_mode = WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, @@ -9507,12 +9709,12 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); - while (!list_empty(&splice) && nr) { + while (!list_empty(&splice)) { /* * Reset nr_to_write here so we know that we're doing a full * flush. 
*/ - if (nr == U64_MAX) + if (nr == LONG_MAX) wbc.nr_to_write = LONG_MAX; root = list_first_entry(&splice, struct btrfs_root, @@ -9575,7 +9777,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -9909,7 +10111,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_find_free_objectid(root, &objectid); + ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dde49a791f3e..a8c60d46d19c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -528,6 +528,14 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, return -EPERM; /* + * btrfs_trim_block_group() depends on space cache, which is not + * available in zoned filesystem. So, disallow fitrim on a zoned + * filesystem for now. + */ + if (btrfs_is_zoned(fs_info)) + return -EOPNOTSUPP; + + /* * If the fs is mounted with nologreplay, which requires it to be * mounted in RO mode as well, we can not allow discard on free space * inside block groups, because log trees refer to extents that are not @@ -606,14 +614,13 @@ static noinline int create_subvol(struct inode *dir, int err; dev_t anon_dev = 0; u64 objectid; - u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) return -ENOMEM; - ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid); + ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) goto fail_free; @@ -693,7 +700,7 @@ static noinline int create_subvol(struct inode *dir, free_extent_buffer(leaf); leaf = NULL; - btrfs_set_root_dirid(root_item, new_dirid); + btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); key.objectid = objectid; key.offset = 0; @@ -716,7 +723,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_record_root_in_trans(trans, new_root); - ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); + ret = btrfs_create_subvol_root(trans, new_root, root); btrfs_put_root(new_root); if (ret) { /* We potentially lose an unused inode item here */ @@ -724,10 +731,6 @@ static noinline int create_subvol(struct inode *dir, goto fail; } - mutex_lock(&new_root->objectid_mutex); - new_root->highest_objectid = new_dirid; - mutex_unlock(&new_root->objectid_mutex); - /* * insert the directory item */ @@ -1319,6 +1322,13 @@ again: if (!page) break; + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + break; + } + page_start = page_offset(page); page_end = page_start + PAGE_SIZE - 1; while (1) { @@ -1440,7 +1450,6 @@ again: for (i = 0; i < i_done; i++) { clear_page_dirty_for_io(pages[i]); ClearPageChecked(pages[i]); - set_page_extent_mapped(pages[i]); set_page_dirty(pages[i]); unlock_page(pages[i]); put_page(pages[i]); @@ -4951,7 +4960,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false); + ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 79d366a36223..985a21558437 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -199,14 +199,21 @@ static int 
__btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = ret; - if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) - set_bit(type, &entry->flags); + entry->physical = (u64)-1; + entry->disk = NULL; + entry->partno = (u8)-1; - if (dio) { - percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes, - fs_info->delalloc_batch); + ASSERT(type == BTRFS_ORDERED_REGULAR || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_PREALLOC || + type == BTRFS_ORDERED_COMPRESSED); + set_bit(type, &entry->flags); + + percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes, + fs_info->delalloc_batch); + + if (dio) set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); - } /* one ref for the tree */ refcount_set(&entry->refs, 1); @@ -256,6 +263,9 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { + ASSERT(type == BTRFS_ORDERED_REGULAR || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_PREALLOC); return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 0, BTRFS_COMPRESS_NONE); @@ -265,6 +275,9 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { + ASSERT(type == BTRFS_ORDERED_REGULAR || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_PREALLOC); return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 1, BTRFS_COMPRESS_NONE); @@ -272,11 +285,12 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type, - int compress_type) + u64 disk_num_bytes, int compress_type) { + ASSERT(compress_type != BTRFS_COMPRESS_NONE); return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, type, 0, + num_bytes, disk_num_bytes, + BTRFS_ORDERED_COMPRESSED, 0, compress_type); } @@ -297,26 +311,33 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, } /* - * this is used to account for finished IO across a given range - * of the file. The IO may span ordered extents. If - * a given ordered_extent is completely done, 1 is returned, otherwise - * 0. + * Finish IO for one ordered extent across a given range. The range can + * contain several ordered extents. + * + * @found_ret: Return the finished ordered extent + * @file_offset: File offset for the finished IO + * Will also be updated to one byte past the range that is + * recordered as finished. This allows caller to walk forward. + * @io_size: Length of the finish IO range + * @uptodate: If the IO finished without problem * - * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used - * to make sure this function only returns 1 once for a given ordered extent. + * Return true if any ordered extent is finished in the range, and update + * @found_ret and @file_offset. + * Return false otherwise. * - * file_offset is updated to one byte past the range that is recorded as - * complete. This allows you to walk forward in the file. + * NOTE: Although The range can cross multiple ordered extents, only one + * ordered extent will be updated during one call. The caller is responsible to + * iterate all ordered extents in the range. 
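The rewritten kernel-doc above pins down the calling convention: each call finishes at most one ordered extent, accounts the bytes it covered and advances @file_offset past them, so the caller loops until the whole range is handled. A hedged sketch of such a caller loop against a stand-in function (dec_test_first and ordered_stub below are illustrative, not the btrfs API):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* One per-inode ordered extent in this toy setup: [file_offset, file_offset + num_bytes). */
struct ordered_stub {
	uint64_t file_offset;
	uint64_t num_bytes;
	uint64_t bytes_left;
};

static struct ordered_stub extents[] = {
	{ .file_offset = 0,     .num_bytes = 65536, .bytes_left = 65536 },
	{ .file_offset = 65536, .num_bytes = 65536, .bytes_left = 65536 },
};

/*
 * Stand-in for btrfs_dec_test_first_ordered_pending(): account @io_size bytes
 * starting at *file_offset, advance *file_offset, and return true only when
 * one ordered extent became fully finished by this call.
 */
static bool dec_test_first(uint64_t *file_offset, uint64_t io_size)
{
	for (size_t i = 0; i < sizeof(extents) / sizeof(extents[0]); i++) {
		struct ordered_stub *oe = &extents[i];
		uint64_t end = oe->file_offset + oe->num_bytes;

		if (*file_offset < oe->file_offset || *file_offset >= end)
			continue;

		uint64_t dec_end = *file_offset + io_size < end ?
				   *file_offset + io_size : end;
		oe->bytes_left -= dec_end - *file_offset;
		*file_offset = dec_end;
		return oe->bytes_left == 0;
	}
	return false;
}

int main(void)
{
	uint64_t cursor = 0;
	const uint64_t end = 131072;

	/* Caller walks the range; at most one extent is finished per call. */
	while (cursor < end) {
		uint64_t before = cursor;
		bool finished = dec_test_first(&cursor, end - cursor);

		printf("advanced %llu..%llu, finished=%d\n",
		       (unsigned long long)before,
		       (unsigned long long)cursor, finished);
		if (cursor == before)
			break;	/* nothing found at the cursor, bail out */
	}
	return 0;
}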
*/ -int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, +bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **finished_ret, u64 *file_offset, u64 io_size, int uptodate) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - int ret; + bool finished = false; unsigned long flags; u64 dec_end; u64 dec_start; @@ -324,16 +345,12 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, *file_offset); - if (!node) { - ret = 1; + if (!node) goto out; - } entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, *file_offset)) { - ret = 1; + if (!offset_in_entry(entry, *file_offset)) goto out; - } dec_start = max(*file_offset, entry->file_offset); dec_end = min(*file_offset + io_size, @@ -354,39 +371,50 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, set_bit(BTRFS_ORDERED_IOERR, &entry->flags); if (entry->bytes_left == 0) { - ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Ensure only one caller can set the flag and finished_ret + * accordingly + */ + finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); /* test_and_set_bit implies a barrier */ cond_wake_up_nomb(&entry->wait); - } else { - ret = 1; } out: - if (!ret && cached && entry) { - *cached = entry; + if (finished && finished_ret && entry) { + *finished_ret = entry; refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); - return ret == 0; + return finished; } /* - * this is used to account for finished IO across a given range - * of the file. The IO should not span ordered extents. If - * a given ordered_extent is completely done, 1 is returned, otherwise - * 0. + * Finish IO for one ordered extent across a given range. The range can only + * contain one ordered extent. + * + * @cached: The cached ordered extent. If not NULL, we can skip the tree + * search and use the ordered extent directly. + * Will be also used to store the finished ordered extent. + * @file_offset: File offset for the finished IO + * @io_size: Length of the finish IO range + * @uptodate: If the IO finishes without problem * - * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used - * to make sure this function only returns 1 once for a given ordered extent. + * Return true if the ordered extent is finished in the range, and update + * @cached. + * Return false otherwise. + * + * NOTE: The range can NOT cross multiple ordered extents. + * Thus caller should ensure the range doesn't cross ordered extents. 
*/ -int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size, int uptodate) +bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size, int uptodate) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; - int ret; + bool finished = false; spin_lock_irqsave(&tree->lock, flags); if (cached && *cached) { @@ -395,41 +423,39 @@ int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, } node = tree_search(tree, file_offset); - if (!node) { - ret = 1; + if (!node) goto out; - } entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); have_entry: - if (!offset_in_entry(entry, file_offset)) { - ret = 1; + if (!offset_in_entry(entry, file_offset)) goto out; - } - if (io_size > entry->bytes_left) { + if (io_size > entry->bytes_left) btrfs_crit(inode->root->fs_info, "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); - } + entry->bytes_left -= io_size; if (!uptodate) set_bit(BTRFS_ORDERED_IOERR, &entry->flags); if (entry->bytes_left == 0) { - ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Ensure only one caller can set the flag and finished_ret + * accordingly + */ + finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); /* test_and_set_bit implies a barrier */ cond_wake_up_nomb(&entry->wait); - } else { - ret = 1; } out: - if (!ret && cached && entry) { + if (finished && cached && entry) { *cached = entry; refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); - return ret == 0; + return finished; } /* @@ -480,9 +506,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes, false); - if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes, - fs_info->delalloc_batch); + percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, + fs_info->delalloc_batch); tree = &btrfs_inode->ordered_tree; spin_lock_irq(&tree->lock); @@ -745,9 +770,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, file_offset); if (!node) goto out; @@ -758,7 +784,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino if (entry) refcount_inc(&entry->refs); out: - spin_unlock_irq(&tree->lock); + spin_unlock_irqrestore(&tree->lock, flags); return entry; } @@ -898,6 +924,84 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, } } +static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, + u64 len) +{ + struct inode *inode = ordered->inode; + u64 file_offset = ordered->file_offset + pos; + u64 disk_bytenr = ordered->disk_bytenr + pos; + u64 num_bytes = len; + u64 disk_num_bytes = len; + int type; + unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT); + int compress_type = ordered->compress_type; + unsigned long weight; + int ret; + + weight = hweight_long(flags_masked); + WARN_ON_ONCE(weight > 1); + if (!weight) + type = 0; + else + type = __ffs(flags_masked); + + if 
(test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) { + WARN_ON_ONCE(1); + ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode), + file_offset, disk_bytenr, num_bytes, + disk_num_bytes, compress_type); + } else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset, + disk_bytenr, num_bytes, disk_num_bytes, type); + } else { + ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, + disk_bytenr, num_bytes, disk_num_bytes, type); + } + + return ret; +} + +int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, + u64 post) +{ + struct inode *inode = ordered->inode; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct rb_node *node; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int ret = 0; + + spin_lock_irq(&tree->lock); + /* Remove from tree once */ + node = &ordered->rb_node; + rb_erase(node, &tree->tree); + RB_CLEAR_NODE(node); + if (tree->last == node) + tree->last = NULL; + + ordered->file_offset += pre; + ordered->disk_bytenr += pre; + ordered->num_bytes -= (pre + post); + ordered->disk_num_bytes -= (pre + post); + ordered->bytes_left -= (pre + post); + + /* Re-insert the node */ + node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); + if (node) + btrfs_panic(fs_info, -EEXIST, + "zoned: inconsistency in ordered tree at offset %llu", + ordered->file_offset); + + spin_unlock_irq(&tree->lock); + + if (pre) + ret = clone_ordered_extent(ordered, 0, pre); + if (post) + ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, + post); + + return ret; +} + int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 0bfa82b58e23..99e0853e4d3b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -27,7 +27,7 @@ struct btrfs_ordered_sum { }; /* - * bits for the flags field: + * Bits for btrfs_ordered_extent::flags. * * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. * It is used to make sure metadata is inserted into the tree only once @@ -38,24 +38,36 @@ struct btrfs_ordered_sum { * IO is done and any metadata is inserted into the tree. */ enum { + /* + * Different types for direct io, one and only one of the 4 type can + * be set when creating ordered extent. + * + * REGULAR: For regular non-compressed COW write + * NOCOW: For NOCOW write into existing non-hole extent + * PREALLOC: For NOCOW write into preallocated extent + * COMPRESSED: For compressed COW write + */ + BTRFS_ORDERED_REGULAR, + BTRFS_ORDERED_NOCOW, + BTRFS_ORDERED_PREALLOC, + BTRFS_ORDERED_COMPRESSED, + + /* + * Extra bit for direct io, can only be set for + * REGULAR/NOCOW/PREALLOC. No direct io for compressed extent. 
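btrfs_split_ordered_extent() above keeps the middle piece covered by the bio and re-adds @pre bytes in front and @post bytes behind as fresh ordered extents. A worked standalone version of just the byte accounting, without the tree or locking; oe_stub and split_accounting are made-up names:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct oe_stub {
	uint64_t file_offset;
	uint64_t disk_bytenr;
	uint64_t num_bytes;	/* == disk_num_bytes for the non-compressed case */
};

/* Shrink @oe to the middle piece covered by the bio [start, start + len). */
static void split_accounting(struct oe_stub *oe, uint64_t start, uint64_t len,
			     uint64_t *pre, uint64_t *post)
{
	uint64_t oe_end = oe->disk_bytenr + oe->num_bytes;
	uint64_t end = start + len;

	assert(start >= oe->disk_bytenr && end <= oe_end);

	*pre = start - oe->disk_bytenr;
	*post = oe_end - end;

	oe->file_offset += *pre;
	oe->disk_bytenr += *pre;
	oe->num_bytes -= *pre + *post;
}

int main(void)
{
	/* 1 MiB ordered extent, bio covers 256K..768K of it. */
	struct oe_stub oe = {
		.file_offset = 0,
		.disk_bytenr = 16u << 20,
		.num_bytes = 1u << 20,
	};
	uint64_t pre, post;

	split_accounting(&oe, (16u << 20) + (256u << 10), 512u << 10, &pre, &post);
	printf("pre=%llu post=%llu middle: file_offset=%llu disk_bytenr=%llu len=%llu\n",
	       (unsigned long long)pre, (unsigned long long)post,
	       (unsigned long long)oe.file_offset,
	       (unsigned long long)oe.disk_bytenr,
	       (unsigned long long)oe.num_bytes);
	return 0;
}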
+ */ + BTRFS_ORDERED_DIRECT, + + /* Extra status bits for ordered extents */ + /* set when all the pages are written */ BTRFS_ORDERED_IO_DONE, /* set when removed from the tree */ BTRFS_ORDERED_COMPLETE, - /* set when we want to write in place */ - BTRFS_ORDERED_NOCOW, - /* writing a zlib compressed extent */ - BTRFS_ORDERED_COMPRESSED, - /* set when writing to preallocated extent */ - BTRFS_ORDERED_PREALLOC, - /* set when we're doing DIO with this extent */ - BTRFS_ORDERED_DIRECT, /* We had an io error when writing this out */ BTRFS_ORDERED_IOERR, /* Set when we have to truncate an extent */ BTRFS_ORDERED_TRUNCATED, - /* Regular IO for COW */ - BTRFS_ORDERED_REGULAR, /* Used during fsync to track already logged extents */ BTRFS_ORDERED_LOGGED, /* We have already logged all the csums of the ordered extent */ @@ -127,6 +139,14 @@ struct btrfs_ordered_extent { struct completion completion; struct btrfs_work flush_work; struct list_head work_list; + + /* + * Used to reverse-map physical address returned from ZONE_APPEND write + * command in a workqueue context + */ + u64 physical; + struct gendisk *disk; + u8 partno; }; /* @@ -152,11 +172,11 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); -int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size, int uptodate); -int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, +bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size, int uptodate); +bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **finished_ret, u64 *file_offset, u64 io_size, int uptodate); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, @@ -167,8 +187,7 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, u64 disk_num_bytes, int type); int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type, - int compress_type); + u64 disk_num_bytes, int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, @@ -190,6 +209,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); +int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, + u64 post); int __init ordered_data_init(void); void __cold ordered_data_exit(void); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 93fbf87bdc8d..5394641541f7 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -233,8 +233,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) } x = cmpxchg(&info->stripe_hash_table, NULL, table); - if (x) - kvfree(x); + kvfree(x); return 0; } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 4b9b6c52a83b..2b490becbe67 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -495,14 +495,15 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, } static int process_leaf(struct btrfs_root *root, - 
struct btrfs_path *path, u64 *bytenr, u64 *num_bytes) + struct btrfs_path *path, u64 *bytenr, u64 *num_bytes, + int *tree_block_level) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u32 count; - int i = 0, tree_block_level = 0, ret = 0; + int i = 0, ret = 0; struct btrfs_key key; int nritems = btrfs_header_nritems(leaf); @@ -515,15 +516,15 @@ static int process_leaf(struct btrfs_root *root, case BTRFS_METADATA_ITEM_KEY: *bytenr = key.objectid; ret = process_extent_item(fs_info, path, &key, i, - &tree_block_level); + tree_block_level); break; case BTRFS_TREE_BLOCK_REF_KEY: ret = add_tree_block(fs_info, key.offset, 0, - key.objectid, tree_block_level); + key.objectid, *tree_block_level); break; case BTRFS_SHARED_BLOCK_REF_KEY: ret = add_tree_block(fs_info, 0, key.offset, - key.objectid, tree_block_level); + key.objectid, *tree_block_level); break; case BTRFS_EXTENT_DATA_REF_KEY: dref = btrfs_item_ptr(leaf, i, @@ -549,7 +550,8 @@ static int process_leaf(struct btrfs_root *root, /* Walk down to the leaf from the given level */ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, - int level, u64 *bytenr, u64 *num_bytes) + int level, u64 *bytenr, u64 *num_bytes, + int *tree_block_level) { struct extent_buffer *eb; int ret = 0; @@ -565,7 +567,8 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, path->slots[level-1] = 0; path->locks[level-1] = BTRFS_READ_LOCK; } else { - ret = process_leaf(root, path, bytenr, num_bytes); + ret = process_leaf(root, path, bytenr, num_bytes, + tree_block_level); if (ret) break; } @@ -666,18 +669,18 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; - u64 ref_root; - u64 owner; - u64 offset; + u64 ref_root = 0; + u64 owner = 0; + u64 offset = 0; if (!btrfs_test_opt(fs_info, REF_VERIFY)) return 0; if (generic_ref->type == BTRFS_REF_METADATA) { - ref_root = generic_ref->tree_ref.root; + if (!parent) + ref_root = generic_ref->tree_ref.root; owner = generic_ref->tree_ref.level; - offset = 0; - } else { + } else if (!parent) { ref_root = generic_ref->data_ref.ref_root; owner = generic_ref->data_ref.ino; offset = generic_ref->data_ref.offset; @@ -693,13 +696,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, goto out; } - if (parent) { - ref->parent = parent; - } else { - ref->root_objectid = ref_root; - ref->owner = owner; - ref->offset = offset; - } + ref->parent = parent; + ref->owner = owner; + ref->root_objectid = ref_root; + ref->offset = offset; ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1; memcpy(&ra->ref, ref, sizeof(struct ref_entry)); @@ -974,6 +974,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { struct btrfs_path *path; struct extent_buffer *eb; + int tree_block_level = 0; u64 bytenr = 0, num_bytes = 0; int ret, level; @@ -998,7 +999,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) * different leaf from the original extent item. 
*/ ret = walk_down_tree(fs_info->extent_root, path, level, - &bytenr, &num_bytes); + &bytenr, &num_bytes, &tree_block_level); if (ret) break; ret = walk_up_tree(path, &level); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index b03e7891394e..b24396cf2f99 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -81,7 +81,10 @@ static int copy_inline_to_page(struct btrfs_inode *inode, goto out_unlock; } - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; + clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, NULL); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index df63ef64c5c0..232d5da7b7be 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -97,6 +97,7 @@ struct tree_block { struct rb_node rb_node; u64 bytenr; }; /* Use rb_simple_node for search/insert */ + u64 owner; struct btrfs_key key; unsigned int level:8; unsigned int key_ready:1; @@ -668,9 +669,7 @@ static void __del_reloc_root(struct btrfs_root *root) RB_CLEAR_NODE(&node->rb_node); } spin_unlock(&rc->reloc_root_tree.lock); - if (!node) - return; - BUG_ON((struct btrfs_root *)node->data != root); + ASSERT(!node || (struct btrfs_root *)node->data == root); } /* @@ -2393,8 +2392,8 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, { struct extent_buffer *eb; - eb = read_tree_block(fs_info, block->bytenr, 0, block->key.offset, - block->level, NULL); + eb = read_tree_block(fs_info, block->bytenr, block->owner, + block->key.offset, block->level, NULL); if (IS_ERR(eb)) { return PTR_ERR(eb); } else if (!extent_buffer_uptodate(eb)) { @@ -2493,7 +2492,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Kick in readahead for tree blocks with missing keys */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) - btrfs_readahead_tree_block(fs_info, block->bytenr, 0, 0, + btrfs_readahead_tree_block(fs_info, block->bytenr, + block->owner, 0, block->level); } @@ -2553,6 +2553,31 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) return ret; + /* + * On a zoned filesystem, we cannot preallocate the file region. + * Instead, we dirty and fiemap_write the region. 
+ */ + if (btrfs_is_zoned(inode->root->fs_info)) { + struct btrfs_root *root = inode->root; + struct btrfs_trans_handle *trans; + + end = cluster->end - offset + 1; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + i_size_write(&inode->vfs_inode, end); + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + + return btrfs_end_transaction(trans); + } + inode_lock(&inode->vfs_inode); for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; @@ -2615,7 +2640,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, /* * Allow error injection to test balance cancellation */ -int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) +noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) { return atomic_read(&fs_info->balance_cancel_req) || fatal_signal_pending(current); @@ -2679,6 +2704,15 @@ static int relocate_file_extent_cluster(struct inode *inode, goto out; } } + ret = set_page_extent_mapped(page); + if (ret < 0) { + btrfs_delalloc_release_metadata(BTRFS_I(inode), + PAGE_SIZE, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + unlock_page(page); + put_page(page); + goto out; + } if (PageReadahead(page)) { page_cache_async_readahead(inode->i_mapping, @@ -2706,8 +2740,6 @@ static int relocate_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); - set_page_extent_mapped(page); - if (nr < cluster->nr && page_start + offset == cluster->boundary[nr]) { set_extent_bits(&BTRFS_I(inode)->io_tree, @@ -2749,6 +2781,8 @@ static int relocate_file_extent_cluster(struct inode *inode, } } WARN_ON(nr != cluster->nr); + if (btrfs_is_zoned(fs_info) && !ret) + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); out: kfree(ra); return ret; @@ -2801,21 +2835,58 @@ static int add_tree_block(struct reloc_control *rc, u32 item_size; int level = -1; u64 generation; + u64 owner = 0; eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); if (extent_key->type == BTRFS_METADATA_ITEM_KEY || item_size >= sizeof(*ei) + sizeof(*bi)) { + unsigned long ptr = 0, end; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + end = (unsigned long)ei + item_size; if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { bi = (struct btrfs_tree_block_info *)(ei + 1); level = btrfs_tree_block_level(eb, bi); + ptr = (unsigned long)(bi + 1); } else { level = (int)extent_key->offset; + ptr = (unsigned long)(ei + 1); } generation = btrfs_extent_generation(eb, ei); + + /* + * We're reading random blocks without knowing their owner ahead + * of time. This is ok most of the time, as all reloc roots and + * fs roots have the same lock type. However normal trees do + * not, and the only way to know ahead of time is to read the + * inline ref offset. We know it's an fs root if + * + * 1. There's more than one ref. + * 2. There's a SHARED_DATA_REF_KEY set. + * 3. FULL_BACKREF is set on the flags. + * + * Otherwise it's safe to assume that the ref offset == the + * owner of this block, so we can use that when calling + * read_tree_block. 
+ */ + if (btrfs_extent_refs(eb, ei) == 1 && + !(btrfs_extent_flags(eb, ei) & + BTRFS_BLOCK_FLAG_FULL_BACKREF) && + ptr < end) { + struct btrfs_extent_inline_ref *iref; + int type; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_get_extent_inline_ref_type(eb, iref, + BTRFS_REF_TYPE_BLOCK); + if (type == BTRFS_REF_TYPE_INVALID) + return -EINVAL; + if (type == BTRFS_TREE_BLOCK_REF_KEY) + owner = btrfs_extent_inline_ref_offset(eb, iref); + } } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { btrfs_print_v0_err(eb->fs_info); btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); @@ -2837,6 +2908,7 @@ static int add_tree_block(struct reloc_control *rc, block->key.offset = generation; block->level = level; block->key_ready = 0; + block->owner = owner; rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); if (rb_node) @@ -3389,8 +3461,12 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_inode_item *item; struct extent_buffer *leaf; + u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret; + if (btrfs_is_zoned(trans->fs_info)) + flags &= ~BTRFS_INODE_PREALLOC; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3405,8 +3481,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC); + btrfs_set_inode_flags(leaf, item, flags); btrfs_mark_buffer_dirty(leaf); out: btrfs_free_path(path); @@ -3434,7 +3509,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, return ERR_CAST(trans); } - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5f4f88a4d2c8..310fce00fcda 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -166,6 +166,7 @@ struct scrub_ctx { int pages_per_rd_bio; int is_dev_replace; + u64 write_pointer; struct scrub_bio *wr_curr_bio; struct mutex wr_lock; @@ -856,6 +857,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) have_csum = sblock_to_check->pagev[0]->have_csum; dev = sblock_to_check->pagev[0]->dev; + if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace) + return btrfs_repair_one_zone(fs_info, logical); + /* * We must use GFP_NOFS because the scrub task might be waiting for a * worker task executing this function and in turn a transaction commit @@ -1619,6 +1623,28 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, return scrub_add_page_to_wr_bio(sblock->sctx, spage); } +static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) +{ + int ret = 0; + u64 length; + + if (!btrfs_is_zoned(sctx->fs_info)) + return 0; + + if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) + return 0; + + if (sctx->write_pointer < physical) { + length = physical - sctx->write_pointer; + + ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, + sctx->write_pointer, length); + if (!ret) + sctx->write_pointer = physical; + } + return ret; +} + static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { @@ -1641,6 +1667,13 @@ again: if (sbio->page_count == 0) { struct bio *bio; + ret = fill_writer_pointer_gap(sctx, + spage->physical_for_dev_replace); + if (ret) { + mutex_unlock(&sctx->wr_lock); + return ret; + } + sbio->physical = 
spage->physical_for_dev_replace; sbio->logical = spage->logical; sbio->dev = sctx->wr_tgtdev; @@ -1702,6 +1735,9 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) * doubled the write performance on spinning disks when measured * with Linux 3.5 */ btrfsic_submit_bio(sbio->bio); + + if (btrfs_is_zoned(sctx->fs_info)) + sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE; } static void scrub_wr_bio_end_io(struct bio *bio) @@ -3025,6 +3061,46 @@ out: return ret < 0 ? ret : 0; } +static void sync_replace_for_zoned(struct scrub_ctx *sctx) +{ + if (!btrfs_is_zoned(sctx->fs_info)) + return; + + sctx->flush_all_writes = true; + scrub_submit(sctx); + mutex_lock(&sctx->wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_lock); + + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); +} + +static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, + u64 physical, u64 physical_end) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + int ret = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + + mutex_lock(&sctx->wr_lock); + if (sctx->write_pointer < physical_end) { + ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, + physical, + sctx->write_pointer); + if (ret) + btrfs_err(fs_info, + "zoned: failed to recover write pointer"); + } + mutex_unlock(&sctx->wr_lock); + btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); + + return ret; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, @@ -3165,6 +3241,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ blk_start_plug(&plug); + if (sctx->is_dev_replace && + btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { + mutex_lock(&sctx->wr_lock); + sctx->write_pointer = physical; + mutex_unlock(&sctx->wr_lock); + sctx->flush_all_writes = true; + } + /* * now find all extents for each stripe and scrub them */ @@ -3353,6 +3437,9 @@ again: if (ret) goto out; + if (sctx->is_dev_replace) + sync_replace_for_zoned(sctx); + if (extent_logical + extent_len < key.objectid + bytes) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { @@ -3420,6 +3507,17 @@ out: blk_finish_plug(&plug); btrfs_free_path(path); btrfs_free_path(ppath); + + if (sctx->is_dev_replace && ret >= 0) { + int ret2; + + ret2 = sync_write_pointer_for_zoned(sctx, base + offset, + map->stripes[num].physical, + physical_end); + if (ret2) + ret = ret2; + } + return ret < 0 ? 
ret : 0; } @@ -3475,6 +3573,25 @@ out: return ret; } +static int finish_extent_writes_for_zoned(struct btrfs_root *root, + struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_trans_handle *trans; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + btrfs_wait_block_group_reservations(cache); + btrfs_wait_nocow_writers(cache); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + return btrfs_commit_transaction(trans); +} + static noinline_for_stack int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 start, u64 end) @@ -3561,6 +3678,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache) goto skip; + if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { + spin_lock(&cache->lock); + if (!cache->to_copy) { + spin_unlock(&cache->lock); + ro_set = 0; + goto done; + } + spin_unlock(&cache->lock); + } + /* * Make sure that while we are scrubbing the corresponding block * group doesn't get its logical address and its device extents @@ -3619,6 +3746,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * group is not RO. */ ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); + if (!ret && sctx->is_dev_replace) { + ret = finish_extent_writes_for_zoned(root, cache); + if (ret) { + btrfs_dec_block_group_ro(cache); + scrub_pause_off(fs_info); + btrfs_put_block_group(cache); + break; + } + } + if (ret == 0) { ro_set = 1; } else if (ret == -ENOSPC && !sctx->is_dev_replace) { @@ -3692,6 +3829,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); + if (sctx->is_dev_replace && + !btrfs_finish_block_group_to_copy(dev_replace->srcdev, + cache, found_key.offset)) + ro_set = 0; + +done: down_write(&dev_replace->rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 78a35374d492..f87878274e9f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1191,9 +1191,6 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; - /* data offset in the file extent item */ - u64 data_offset; - /* Just to check for bugs in backref resolving */ int found_itself; }; @@ -1401,19 +1398,6 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx->cur_offset = data_offset; backref_ctx->found_itself = 0; backref_ctx->extent_len = num_bytes; - /* - * For non-compressed extents iterate_extent_inodes() gives us extent - * offsets that already take into account the data offset, but not for - * compressed extents, since the offset is logical and not relative to - * the physical extent locations. We must take this into account to - * avoid sending clone offsets that go beyond the source file's size, - * which would result in the clone ioctl failing with -EINVAL on the - * receiving end. - */ - if (compressed == BTRFS_COMPRESS_NONE) - backref_ctx->data_offset = 0; - else - backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. 
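The zoned scrub and dev-replace hunks above maintain a per-context write pointer and zero-fill any gap before the next copy, since a zoned target only accepts writes at the zone's write pointer. A minimal sketch of that invariant follows; the zero-fill callback and names are illustrative stand-ins, not the kernel API.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for issuing a zeroout to [start, start + len). */
static int zero_fill(uint64_t start, uint64_t len)
{
        printf("zeroout %llu..%llu\n",
               (unsigned long long)start, (unsigned long long)(start + len));
        return 0;
}

/*
 * On a zoned target every write must land exactly at the zone's write
 * pointer, so if the block to copy sits ahead of it, the gap is padded
 * with zeroes first and the pointer is advanced.
 */
static int fill_write_pointer_gap(uint64_t *write_pointer, uint64_t physical)
{
        int ret = 0;

        if (*write_pointer < physical) {
                ret = zero_fill(*write_pointer, physical - *write_pointer);
                if (!ret)
                        *write_pointer = physical;
        }
        return ret;
}

int main(void)
{
        uint64_t wp = 4096;

        fill_write_pointer_gap(&wp, 65536);     /* pads 4K..64K, wp becomes 64K */
        return 0;
}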
@@ -6607,10 +6591,9 @@ static int changed_cb(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, - void *ctx) + struct send_ctx *sctx) { int ret = 0; - struct send_ctx *sctx = ctx; if (result == BTRFS_COMPARE_TREE_SAME) { if (key->type == BTRFS_INODE_REF_KEY || @@ -6815,7 +6798,7 @@ static int tree_compare_item(struct btrfs_path *left_path, * If it detects a change, it aborts immediately. */ static int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, void *ctx) + struct btrfs_root *right_root, struct send_ctx *sctx) { struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; @@ -6967,7 +6950,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, - ctx); + sctx); if (ret < 0) goto out; } @@ -6978,7 +6961,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, - ctx); + sctx); if (ret < 0) goto out; } @@ -6992,7 +6975,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, - ctx); + sctx); if (ret < 0) goto out; advance_left = ADVANCE; @@ -7000,7 +6983,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, - ctx); + sctx); if (ret < 0) goto out; advance_right = ADVANCE; @@ -7015,7 +6998,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, else result = BTRFS_COMPARE_TREE_SAME; ret = changed_cb(left_path, right_path, - &left_key, result, ctx); + &left_key, result, sctx); if (ret < 0) goto out; advance_left = ADVANCE; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index e8347461c8dd..2da6177f4b0b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -140,6 +140,12 @@ * be freed, plus any delayed work we may not have gotten rid of in the case * of metadata. * + * FORCE_COMMIT_TRANS + * For use by the preemptive flusher. We use this to bypass the ticketing + * checks in may_commit_transaction, as we have more information about the + * overall state of the system and may want to commit the transaction ahead + * of actual ENOSPC conditions. + * * OVERCOMMIT * * Because we hold so many reservations for metadata we will allow you to @@ -163,6 +169,7 @@ u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, ASSERT(s_info); return s_info->bytes_used + s_info->bytes_reserved + s_info->bytes_pinned + s_info->bytes_readonly + + s_info->bytes_zone_unusable + (may_use_included ? 
s_info->bytes_may_use : 0); } @@ -206,6 +213,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->ro_bgs); INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); + space_info->clamp = 1; ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) @@ -257,7 +265,7 @@ out: void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, + u64 bytes_readonly, u64 bytes_zone_unusable, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; @@ -273,6 +281,7 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; found->bytes_readonly += bytes_readonly; + found->bytes_zone_unusable += bytes_zone_unusable; if (total_bytes > 0) found->full = 0; btrfs_try_granting_tickets(info, found); @@ -422,10 +431,10 @@ static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, info->total_bytes - btrfs_space_info_used(info, true), info->full ? "" : "not "); btrfs_info(fs_info, - "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", + "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, - info->bytes_readonly); + info->bytes_readonly, info->bytes_zone_unusable); DUMP_BLOCK_RSV(fs_info, global_block_rsv); DUMP_BLOCK_RSV(fs_info, trans_block_rsv); @@ -454,9 +463,10 @@ again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); btrfs_info(fs_info, - "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", + "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s", cache->start, cache->length, cache->used, cache->pinned, - cache->reserved, cache->ro ? "[readonly]" : ""); + cache->reserved, cache->zone_unusable, + cache->ro ? "[readonly]" : ""); spin_unlock(&cache->lock); btrfs_dump_free_space(cache, bytes); } @@ -489,7 +499,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, { struct btrfs_trans_handle *trans; u64 delalloc_bytes; - u64 dio_bytes; + u64 ordered_bytes; u64 items; long time_left; int loops; @@ -513,26 +523,22 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); - if (delalloc_bytes == 0 && dio_bytes == 0) { - if (trans) - return; - if (wait_ordered) - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); + if (delalloc_bytes == 0 && ordered_bytes == 0) return; - } /* * If we are doing more ordered than delalloc we need to just wait on * ordered extents, otherwise we'll waste time trying to flush delalloc * that likely won't give us the space back we need. 
*/ - if (dio_bytes > delalloc_bytes) + if (ordered_bytes > delalloc_bytes) wait_ordered = true; loops = 0; - while ((delalloc_bytes || dio_bytes) && loops < 3) { - u64 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + while ((delalloc_bytes || ordered_bytes) && loops < 3) { + u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + long nr_pages = min_t(u64, temp, LONG_MAX); btrfs_start_delalloc_roots(fs_info, nr_pages, true); @@ -555,15 +561,16 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + ordered_bytes = percpu_counter_sum_positive( + &fs_info->ordered_bytes); } } /** - * maybe_commit_transaction - possibly commit the transaction if its ok to - * @root - the root we're allocating for - * @bytes - the number of bytes we want to reserve - * @force - force the commit + * Possibly commit the transaction if its ok to + * + * @fs_info: the filesystem + * @space_info: space_info we are checking for commit, either data or metadata * * This will check to make sure that committing the transaction will actually * get us somewhere and then commit the transaction if it does. Otherwise it @@ -669,7 +676,7 @@ enospc: */ static void flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, - int state) + enum btrfs_flush_state state, bool for_preempt) { struct btrfs_root *root = fs_info->extent_root; struct btrfs_trans_handle *trans; @@ -738,13 +745,21 @@ static void flush_space(struct btrfs_fs_info *fs_info, case COMMIT_TRANS: ret = may_commit_transaction(fs_info, space_info); break; + case FORCE_COMMIT_TRANS: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_commit_transaction(trans); + break; default: ret = -ENOSPC; break; } trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, - ret); + ret, for_preempt); return; } @@ -754,7 +769,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, { u64 used; u64 avail; - u64 expected; u64 to_reclaim = space_info->reclaim_size; lockdep_assert_held(&space_info->lock); @@ -772,43 +786,88 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, if (space_info->total_bytes + avail < used) to_reclaim += used - (space_info->total_bytes + avail); - if (to_reclaim) - return to_reclaim; - - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) - return 0; - - used = btrfs_space_info_used(space_info, true); - - if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL)) - expected = div_factor_fine(space_info->total_bytes, 95); - else - expected = div_factor_fine(space_info->total_bytes, 90); - - if (used > expected) - to_reclaim = used - expected; - else - to_reclaim = 0; - to_reclaim = min(to_reclaim, space_info->bytes_may_use + - space_info->bytes_reserved); return to_reclaim; } -static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 used) +static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) { + u64 ordered, delalloc; u64 thresh = div_factor_fine(space_info->total_bytes, 98); + u64 used; /* If we're just plain full then async reclaim just slows us down. 
*/ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) - return 0; + return false; - if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info)) - return 0; + /* + * We have tickets queued, bail so we don't compete with the async + * flushers. + */ + if (space_info->reclaim_size) + return false; + + /* + * If we have over half of the free space occupied by reservations or + * pinned then we want to start flushing. + * + * We do not do the traditional thing here, which is to say + * + * if (used >= ((total_bytes + avail) / 2)) + * return 1; + * + * because this doesn't quite work how we want. If we had more than 50% + * of the space_info used by bytes_used and we had 0 available we'd just + * constantly run the background flusher. Instead we want it to kick in + * if our reclaimable space exceeds our clamped free space. + * + * Our clamping range is 2^1 -> 2^8. Practically speaking that means + * the following: + * + * Amount of RAM Minimum threshold Maximum threshold + * + * 256GiB 1GiB 128GiB + * 128GiB 512MiB 64GiB + * 64GiB 256MiB 32GiB + * 32GiB 128MiB 16GiB + * 16GiB 64MiB 8GiB + * + * These are the range our thresholds will fall in, corresponding to how + * much delalloc we need for the background flusher to kick in. + */ + + thresh = calc_available_free_space(fs_info, space_info, + BTRFS_RESERVE_FLUSH_ALL); + thresh += (space_info->total_bytes - space_info->bytes_used - + space_info->bytes_reserved - space_info->bytes_readonly); + thresh >>= space_info->clamp; + + used = space_info->bytes_pinned; + + /* + * If we have more ordered bytes than delalloc bytes then we're either + * doing a lot of DIO, or we simply don't have a lot of delalloc waiting + * around. Preemptive flushing is only useful in that it can free up + * space before tickets need to wait for things to finish. In the case + * of ordered extents, preemptively waiting on ordered extents gets us + * nothing, if our reservations are tied up in ordered extents we'll + * simply have to slow down writers by forcing them to wait on ordered + * extents. + * + * In the case that ordered is larger than delalloc, only include the + * block reserves that we would actually be able to directly reclaim + * from. In this case if we're heavy on metadata operations this will + * clearly be heavy enough to warrant preemptive flushing. In the case + * of heavy DIO or ordered reservations, preemptive flushing will just + * waste time and cause us to slow down. 
+ */ + ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); + delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + if (ordered >= delalloc) + used += fs_info->delayed_refs_rsv.reserved + + fs_info->delayed_block_rsv.reserved; + else + used += space_info->bytes_may_use; return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); @@ -922,7 +981,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; - int flush_state; + enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; @@ -941,7 +1000,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) flush_state = FLUSH_DELAYED_ITEMS_NR; do { - flush_space(fs_info, space_info, to_reclaim, flush_state); + flush_space(fs_info, space_info, to_reclaim, flush_state, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -990,6 +1049,105 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } /* + * This handles pre-flushing of metadata space before we get to the point that + * we need to start blocking threads on tickets. The logic here is different + * from the other flush paths because it doesn't rely on tickets to tell us how + * much we need to flush, instead it attempts to keep us below the 80% full + * watermark of space by flushing whichever reservation pool is currently the + * largest. + */ +static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv; + struct btrfs_block_rsv *trans_rsv; + int loops = 0; + + fs_info = container_of(work, struct btrfs_fs_info, + preempt_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + delayed_block_rsv = &fs_info->delayed_block_rsv; + delayed_refs_rsv = &fs_info->delayed_refs_rsv; + global_rsv = &fs_info->global_block_rsv; + trans_rsv = &fs_info->trans_block_rsv; + + spin_lock(&space_info->lock); + while (need_preemptive_reclaim(fs_info, space_info)) { + enum btrfs_flush_state flush; + u64 delalloc_size = 0; + u64 to_reclaim, block_rsv_size; + u64 global_rsv_size = global_rsv->reserved; + + loops++; + + /* + * We don't have a precise counter for the metadata being + * reserved for delalloc, so we'll approximate it by subtracting + * out the block rsv's space from the bytes_may_use. If that + * amount is higher than the individual reserves, then we can + * assume it's tied up in delalloc reservations. + */ + block_rsv_size = global_rsv_size + + delayed_block_rsv->reserved + + delayed_refs_rsv->reserved + + trans_rsv->reserved; + if (block_rsv_size < space_info->bytes_may_use) + delalloc_size = space_info->bytes_may_use - block_rsv_size; + spin_unlock(&space_info->lock); + + /* + * We don't want to include the global_rsv in our calculation, + * because that's space we can't touch. Subtract it from the + * block_rsv_size for the next checks. + */ + block_rsv_size -= global_rsv_size; + + /* + * We really want to avoid flushing delalloc too much, as it + * could result in poor allocation patterns, so only flush it if + * it's larger than the rest of the pools combined. 
+ */ + if (delalloc_size > block_rsv_size) { + to_reclaim = delalloc_size; + flush = FLUSH_DELALLOC; + } else if (space_info->bytes_pinned > + (delayed_block_rsv->reserved + + delayed_refs_rsv->reserved)) { + to_reclaim = space_info->bytes_pinned; + flush = FORCE_COMMIT_TRANS; + } else if (delayed_block_rsv->reserved > + delayed_refs_rsv->reserved) { + to_reclaim = delayed_block_rsv->reserved; + flush = FLUSH_DELAYED_ITEMS_NR; + } else { + to_reclaim = delayed_refs_rsv->reserved; + flush = FLUSH_DELAYED_REFS_NR; + } + + /* + * We don't want to reclaim everything, just a portion, so scale + * down the to_reclaim by 1/4. If it takes us down to 0, + * reclaim 1 items worth. + */ + to_reclaim >>= 2; + if (!to_reclaim) + to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1); + flush_space(fs_info, space_info, to_reclaim, flush, true); + cond_resched(); + spin_lock(&space_info->lock); + } + + /* We only went through once, back off our clamping. */ + if (loops == 1 && !space_info->reclaim_size) + space_info->clamp = max(1, space_info->clamp - 1); + trace_btrfs_done_preemptive_reclaim(fs_info, space_info); + spin_unlock(&space_info->lock); +} + +/* * FLUSH_DELALLOC_WAIT: * Space is freed from flushing delalloc in one of two ways. * @@ -1054,7 +1212,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 last_tickets_id; - int flush_state = 0; + enum btrfs_flush_state flush_state = 0; fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); space_info = fs_info->data_sinfo; @@ -1069,7 +1227,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) spin_unlock(&space_info->lock); while (!space_info->full) { - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1082,7 +1240,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) while (flush_state < ARRAY_SIZE(data_flush_states)) { flush_space(fs_info, space_info, U64_MAX, - data_flush_states[flush_state]); + data_flush_states[flush_state], false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1115,6 +1273,8 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) { INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space); + INIT_WORK(&fs_info->preempt_reclaim_work, + btrfs_preempt_reclaim_metadata_space); } static const enum btrfs_flush_state priority_flush_states[] = { @@ -1153,7 +1313,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, flush_state = 0; do { - flush_space(fs_info, space_info, to_reclaim, states[flush_state]); + flush_space(fs_info, space_info, to_reclaim, states[flush_state], + false); flush_state++; spin_lock(&space_info->lock); if (ticket->bytes == 0) { @@ -1169,7 +1330,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, struct reserve_ticket *ticket) { while (!space_info->full) { - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (ticket->bytes == 0) { spin_unlock(&space_info->lock); @@ -1214,11 +1375,14 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, } /** - * 
handle_reserve_ticket - do the appropriate flushing and waiting for a ticket - * @fs_info - the fs - * @space_info - the space_info for the reservation - * @ticket - the ticket for the reservation - * @flush - how much we can flush + * Do the appropriate flushing and waiting for a ticket + * + * @fs_info: the filesystem + * @space_info: space info for the reservation + * @ticket: ticket for the reservation + * @start_ns: timestamp when the reservation started + * @orig_bytes: amount of bytes originally reserved + * @flush: how much we can flush * * This does the work of figuring out how to flush for the ticket, waiting for * the reservation, and returning the appropriate error if there is one. @@ -1226,6 +1390,7 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket, + u64 start_ns, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { int ret; @@ -1281,6 +1446,8 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, * space wasn't reserved at all). */ ASSERT(!(ticket->bytes == 0 && ticket->error)); + trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes, + start_ns, flush, ticket->error); return ret; } @@ -1294,12 +1461,31 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); } +static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); + u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + + /* + * If we're heavy on ordered operations then clamping won't help us. We + * need to clamp specifically to keep up with dirty'ing buffered + * writers, because there's not a 1:1 correlation of writing delalloc + * and freeing space, like there is with flushing delayed refs or + * delayed nodes. If we're already more ordered than delalloc then + * we're keeping up, otherwise we aren't and should probably clamp. + */ + if (ordered < delalloc) + space_info->clamp = min(space_info->clamp + 1, 8); +} + /** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @space_info - the space info we want to allocate from - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation + * Try to reserve bytes from the block_rsv's space + * + * @fs_info: the filesystem + * @space_info: space info we want to allocate from + * @orig_bytes: number of bytes we want + * @flush: whether or not we can flush to make our reservation * * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. 
If there is not enough space it will make an attempt to @@ -1314,6 +1500,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, { struct work_struct *async_work; struct reserve_ticket ticket; + u64 start_ns = 0; u64 used; int ret = 0; bool pending_tickets; @@ -1366,6 +1553,9 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); + if (trace_btrfs_reserve_ticket_enabled()) + start_ns = ktime_get_ns(); + if (flush == BTRFS_RESERVE_FLUSH_ALL || flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || flush == BTRFS_RESERVE_FLUSH_DATA) { @@ -1382,6 +1572,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, list_add_tail(&ticket.list, &space_info->priority_tickets); } + + /* + * We were forced to add a reserve ticket, so our preemptive + * flushing is unable to keep up. Clamp down on the threshold + * for the preemptive flushing in order to keep up with the + * workload. + */ + maybe_clamp_preempt(fs_info, space_info); } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; /* @@ -1390,27 +1588,29 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(fs_info, space_info, used) && - !work_busy(&fs_info->async_reclaim_work)) { + need_preemptive_reclaim(fs_info, space_info) && + !work_busy(&fs_info->preempt_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); + &fs_info->preempt_reclaim_work); } } spin_unlock(&space_info->lock); if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) return ret; - return handle_reserve_ticket(fs_info, space_info, &ticket, flush); + return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, + orig_bytes, flush); } /** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation + * Try to reserve metadata bytes from the block_rsv's space + * + * @root: the root we're allocating for + * @block_rsv: block_rsv we're allocating for + * @orig_bytes: number of bytes we want + * @flush: whether or not we can flush to make our reservation + * * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -1448,10 +1648,11 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, } /** - * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation - * @fs_info - the filesystem - * @bytes - the number of bytes we need - * @flush - how we are allowed to flush + * Try to reserve data bytes for an allocation + * + * @fs_info: the filesystem + * @bytes: number of bytes we need + * @flush: how we are allowed to flush * * This will reserve bytes from the data space info. If there is not enough * space then we will attempt to flush space as specified by flush.
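To make the preemptive flushing threshold above concrete, here is a small self-contained model: the free space the flusher tolerates is shifted right by the clamp value, the clamp is raised (up to 8) whenever a reservation ticket has to be queued, and it decays after a preemptive pass that finishes in one loop. The struct, helper names and numbers are illustrative only, not taken from the kernel.

#include <stdint.h>
#include <stdio.h>

/* Simplified model of the preemptive-flush threshold and clamp. */
struct space_model {
        uint64_t avail;         /* free/overcommittable bytes */
        int clamp;              /* 1..8 */
};

static uint64_t preempt_threshold(const struct space_model *s)
{
        return s->avail >> s->clamp;
}

/* A reserve ticket had to be queued: flush earlier next time. */
static void ticket_queued(struct space_model *s)
{
        if (s->clamp < 8)
                s->clamp++;
}

/* One preemptive pass was enough: relax the threshold again. */
static void preempt_pass_done(struct space_model *s)
{
        if (s->clamp > 1)
                s->clamp--;
}

int main(void)
{
        struct space_model s = { .avail = 64ULL << 30, .clamp = 1 };

        printf("threshold %llu MiB\n",
               (unsigned long long)(preempt_threshold(&s) >> 20));  /* 32768 */
        ticket_queued(&s);
        printf("threshold %llu MiB\n",
               (unsigned long long)(preempt_threshold(&s) >> 20));  /* 16384 */
        preempt_pass_done(&s);
        return 0;
}

With 64GiB of available space this reproduces the 32GiB maximum (clamp 1) and 256MiB minimum (clamp 8) bounds from the table in the need_preemptive_reclaim() comment.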
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 5646393b928c..b1a8ffb03b3e 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -17,11 +17,17 @@ struct btrfs_space_info { u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + u64 bytes_zone_unusable; /* total bytes that are unusable until + resetting the device zone */ u64 max_extent_size; /* This will hold the maximum extent size of the space info if we had an ENOSPC in the allocator. */ + int clamp; /* Used to scale our threshold for preemptive + flushing. The value is >> clamp, so turns + out to be a 2^clamp divisor. */ + unsigned int full:1; /* indicates that we cannot allocate any more chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ @@ -119,7 +125,7 @@ DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, + u64 bytes_readonly, u64 bytes_zone_unusable, struct btrfs_space_info **space_info); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); @@ -152,4 +158,21 @@ static inline void btrfs_space_info_free_bytes_may_use( int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); +static inline void __btrfs_mod_total_bytes_pinned( + struct btrfs_space_info *space_info, + s64 mod) +{ + percpu_counter_add_batch(&space_info->total_bytes_pinned, mod, + BTRFS_TOTAL_BYTES_PINNED_BATCH); +} + +static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info, + u64 flags, s64 mod) +{ + struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags); + + ASSERT(space_info); + __btrfs_mod_total_bytes_pinned(space_info, mod); +} + #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c new file mode 100644 index 000000000000..c69049e7daa9 --- /dev/null +++ b/fs/btrfs/subpage.c @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/slab.h> +#include "ctree.h" +#include "subpage.h" + +int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page, enum btrfs_subpage_type type) +{ + struct btrfs_subpage *subpage = NULL; + int ret; + + /* + * We have cases like a dummy extent buffer page, which is not mappped + * and doesn't need to be locked. 
+ */ + if (page->mapping) + ASSERT(PageLocked(page)); + /* Either not subpage, or the page already has private attached */ + if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) + return 0; + + ret = btrfs_alloc_subpage(fs_info, &subpage, type); + if (ret < 0) + return ret; + attach_page_private(page, subpage); + return 0; +} + +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + /* Either not subpage, or already detached */ + if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page)) + return; + + subpage = (struct btrfs_subpage *)detach_page_private(page); + ASSERT(subpage); + btrfs_free_subpage(subpage); +} + +int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + struct btrfs_subpage **ret, + enum btrfs_subpage_type type) +{ + if (fs_info->sectorsize == PAGE_SIZE) + return 0; + + *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS); + if (!*ret) + return -ENOMEM; + spin_lock_init(&(*ret)->lock); + if (type == BTRFS_SUBPAGE_METADATA) + atomic_set(&(*ret)->eb_refs, 0); + else + atomic_set(&(*ret)->readers, 0); + return 0; +} + +void btrfs_free_subpage(struct btrfs_subpage *subpage) +{ + kfree(subpage); +} + +/* + * Increase the eb_refs of current subpage. + * + * This is important for eb allocation, to prevent race with last eb freeing + * of the same page. + * With the eb_refs increased before the eb inserted into radix tree, + * detach_extent_buffer_page() won't detach the page private while we're still + * allocating the extent buffer. + */ +void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page) && page->mapping); + lockdep_assert_held(&page->mapping->private_lock); + + subpage = (struct btrfs_subpage *)page->private; + atomic_inc(&subpage->eb_refs); +} + +void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page) && page->mapping); + lockdep_assert_held(&page->mapping->private_lock); + + subpage = (struct btrfs_subpage *)page->private; + ASSERT(atomic_read(&subpage->eb_refs)); + atomic_dec(&subpage->eb_refs); +} + +static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + /* Basic checks */ + ASSERT(PagePrivate(page) && page->private); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); + /* + * The range check only works for mapped page, we can still have + * unmapped page like dummy extent buffer pages. 
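The subpage code tracks per-sector state of a page in a u16 bitmap, one bit per sector. A standalone sketch of the range-to-bitmap conversion performed by the helper added just below (btrfs_subpage_calc_bitmap), reproducing its 16K/16K -> 0x00f0 example; the constants assume the 64K page, 4K sector case and are illustrative only.

#include <assert.h>
#include <stdint.h>

#define SECTORSIZE_BITS 12      /* 4K sectors */
#define PAGE_SZ         65536   /* 64K page, 16 sectors fit in a u16 */

/*
 * Bit N covers the Nth 4K sector of the 64K page. The shift is done in
 * unsigned long so that setting all 16 bits does not overflow the
 * intermediate value before it is truncated to u16.
 */
static uint16_t calc_bitmap(uint64_t page_start, uint64_t start, uint32_t len)
{
        const int bit_start = (start - page_start) >> SECTORSIZE_BITS;
        const int nbits = len >> SECTORSIZE_BITS;

        return (uint16_t)(((1UL << nbits) - 1) << bit_start);
}

int main(void)
{
        /* 16K..32K of the page -> sectors 4..7 -> 0x00f0 */
        assert(calc_bitmap(0, 16 * 1024, 16 * 1024) == 0x00f0);
        /* whole page -> all 16 bits set */
        assert(calc_bitmap(0, 0, PAGE_SZ) == 0xffff);
        return 0;
}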
+ */ + if (page->mapping) + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); +} + +void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = len >> fs_info->sectorsize_bits; + int ret; + + btrfs_subpage_assert(fs_info, page, start, len); + + ret = atomic_add_return(nbits, &subpage->readers); + ASSERT(ret == nbits); +} + +void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = len >> fs_info->sectorsize_bits; + + btrfs_subpage_assert(fs_info, page, start, len); + ASSERT(atomic_read(&subpage->readers) >= nbits); + if (atomic_sub_and_test(nbits, &subpage->readers)) + unlock_page(page); +} + +/* + * Convert the [start, start + len) range into a u16 bitmap + * + * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0. + */ +static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits; + const int nbits = len >> fs_info->sectorsize_bits; + + btrfs_subpage_assert(fs_info, page, start, len); + + /* + * Here nbits can be 16, thus can go beyond u16 range. We make the + * first left shift to be calculate in unsigned long (at least u32), + * then truncate the result to u16. + */ + return (u16)(((1UL << nbits) - 1) << bit_start); +} + +void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->uptodate_bitmap |= tmp; + if (subpage->uptodate_bitmap == U16_MAX) + SetPageUptodate(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->uptodate_bitmap &= ~tmp; + ClearPageUptodate(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->error_bitmap |= tmp; + SetPageError(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->error_bitmap &= ~tmp; + if (subpage->error_bitmap == 0) + ClearPageError(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +/* + * Unlike set/clear which is dependent on each page status, for test all bits + * are 
tested in the same way. + */ +#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \ +bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \ + unsigned long flags; \ + bool ret; \ + \ + spin_lock_irqsave(&subpage->lock, flags); \ + ret = ((subpage->name##_bitmap & tmp) == tmp); \ + spin_unlock_irqrestore(&subpage->lock, flags); \ + return ret; \ +} +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); + +/* + * Note that, in selftests (extent-io-tests), we can have empty fs_info passed + * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall + * back to regular sectorsize branch. + */ +#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \ + test_page_func) \ +void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + set_page_func(page); \ + return; \ + } \ + btrfs_subpage_set_##name(fs_info, page, start, len); \ +} \ +void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + clear_page_func(page); \ + return; \ + } \ + btrfs_subpage_clear_##name(fs_info, page, start, len); \ +} \ +bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + return test_page_func(page); \ + return btrfs_subpage_test_##name(fs_info, page, start, len); \ +} +IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, + PageUptodate); +IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h new file mode 100644 index 000000000000..b86a4881475d --- /dev/null +++ b/fs/btrfs/subpage.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SUBPAGE_H +#define BTRFS_SUBPAGE_H + +#include <linux/spinlock.h> + +/* + * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap + * is sufficient. Regular bitmap_* is not used due to size reasons. + */ +#define BTRFS_SUBPAGE_BITMAP_SIZE 16 + +/* + * Structure to trace status of each sector inside a page, attached to + * page::private for both data and metadata inodes. + */ +struct btrfs_subpage { + /* Common members for both data and metadata pages */ + spinlock_t lock; + u16 uptodate_bitmap; + u16 error_bitmap; + union { + /* + * Structures only used by metadata + * + * @eb_refs should only be operated under private_lock, as it + * manages whether the subpage can be detached. 
+ */ + atomic_t eb_refs; + /* Structures only used by data */ + struct { + atomic_t readers; + }; + }; +}; + +enum btrfs_subpage_type { + BTRFS_SUBPAGE_METADATA, + BTRFS_SUBPAGE_DATA, +}; + +int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page, enum btrfs_subpage_type type); +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page); + +/* Allocate additional data where page represents more than one sector */ +int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + struct btrfs_subpage **ret, + enum btrfs_subpage_type type); +void btrfs_free_subpage(struct btrfs_subpage *subpage); + +void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page); +void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page); + +void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); +void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); + +/* + * Template for subpage related operations. + * + * btrfs_subpage_*() are for call sites where the page has subpage attached and + * the range is ensured to be inside the page. + * + * btrfs_page_*() are for call sites where the page can either be subpage + * specific or regular page. The function will handle both cases. + * But the range still needs to be inside the page. + */ +#define DECLARE_BTRFS_SUBPAGE_OPS(name) \ +void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); + +DECLARE_BTRFS_SUBPAGE_OPS(uptodate); +DECLARE_BTRFS_SUBPAGE_OPS(error); + +#endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 12d7d3be7cd4..f8435641b912 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,7 +48,6 @@ #include "tests/btrfs-tests.h" #include "block-group.h" #include "discard.h" - #include "qgroup.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -2028,6 +2027,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = -EINVAL; goto restore; } + if (fs_info->sectorsize < PAGE_SIZE) { + btrfs_warn(fs_info, + "read-write mount is not yet allowed for sectorsize %u page size %lu", + fs_info->sectorsize, PAGE_SIZE); + ret = -EINVAL; + goto restore; + } /* * NOTE: when remounting with a change that does writes, don't diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 19b9fffa2c9c..6eb1c50fa98c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -666,6 +666,7 @@ SPACE_INFO_ATTR(bytes_pinned); SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); SPACE_INFO_ATTR(bytes_readonly); +SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); BTRFS_ATTR(space_info, total_bytes_pinned, @@ -679,6 +680,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, bytes_reserved), BTRFS_ATTR_PTR(space_info, bytes_may_use), 
BTRFS_ATTR_PTR(space_info, bytes_readonly), + BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, total_bytes_pinned), diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 57379e96ccc9..c0aefe6dee0b 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -507,7 +507,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, goto out_free; } - ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), + ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1), &logical, &out_ndaddrs, &out_stripe_len); if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6af7f2bf92de..acff6bb49a97 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -21,6 +21,7 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -107,6 +108,11 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { __TRANS_JOIN | __TRANS_JOIN_NOLOCK | __TRANS_JOIN_NOSTART), + [TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK | + __TRANS_JOIN_NOSTART), [TRANS_STATE_COMPLETED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | @@ -375,6 +381,8 @@ loop: spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); + INIT_LIST_HEAD(&cur_trans->releasing_ebs); + spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); @@ -826,10 +834,11 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) return trans; } -/* wait for a transaction commit to be fully complete */ -static noinline void wait_for_commit(struct btrfs_transaction *commit) +/* Wait for a transaction commit to reach at least the given state. 
*/ +static noinline void wait_for_commit(struct btrfs_transaction *commit, + const enum btrfs_trans_state min_state) { - wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED); + wait_event(commit->commit_wait, commit->state >= min_state); } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) @@ -884,7 +893,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) goto out; /* nothing committing|committed */ } - wait_for_commit(cur_trans); + wait_for_commit(cur_trans, TRANS_STATE_COMPLETED); btrfs_put_transaction(cur_trans); out: return ret; @@ -909,9 +918,8 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; - smp_mb(); if (cur_trans->state >= TRANS_STATE_COMMIT_START || - cur_trans->delayed_refs.flushing) + test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) return true; return should_end_transaction(trans); @@ -1230,10 +1238,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - return ret; - ret = btrfs_run_dev_stats(trans); if (ret) return ret; @@ -1248,10 +1252,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) if (ret) return ret; - /* run_qgroups might have added some more refs */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - return ret; again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { struct btrfs_root *root; @@ -1266,15 +1266,24 @@ again: ret = update_cowonly_root(trans, root); if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - return ret; } + /* Now flush any delayed refs generated by updating all of the roots */ + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) + return ret; + while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { ret = btrfs_write_dirty_block_groups(trans); if (ret) return ret; + + /* + * We're writing the dirty block groups, which could generate + * delayed refs, which could generate more dirty block groups, + * so we want to keep this flushing in this loop to make sure + * everything gets run. + */ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; @@ -1319,7 +1328,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) struct btrfs_root *gang[8]; int i; int ret; - int err = 0; spin_lock(&fs_info->fs_roots_radix_lock); while (1) { @@ -1331,6 +1339,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) break; for (i = 0; i < ret; i++) { struct btrfs_root *root = gang[i]; + int ret2; + radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); @@ -1350,17 +1360,17 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) root->node); } - err = btrfs_update_root(trans, fs_info->tree_root, + ret2 = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); + if (ret2) + return ret2; spin_lock(&fs_info->fs_roots_radix_lock); - if (err) - break; btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); - return err; + return 0; } /* @@ -1433,6 +1443,23 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, record_root_in_trans(trans, src, 1); /* + * btrfs_qgroup_inherit relies on a consistent view of the usage for the + * src root, so we must run the delayed refs here. 
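The reworked wait_for_commit() above relies on two properties worth spelling out: the btrfs_trans_state values must stay in chronological order so that "state >= min_state" means "the commit has progressed at least this far", and every state transition must wake commit_wait. A minimal sketch of the producer side (illustrative, hypothetical helper name):

static void example_advance_commit_state(struct btrfs_transaction *commit,
                                         enum btrfs_trans_state new_state)
{
        /* e.g. TRANS_STATE_SUPER_COMMITTED once the super blocks are on disk */
        commit->state = new_state;
        /* Wake every wait_event(commit->commit_wait, state >= min_state) */
        wake_up(&commit->commit_wait);
}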
+ * + * However this isn't particularly fool proof, because there's no + * synchronization keeping us from changing the tree after this point + * before we do the qgroup_inherit, or even from making changes while + * we're doing the qgroup_inherit. But that's a problem for the future, + * for now flush the delayed refs to narrow the race window where the + * qgroup counters could end up wrong. + */ + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + /* * We are going to commit transaction, see btrfs_commit_transaction() * comment for reason locking tree_log_mutex */ @@ -1525,7 +1552,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ASSERT(pending->root_item); new_root_item = pending->root_item; - pending->error = btrfs_find_free_objectid(tree_root, &objectid); + pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) goto no_free_objectid; @@ -1685,12 +1712,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - /* * Do special qgroup accounting for snapshot, as we do some qgroup * snapshot hack to do fast snapshot. @@ -1738,12 +1759,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } } - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - fail: pending->error = ret; dir_item_existed: @@ -2042,32 +2057,25 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; - /* make a pass through all the delayed refs we have so far - * any runnings procs may add more while we are here - */ - ret = btrfs_run_delayed_refs(trans, 0); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } - - cur_trans = trans->transaction; - /* - * set the flushing flag so procs in this transaction have to - * start sending their work down. + * We only want one transaction commit doing the flushing so we do not + * waste a bunch of time on lock contention on the extent root node. */ - cur_trans->delayed_refs.flushing = 1; - smp_wmb(); + if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING, + &cur_trans->delayed_refs.flags)) { + /* + * Make a pass through all the delayed refs we have so far. + * Any running threads may add more while we are here. 
+ */ + ret = btrfs_run_delayed_refs(trans, 0); + if (ret) { + btrfs_end_transaction(trans); + return ret; + } + } btrfs_create_pending_block_groups(trans); - ret = btrfs_run_delayed_refs(trans, 0); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } - if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { int run_it = 0; @@ -2101,11 +2109,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) spin_lock(&fs_info->trans_lock); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + spin_unlock(&fs_info->trans_lock); refcount_inc(&cur_trans->use_count); - ret = btrfs_end_transaction(trans); - wait_for_commit(cur_trans); + if (trans->in_fsync) + want_state = TRANS_STATE_SUPER_COMMITTED; + ret = btrfs_end_transaction(trans); + wait_for_commit(cur_trans, want_state); if (TRANS_ABORTED(cur_trans)) ret = cur_trans->aborted; @@ -2119,13 +2131,19 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_blocked_wait); if (cur_trans->list.prev != &fs_info->trans_list) { + enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + + if (trans->in_fsync) + want_state = TRANS_STATE_SUPER_COMMITTED; + prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); - if (prev_trans->state != TRANS_STATE_COMPLETED) { + if (prev_trans->state < want_state) { refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); - wait_for_commit(prev_trans); + wait_for_commit(prev_trans, want_state); + ret = READ_ONCE(prev_trans->aborted); btrfs_put_transaction(prev_trans); @@ -2335,6 +2353,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } + /* + * At this point, we should have written all the tree blocks allocated + * in this transaction. So it's now safe to free the redirtyied extent + * buffers. + */ + btrfs_free_redirty_list(cur_trans); + ret = write_all_supers(fs_info, 0); /* * the super is written, we can safely allow the tree-loggers @@ -2344,6 +2369,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto scrub_continue; + /* + * We needn't acquire the lock here because there is no other task + * which can change it. 
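The test_and_set_bit() above replaces the old "flushing" flag plus explicit memory barriers: only the first task entering the commit does the initial delayed-ref pass, later committers skip it and avoid contending on the extent root. A condensed sketch of that pattern (illustrative only, hypothetical function name):

static int example_initial_ref_flush(struct btrfs_trans_handle *trans)
{
        struct btrfs_transaction *cur_trans = trans->transaction;

        if (test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING,
                             &cur_trans->delayed_refs.flags))
                return 0;       /* another committer already owns this pass */

        /* First committer: run the delayed refs queued so far */
        return btrfs_run_delayed_refs(trans, 0);
}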
+ */ + cur_trans->state = TRANS_STATE_SUPER_COMMITTED; + wake_up(&cur_trans->commit_wait); + btrfs_finish_extent_commit(trans); if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 31ca81bad822..6335716e513f 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -16,6 +16,7 @@ enum btrfs_trans_state { TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED, + TRANS_STATE_SUPER_COMMITTED, TRANS_STATE_COMPLETED, TRANS_STATE_MAX, }; @@ -92,6 +93,9 @@ struct btrfs_transaction { */ atomic_t pending_ordered; wait_queue_head_t pending_wait; + + spinlock_t releasing_ebs_lock; + struct list_head releasing_ebs; }; #define __TRANS_FREEZABLE (1U << 0) @@ -133,6 +137,7 @@ struct btrfs_trans_handle { bool can_flush_pending_bgs; bool reloc_reserved; bool dirty; + bool in_fsync; struct btrfs_root *root; struct btrfs_fs_info *fs_info; struct list_head new_bgs; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 254c2ee43aae..d90695c1ab6c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -19,6 +19,7 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -104,6 +105,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dirid, int del_all); +static void wait_log_commit(struct btrfs_root *root, int transid); /* * tree logging is a special write ahead log used to make sure that @@ -139,7 +141,9 @@ static int start_log_trans(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *tree_root = fs_info->tree_root; + const bool zoned = btrfs_is_zoned(fs_info); int ret = 0; + bool created = false; /* * First check if the log root tree was already created. If not, create @@ -149,8 +153,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&tree_root->log_mutex); if (!fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, fs_info); - if (!ret) + if (!ret) { set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); + created = true; + } } mutex_unlock(&tree_root->log_mutex); if (ret) @@ -159,12 +165,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); +again: if (root->log_root) { + int index = (root->log_transid + 1) % 2; + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; goto out; } + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } + if (!root->log_start_pid) { clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); root->log_start_pid = current->pid; @@ -172,6 +186,17 @@ static int start_log_trans(struct btrfs_trans_handle *trans, set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } } else { + /* + * This means fs_info->log_root_tree was already created + * for some other FS trees. Do the full commit not to mix + * nodes from multiple log transactions to do sequential + * writing. 
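On zoned filesystems the start_log_trans() hunk above (and the matching change to join_running_log_trans() just below) makes a new log writer wait for a still-running commit of the previous log transaction instead of interleaving with it, because the log zone must be written sequentially. A minimal sketch of that retry loop, with the log_mutex drop/retake handled by the existing wait_log_commit() helper elided:

/* Sketch only: the real code runs under root->log_mutex */
static void example_wait_for_prev_log_commit(struct btrfs_root *root)
{
        /* Index of the log transaction that may still be committing */
        int index = (root->log_transid + 1) % 2;

        while (atomic_read(&root->log_commit[index])) {
                wait_log_commit(root, root->log_transid - 1);
                index = (root->log_transid + 1) % 2;
        }
}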
+ */ + if (zoned && !created) { + ret = -EAGAIN; + goto out; + } + ret = btrfs_add_log_tree(trans, root); if (ret) goto out; @@ -200,14 +225,22 @@ out: */ static int join_running_log_trans(struct btrfs_root *root) { + const bool zoned = btrfs_is_zoned(root->fs_info); int ret = -ENOENT; if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) return ret; mutex_lock(&root->log_mutex); +again: if (root->log_root) { + int index = (root->log_transid + 1) % 2; + ret = 0; + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } atomic_inc(&root->log_writers); } mutex_unlock(&root->log_mutex); @@ -2752,6 +2785,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); return ret; } + btrfs_redirty_list_add( + trans->transaction, next); } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); @@ -3085,6 +3120,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ blk_start_plug(&plug); ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); + /* + * -EAGAIN happens when someone, e.g., a concurrent transaction + * commit, writes a dirty extent in this tree-log commit. This + * concurrent write will create a hole writing out the extents, + * and we cannot proceed on a zoned filesystem, requiring + * sequential writing. While we can bail out to a full commit + * here, but we can continue hoping the concurrent writing fills + * the hole. + */ + if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) + ret = 0; if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret); @@ -3127,6 +3173,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); root_log_ctx.log_transid = log_root_tree->log_transid; + if (btrfs_is_zoned(fs_info)) { + mutex_lock(&fs_info->tree_root->log_mutex); + if (!log_root_tree->node) { + ret = btrfs_alloc_log_tree_node(trans, log_root_tree); + if (ret) { + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } + } + mutex_unlock(&fs_info->tree_root->log_mutex); + } + /* * Now we are safe to update the log_root_tree because we're under the * log_mutex, and we're a current writer so we're holding the commit @@ -3194,7 +3253,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, &log_root_tree->dirty_log_pages, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); - if (ret) { + /* + * As described above, -EAGAIN indicates a hole in the extents. We + * cannot wait for these write outs since the waiting cause a + * deadlock. Bail out to the full commit instead. 
+ */ + if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { + btrfs_set_log_full_commit(trans); + btrfs_wait_tree_log_extents(log, mark); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } else if (ret) { btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); @@ -3285,17 +3354,22 @@ static void free_log_tree(struct btrfs_trans_handle *trans, .process_func = process_one_buffer }; - ret = walk_log_tree(trans, log, &wc); - if (ret) { - if (trans) - btrfs_abort_transaction(trans, ret); - else - btrfs_handle_fs_error(log->fs_info, ret, NULL); + if (log->node) { + ret = walk_log_tree(trans, log, &wc); + if (ret) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(log->fs_info, ret, NULL); + } } clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); extent_io_tree_release(&log->log_csum_range); + + if (trans && log->node) + btrfs_redirty_list_add(trans->transaction, log->node); btrfs_put_root(log); } @@ -3379,7 +3453,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_path *path; int ret; int err = 0; - int bytes_del = 0; u64 dir_ino = btrfs_ino(dir); if (!inode_logged(trans, dir)) @@ -3406,7 +3479,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); - bytes_del += name_len; if (ret) { err = ret; goto fail; @@ -3421,46 +3493,17 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); - bytes_del += name_len; if (ret) { err = ret; goto fail; } } - /* update the directory size in the log to reflect the names - * we have removed + /* + * We do not need to update the size field of the directory's inode item + * because on log replay we update the field to reflect all existing + * entries in the directory (see overwrite_item()). */ - if (bytes_del) { - struct btrfs_key key; - - key.objectid = dir_ino; - key.offset = 0; - key.type = BTRFS_INODE_ITEM_KEY; - btrfs_release_path(path); - - ret = btrfs_search_slot(trans, log, &key, path, 0, 1); - if (ret < 0) { - err = ret; - goto fail; - } - if (ret == 0) { - struct btrfs_inode_item *item; - u64 i_size; - - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_item); - i_size = btrfs_inode_size(path->nodes[0], item); - if (i_size > bytes_del) - i_size -= bytes_del; - else - i_size = 0; - btrfs_set_inode_size(path->nodes[0], item, i_size); - btrfs_mark_buffer_dirty(path->nodes[0]); - } else - ret = 0; - btrfs_release_path(path); - } fail: btrfs_free_path(path); out_unlock: @@ -3889,7 +3932,14 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_timespec_nsec(&token, &item->ctime, inode->i_ctime.tv_nsec); - btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); + /* + * We do not need to set the nbytes field, in fact during a fast fsync + * its value may not even be correct, since a fast fsync does not wait + * for ordered extent completion, which is where we update nbytes, it + * only waits for writeback to complete. During log replay as we find + * file extent items and replay them, we adjust the nbytes field of the + * inode item in subvolume tree as needed (see overwrite_item()). 
+ */ btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); @@ -5290,12 +5340,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } /* + * This is for cases where logging a directory could result in losing a + * a file after replaying the log. For example, if we move a file from a + * directory A to a directory B, then fsync directory A, we have no way + * to known the file was moved from A to B, so logging just A would + * result in losing the file after a log replay. + */ + if (S_ISDIR(inode->vfs_inode.i_mode) && + inode_only == LOG_INODE_ALL && + inode->last_unlink_trans >= trans->transid) { + btrfs_set_log_full_commit(trans); + err = 1; + goto out_unlock; + } + + /* * a brute force approach to making sure we get the most uptodate * copies of everything. */ if (S_ISDIR(inode->vfs_inode.i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); if (inode_only == LOG_INODE_EXISTS) max_key_type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key_type); @@ -5452,96 +5518,31 @@ out_unlock: } /* - * Check if we must fallback to a transaction commit when logging an inode. - * This must be called after logging the inode and is used only in the context - * when fsyncing an inode requires the need to log some other inode - in which - * case we can't lock the i_mutex of each other inode we need to log as that - * can lead to deadlocks with concurrent fsync against other inodes (as we can - * log inodes up or down in the hierarchy) or rename operations for example. So - * we take the log_mutex of the inode after we have logged it and then check for - * its last_unlink_trans value - this is safe because any task setting - * last_unlink_trans must take the log_mutex and it must do this before it does - * the actual unlink operation, so if we do this check before a concurrent task - * sets last_unlink_trans it means we've logged a consistent version/state of - * all the inode items, otherwise we are not sure and must do a transaction - * commit (the concurrent task might have only updated last_unlink_trans before - * we logged the inode or it might have also done the unlink). + * Check if we need to log an inode. This is used in contexts where while + * logging an inode we need to log another inode (either that it exists or in + * full mode). This is used instead of btrfs_inode_in_log() because the later + * requires the inode to be in the log and have the log transaction committed, + * while here we do not care if the log transaction was already committed - our + * caller will commit the log later - and we want to avoid logging an inode + * multiple times when multiple tasks have joined the same log transaction. */ -static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) +static bool need_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) { - bool ret = false; - - mutex_lock(&inode->log_mutex); - if (inode->last_unlink_trans >= trans->transid) { - /* - * Make sure any commits to the log are forced to be full - * commits. - */ - btrfs_set_log_full_commit(trans); - ret = true; - } - mutex_unlock(&inode->log_mutex); - - return ret; -} - -/* - * follow the dentry parent pointers up the chain and see if any - * of the directories in it require a full commit before they can - * be logged. 
Returns zero if nothing special needs to be done or 1 if - * a full commit is required. - */ -static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, - struct dentry *parent, - struct super_block *sb) -{ - int ret = 0; - struct dentry *old_parent = NULL; - /* - * for regular files, if its inode is already on disk, we don't - * have to worry about the parents at all. This is because - * we can use the last_unlink_trans field to record renames - * and other fun in this file. + * If this inode does not have new/updated/deleted xattrs since the last + * time it was logged and is flagged as logged in the current transaction, + * we can skip logging it. As for new/deleted names, those are updated in + * the log by link/unlink/rename operations. + * In case the inode was logged and then evicted and reloaded, its + * logged_trans will be 0, in which case we have to fully log it since + * logged_trans is a transient field, not persisted. */ - if (S_ISREG(inode->vfs_inode.i_mode) && - inode->generation < trans->transid && - inode->last_unlink_trans < trans->transid) - goto out; - - if (!S_ISDIR(inode->vfs_inode.i_mode)) { - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - goto out; - inode = BTRFS_I(d_inode(parent)); - } - - while (1) { - if (btrfs_must_commit_transaction(trans, inode)) { - ret = 1; - break; - } - - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - break; - - if (IS_ROOT(parent)) { - inode = BTRFS_I(d_inode(parent)); - if (btrfs_must_commit_transaction(trans, inode)) - ret = 1; - break; - } - - parent = dget_parent(parent); - dput(old_parent); - old_parent = parent; - inode = BTRFS_I(d_inode(parent)); + if (inode->logged_trans == trans->transid && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return false; - } - dput(old_parent); -out: - return ret; + return true; } struct btrfs_dir_list { @@ -5671,7 +5672,7 @@ process_leaf: goto next_dir_inode; } - if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { + if (!need_log_inode(trans, BTRFS_I(di_inode))) { btrfs_add_delayed_iput(di_inode); break; } @@ -5681,9 +5682,6 @@ process_leaf: log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), log_mode, ctx); - if (!ret && - btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) - ret = 1; btrfs_add_delayed_iput(di_inode); if (ret) goto next_dir_inode; @@ -5821,13 +5819,15 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, goto out; } + if (!need_log_inode(trans, BTRFS_I(dir_inode))) { + btrfs_add_delayed_iput(dir_inode); + continue; + } + if (ctx) ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), LOG_INODE_ALL, ctx); - if (!ret && - btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) - ret = 1; if (!ret && ctx && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, BTRFS_I(dir_inode), ctx); @@ -5872,7 +5872,8 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation >= trans->transid) + if (BTRFS_I(inode)->generation >= trans->transid && + need_log_inode(trans, BTRFS_I(inode))) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); @@ -5926,7 +5927,8 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (root != inode->root) break; - if (inode->generation >= trans->transid) { + if (inode->generation >= trans->transid && + 
need_log_inode(trans, inode)) { ret = btrfs_log_inode(trans, root, inode, LOG_INODE_EXISTS, ctx); if (ret) @@ -6041,12 +6043,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct super_block *sb; int ret = 0; bool log_dentries = false; - sb = inode->vfs_inode.i_sb; - if (btrfs_test_opt(fs_info, NOTREELOG)) { ret = 1; goto end_no_trans; @@ -6057,10 +6056,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - ret = check_parent_dirs_for_sync(trans, inode, parent, sb); - if (ret) - goto end_no_trans; - /* * Skip already logged inodes or inodes corresponding to tmpfiles * (since logging them is pointless, a link count of 0 means they @@ -6307,8 +6302,7 @@ again: * root->objectid_mutex is not acquired as log replay * could only happen during mount. */ - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); + ret = btrfs_init_root_free_objectid(root); } wc.replay_dest->log_root = NULL; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d6c24c8ad749..b8fab44394f5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -669,10 +669,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); device->mode = flags; - ret = btrfs_get_dev_zone_info(device); - if (ret != 0) - goto error_free_page; - fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -1418,11 +1414,62 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) * make sure to start at an offset of at least 1MB. */ return max_t(u64, start, SZ_1M); + case BTRFS_CHUNK_ALLOC_ZONED: + /* + * We don't care about the starting region like regular + * allocator, because we anyway use/reserve the first two zones + * for superblock logging. + */ + return ALIGN(start, device->zone_info->zone_size); default: BUG(); } } +static bool dev_extent_hole_check_zoned(struct btrfs_device *device, + u64 *hole_start, u64 *hole_size, + u64 num_bytes) +{ + u64 zone_size = device->zone_info->zone_size; + u64 pos; + int ret; + bool changed = false; + + ASSERT(IS_ALIGNED(*hole_start, zone_size)); + + while (*hole_size > 0) { + pos = btrfs_find_allocatable_zones(device, *hole_start, + *hole_start + *hole_size, + num_bytes); + if (pos != *hole_start) { + *hole_size = *hole_start + *hole_size - pos; + *hole_start = pos; + changed = true; + if (*hole_size < num_bytes) + break; + } + + ret = btrfs_ensure_empty_zones(device, pos, num_bytes); + + /* Range is ensured to be empty */ + if (!ret) + return changed; + + /* Given hole range was invalid (outside of device) */ + if (ret == -ERANGE) { + *hole_start += *hole_size; + *hole_size = 0; + return 1; + } + + *hole_start += zone_size; + *hole_size -= zone_size; + changed = true; + } + + return changed; +} + /** * dev_extent_hole_check - check if specified hole is suitable for allocation * @device: the device which we have the hole @@ -1430,7 +1477,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) * @hole_size: the size of the hole * @num_bytes: the size of the free space that we need * - * This function may modify @hole_start and @hole_end to reflect the suitable + * This function may modify @hole_start and @hole_size to reflect the suitable * position for allocation. Returns 1 if hole position is updated, 0 otherwise. 
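To make the zoned hole handling above concrete, here is a worked example with assumed numbers (not taken from the patch):

/*
 * Worked example for dev_extent_hole_check_zoned(), zone_size = 256M,
 * hole_start = 512M, hole_size = 1G, num_bytes = 256M:
 *
 *  1. btrfs_find_allocatable_zones() refuses zones that overlap superblock
 *     log zones; if it returns pos = 768M the hole shrinks to [768M, 1536M)
 *     and "changed" is set.
 *  2. btrfs_ensure_empty_zones() then resets any non-empty sequential zone
 *     in [768M, 1024M); on success the shrunk hole is returned.
 *  3. If that reset fails with anything other than -ERANGE, the loop skips
 *     one zone, the hole becomes [1024M, 1536M), and the search repeats.
 *
 * The for (;;) loop in dev_extent_hole_check() below then re-checks pending
 * extents against whatever range the zoned check settled on.
 */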
*/ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, @@ -1439,24 +1486,39 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, bool changed = false; u64 hole_end = *hole_start + *hole_size; - /* - * Check before we set max_hole_start, otherwise we could end up - * sending back this offset anyway. - */ - if (contains_pending_extent(device, hole_start, *hole_size)) { - if (hole_end >= *hole_start) - *hole_size = hole_end - *hole_start; - else - *hole_size = 0; - changed = true; - } + for (;;) { + /* + * Check before we set max_hole_start, otherwise we could end up + * sending back this offset anyway. + */ + if (contains_pending_extent(device, hole_start, *hole_size)) { + if (hole_end >= *hole_start) + *hole_size = hole_end - *hole_start; + else + *hole_size = 0; + changed = true; + } + + switch (device->fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + /* No extra check */ + break; + case BTRFS_CHUNK_ALLOC_ZONED: + if (dev_extent_hole_check_zoned(device, hole_start, + hole_size, num_bytes)) { + changed = true; + /* + * The changed hole can contain pending extent. + * Loop again to check that. + */ + continue; + } + break; + default: + BUG(); + } - switch (device->fs_devices->chunk_alloc_policy) { - case BTRFS_CHUNK_ALLOC_REGULAR: - /* No extra check */ break; - default: - BUG(); } return changed; @@ -1509,6 +1571,9 @@ static int find_free_dev_extent_start(struct btrfs_device *device, search_start = dev_extent_search_start(device, search_start); + WARN_ON(device->zone_info && + !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -4668,11 +4733,10 @@ again: } ret = btrfs_previous_item(root, path, 0, key.type); - if (ret) - mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret < 0) - goto done; if (ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + if (ret < 0) + goto done; ret = 0; btrfs_release_path(path); break; @@ -4904,6 +4968,37 @@ static void init_alloc_chunk_ctl_policy_regular( ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; } +static void init_alloc_chunk_ctl_policy_zoned( + struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + u64 zone_size = fs_devices->fs_info->zone_size; + u64 limit; + int min_num_stripes = ctl->devs_min * ctl->dev_stripes; + int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; + u64 min_chunk_size = min_data_stripes * zone_size; + u64 type = ctl->type; + + ctl->max_stripe_size = zone_size; + if (type & BTRFS_BLOCK_GROUP_DATA) { + ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, + zone_size); + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + ctl->max_chunk_size = ctl->max_stripe_size; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + ctl->max_chunk_size = 2 * ctl->max_stripe_size; + ctl->devs_max = min_t(int, ctl->devs_max, + BTRFS_MAX_DEVS_SYS_CHUNK); + } + + /* We don't want a chunk larger than 10% of writable space */ + limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), + zone_size), + min_chunk_size); + ctl->max_chunk_size = min(limit, ctl->max_chunk_size); + ctl->dev_extent_min = zone_size * ctl->dev_stripes; +} + static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { @@ -4924,6 +5019,9 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, case BTRFS_CHUNK_ALLOC_REGULAR: init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); break; + case BTRFS_CHUNK_ALLOC_ZONED: + 
init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); + break; default: BUG(); } @@ -5050,6 +5148,38 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, return 0; } +static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + u64 zone_size = devices_info[0].dev->zone_info->zone_size; + /* Number of stripes that count for block group size */ + int data_stripes; + + /* + * It should hold because: + * dev_extent_min == dev_extent_want == zone_size * dev_stripes + */ + ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); + + ctl->stripe_size = zone_size; + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; + + /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ + if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { + ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, + ctl->stripe_size) + ctl->nparity, + ctl->dev_stripes); + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; + ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); + } + + ctl->chunk_size = ctl->stripe_size * data_stripes; + + return 0; +} + static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) @@ -5077,6 +5207,8 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, switch (fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: return decide_stripe_size_regular(ctl, devices_info); + case BTRFS_CHUNK_ALLOC_ZONED: + return decide_stripe_size_zoned(ctl, devices_info); default: BUG(); } @@ -5841,9 +5973,29 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, return ret; } +static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) +{ + struct btrfs_block_group *cache; + bool ret; + + /* Non zoned filesystem does not use "to_copy" flag */ + if (!btrfs_is_zoned(fs_info)) + return false; + + cache = btrfs_lookup_block_group(fs_info, logical); + + spin_lock(&cache->lock); + ret = cache->to_copy; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); + return ret; +} + static void handle_ops_on_dev_replace(enum btrfs_map_op op, struct btrfs_bio **bbio_ret, struct btrfs_dev_replace *dev_replace, + u64 logical, int *num_stripes_ret, int *max_errors_ret) { struct btrfs_bio *bbio = *bbio_ret; @@ -5857,6 +6009,13 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, int index_where_to_add; /* + * A block group which have "to_copy" set will eventually + * copied by dev-replace process. We can avoid cloning IO here. + */ + if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) + return; + + /* * duplicate the write operations while the dev replace * procedure is running. Since the copying of the old disk to * the new disk takes place at run time while the filesystem is @@ -5941,23 +6100,24 @@ static bool need_full_stripe(enum btrfs_map_op op) } /* - * btrfs_get_io_geometry - calculates the geomery of a particular (address, len) - * tuple. This information is used to calculate how big a - * particular bio can get before it straddles a stripe. + * Calculate the geometry of a particular (address, len) tuple. This + * information is used to calculate how big a particular bio can get before it + * straddles a stripe. 
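For the zoned chunk sizing above, a worked example with assumed numbers may help; the 10% figure comes from the div_factor(total_rw_bytes, 1) limit in init_alloc_chunk_ctl_policy_zoned():

/*
 * Worked example for decide_stripe_size_zoned(): zone_size = 256M, SINGLE
 * data profile (dev_stripes = 1, ncopies = 1, nparity = 0), 20G of writable
 * space, so max_chunk_size = 10% of 20G = 2G.
 *
 *   ndevs = 16 -> num_stripes = 16, data_stripes = 16
 *              -> 256M * 16 = 4G > 2G, so ndevs is reduced to 2G / 256M = 8
 *                 and the chunk ends up exactly 2G.
 *   ndevs = 4  -> 256M * 4 = 1G <= 2G, chunk_size = 1G, nothing reduced.
 *
 * stripe_size itself is never adjusted: on a zoned device each device
 * stripe must cover exactly one zone.
 */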
* - * @fs_info - the filesystem - * @logical - address that we want to figure out the geometry of - * @len - the length of IO we are going to perform, starting at @logical - * @op - type of operation - write or read - * @io_geom - pointer used to return values + * @fs_info: the filesystem + * @em: mapping containing the logical extent + * @op: type of operation - write or read + * @logical: address that we want to figure out the geometry of + * @len: the length of IO we are going to perform, starting at @logical + * @io_geom: pointer used to return values * * Returns < 0 in case a chunk for the given logical address cannot be found, * usually shouldn't happen unless @logical is corrupted, 0 otherwise. */ -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 len, struct btrfs_io_geometry *io_geom) +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, + enum btrfs_map_op op, u64 logical, u64 len, + struct btrfs_io_geometry *io_geom) { - struct extent_map *em; struct map_lookup *map; u64 offset; u64 stripe_offset; @@ -5965,14 +6125,9 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 stripe_len; u64 raid56_full_stripe_start = (u64)-1; int data_stripes; - int ret = 0; ASSERT(op != BTRFS_MAP_DISCARD); - em = btrfs_get_chunk_map(fs_info, logical, len); - if (IS_ERR(em)) - return PTR_ERR(em); - map = em->map_lookup; /* Offset of this logical address in the chunk */ offset = logical - em->start; @@ -5986,8 +6141,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, btrfs_crit(fs_info, "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", stripe_offset, offset, em->start, logical, stripe_len); - ret = -EINVAL; - goto out; + return -EINVAL; } /* stripe_offset is the offset of this block in its stripe */ @@ -6034,10 +6188,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom->stripe_offset = stripe_offset; io_geom->raid56_stripe_offset = raid56_full_stripe_start; -out: - /* once for us */ - free_extent_map(em); - return ret; + return 0; } static int __btrfs_map_block(struct btrfs_fs_info *fs_info, @@ -6070,12 +6221,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, ASSERT(bbio_ret); ASSERT(op != BTRFS_MAP_DISCARD); - ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); + em = btrfs_get_chunk_map(fs_info, logical, *length); + ASSERT(!IS_ERR(em)); + + ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom); if (ret < 0) return ret; - em = btrfs_get_chunk_map(fs_info, logical, *length); - ASSERT(!IS_ERR(em)); map = em->map_lookup; *length = geom.len; @@ -6251,8 +6403,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { - handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, - &max_errors); + handle_ops_on_dev_replace(op, &bbio, dev_replace, logical, + &num_stripes, &max_errors); } *bbio_ret = bbio; @@ -6323,7 +6475,7 @@ static void btrfs_end_bio(struct bio *bio) struct btrfs_device *dev = btrfs_io_bio(bio)->device; ASSERT(dev->bdev); - if (bio_op(bio) == REQ_OP_WRITE) + if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); else if (!(bio->bi_opf & REQ_RAHEAD)) @@ -6375,6 +6527,20 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, btrfs_io_bio(bio)->device = dev; 
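The btrfs_get_io_geometry() change above moves the chunk-map lookup to the caller. A sketch of the new calling convention (hypothetical caller, assuming btrfs_get_chunk_map() is visible at the call site):

static int example_map_geometry(struct btrfs_fs_info *fs_info, u64 logical,
                                u64 len, struct btrfs_io_geometry *geom)
{
        struct extent_map *em;
        int ret;

        em = btrfs_get_chunk_map(fs_info, logical, len);
        if (IS_ERR(em))
                return PTR_ERR(em);

        ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_WRITE, logical,
                                    len, geom);

        free_extent_map(em);    /* dropping the lookup ref is now the caller's job */
        return ret;
}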
bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; + /* + * For zone append writing, bi_sector must point the beginning of the + * zone + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (btrfs_dev_is_sequential(dev, physical)) { + u64 zone_start = round_down(physical, fs_info->zone_size); + + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + } else { + bio->bi_opf &= ~REQ_OP_ZONE_APPEND; + bio->bi_opf |= REQ_OP_WRITE; + } + } btrfs_debug_in_rcu(fs_info, "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, @@ -6436,10 +6602,10 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, atomic_set(&bbio->stripes_pending, bbio->num_stripes); if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { + ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ - if (bio_op(bio) == REQ_OP_WRITE) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { ret = raid56_parity_write(fs_info, bio, bbio, map_length); } else { @@ -6462,7 +6628,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || - (bio_op(first_bio) == REQ_OP_WRITE && + (btrfs_op(first_bio) == BTRFS_MAP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { bbio_error(bbio, first_bio, logical); continue; @@ -7644,6 +7810,20 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, ret = -EUCLEAN; goto out; } + + if (dev->zone_info) { + u64 zone_size = dev->zone_info->zone_size; + + if (!IS_ALIGNED(physical_offset, zone_size) || + !IS_ALIGNED(physical_len, zone_size)) { + btrfs_err(fs_info, +"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", + devid, physical_offset, physical_len); + ret = -EUCLEAN; + goto out; + } + } + out: free_extent_map(em); return ret; @@ -7800,3 +7980,75 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) spin_unlock(&fs_info->swapfile_pins_lock); return node != NULL; } + +static int relocating_repair_kthread(void *data) +{ + struct btrfs_block_group *cache = (struct btrfs_block_group *)data; + struct btrfs_fs_info *fs_info = cache->fs_info; + u64 target; + int ret = 0; + + target = cache->start; + btrfs_put_block_group(cache); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + btrfs_info(fs_info, + "zoned: skip relocating block group %llu to repair: EBUSY", + target); + return -EBUSY; + } + + mutex_lock(&fs_info->delete_unused_bgs_mutex); + + /* Ensure block group still exists */ + cache = btrfs_lookup_block_group(fs_info, target); + if (!cache) + goto out; + + if (!cache->relocating_repair) + goto out; + + ret = btrfs_may_alloc_data_chunk(fs_info, target); + if (ret < 0) + goto out; + + btrfs_info(fs_info, + "zoned: relocating block group %llu to repair IO failure", + target); + ret = btrfs_relocate_chunk(fs_info, target); + +out: + if (cache) + btrfs_put_block_group(cache); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + btrfs_exclop_finish(fs_info); + + return ret; +} + +int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) +{ + struct btrfs_block_group *cache; + + /* Do not attempt to repair in degraded state */ + if (btrfs_test_opt(fs_info, DEGRADED)) + return 0; + + cache = 
btrfs_lookup_block_group(fs_info, logical); + if (!cache) + return 0; + + spin_lock(&cache->lock); + if (cache->relocating_repair) { + spin_unlock(&cache->lock); + btrfs_put_block_group(cache); + return 0; + } + cache->relocating_repair = 1; + spin_unlock(&cache->lock); + + kthread_run(relocating_repair_kthread, cache, + "btrfs-relocating-repair"); + + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c43663d9c22e..d4c3e0dd32b8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -214,6 +214,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used); enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_REGULAR, + BTRFS_CHUNK_ALLOC_ZONED, }; /* @@ -423,6 +424,7 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) case REQ_OP_DISCARD: return BTRFS_MAP_DISCARD; case REQ_OP_WRITE: + case REQ_OP_ZONE_APPEND: return BTRFS_MAP_WRITE; default: WARN_ON_ONCE(1); @@ -440,8 +442,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret); -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 len, struct btrfs_io_geometry *io_geom); +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, + enum btrfs_map_op op, u64 logical, u64 len, + struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); @@ -596,5 +599,6 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); +int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); #endif diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c38846659019..9a5cf153da89 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1,14 +1,25 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/bitops.h> #include <linux/slab.h> #include <linux/blkdev.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "volumes.h" #include "zoned.h" #include "rcu-string.h" +#include "disk-io.h" +#include "block-group.h" +#include "transaction.h" +#include "dev-replace.h" +#include "space-info.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 +/* Invalid allocation pointer value for missing devices */ +#define WP_MISSING_DEV ((u64)-1) +/* Pseudo write pointer value for conventional zone */ +#define WP_CONVENTIONAL ((u64)-2) /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 @@ -119,6 +130,36 @@ static inline u32 sb_zone_number(int shift, int mirror) return 0; } +/* + * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block + * device into static sized chunks and fake a conventional zone on each of + * them. 
+ */ +static int emulate_report_zones(struct btrfs_device *device, u64 pos, + struct blk_zone *zones, unsigned int nr_zones) +{ + const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT; + sector_t bdev_size = bdev_nr_sectors(device->bdev); + unsigned int i; + + pos >>= SECTOR_SHIFT; + for (i = 0; i < nr_zones; i++) { + zones[i].start = i * zone_sectors + pos; + zones[i].len = zone_sectors; + zones[i].capacity = zone_sectors; + zones[i].wp = zones[i].start + zone_sectors; + zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL; + zones[i].cond = BLK_ZONE_COND_NOT_WP; + + if (zones[i].wp >= bdev_size) { + i++; + break; + } + } + + return i; +} + static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { @@ -127,6 +168,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, if (!*nr_zones) return 0; + if (!bdev_is_zoned(device->bdev)) { + ret = emulate_report_zones(device, pos, zones, *nr_zones); + *nr_zones = ret; + return 0; + } + ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { @@ -143,8 +190,78 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return 0; } +/* The emulated zone size is determined from the size of device extent */ +static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_path *path; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_dev_extent *dext; + int ret = 0; + + key.objectid = 1; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + /* No dev extents at all? Not good */ + if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + + leaf = path->nodes[0]; + dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); + fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); + ret = 0; + +out: + btrfs_free_path(path); + + return ret; +} + +int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + int ret = 0; + + /* fs_info->zone_size might not set yet. Use the incomapt flag here. */ + if (!btrfs_fs_incompat(fs_info, ZONED)) + return 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + /* We can skip reading of zone info for missing devices */ + if (!device->bdev) + continue; + + ret = btrfs_get_dev_zone_info(device); + if (ret) + break; + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + int btrfs_get_dev_zone_info(struct btrfs_device *device) { + struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; struct request_queue *queue = bdev_get_queue(bdev); @@ -153,9 +270,14 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) struct blk_zone *zones = NULL; unsigned int i, nreported = 0, nr_zones; unsigned int zone_sectors; + char *model, *emulated; int ret; - if (!bdev_is_zoned(bdev)) + /* + * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not + * yet be set. 
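A worked example of the zone emulation above, with assumed numbers:

/*
 * A regular (non-zoned) 16G device in a ZONED filesystem whose first
 * device extent is 256M long:
 *
 *  - calculate_emulated_zone_size() sets fs_info->zone_size = 256M.
 *  - emulate_report_zones() then presents the device as 64 conventional
 *    zones: zone i has start = pos + i * zone_sectors, len = capacity =
 *    one zone, wp = start + len and cond = BLK_ZONE_COND_NOT_WP, and the
 *    report stops early at the end of the device.
 *  - Because every emulated zone is conventional, the zoned code paths
 *    fall back to the write-pointer-less handling for this device.
 */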
+ */ + if (!btrfs_fs_incompat(fs_info, ZONED)) return 0; if (device->zone_info) @@ -165,8 +287,20 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (!zone_info) return -ENOMEM; + if (!bdev_is_zoned(bdev)) { + if (!fs_info->zone_size) { + ret = calculate_emulated_zone_size(fs_info); + if (ret) + goto out; + } + + ASSERT(fs_info->zone_size); + zone_sectors = fs_info->zone_size >> SECTOR_SHIFT; + } else { + zone_sectors = bdev_zone_sectors(bdev); + } + nr_sectors = bdev_nr_sectors(bdev); - zone_sectors = bdev_zone_sectors(bdev); /* Check if it's power of 2 (see is_power_of_2) */ ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; @@ -272,20 +406,42 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) device->zone_info = zone_info; - /* device->fs_info is not safe to use for printing messages */ - btrfs_info_in_rcu(NULL, - "host-%s zoned block device %s, %u zones of %llu bytes", - bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware", - rcu_str_deref(device->name), zone_info->nr_zones, - zone_info->zone_size); + switch (bdev_zoned_model(bdev)) { + case BLK_ZONED_HM: + model = "host-managed zoned"; + emulated = ""; + break; + case BLK_ZONED_HA: + model = "host-aware zoned"; + emulated = ""; + break; + case BLK_ZONED_NONE: + model = "regular"; + emulated = "emulated "; + break; + default: + /* Just in case */ + btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s", + bdev_zoned_model(bdev), + rcu_str_deref(device->name)); + ret = -EOPNOTSUPP; + goto out_free_zone_info; + } + + btrfs_info_in_rcu(fs_info, + "%s block device %s, %u %szones of %llu bytes", + model, rcu_str_deref(device->name), zone_info->nr_zones, + emulated, zone_info->zone_size); return 0; out: kfree(zones); +out_free_zone_info: bitmap_free(zone_info->empty_zones); bitmap_free(zone_info->seq_zones); kfree(zone_info); + device->zone_info = NULL; return ret; } @@ -324,7 +480,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) u64 nr_devices = 0; u64 zone_size = 0; u64 max_zone_append_size = 0; - const bool incompat_zoned = btrfs_is_zoned(fs_info); + const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); int ret = 0; /* Count zoned devices */ @@ -335,9 +491,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) continue; model = bdev_zoned_model(device->bdev); + /* + * A Host-Managed zoned device must be used as a zoned device. + * A Host-Aware zoned device and a non-zoned devices can be + * treated as a zoned device, if ZONED flag is enabled in the + * superblock. + */ if (model == BLK_ZONED_HM || - (model == BLK_ZONED_HA && incompat_zoned)) { - struct btrfs_zoned_device_info *zone_info; + (model == BLK_ZONED_HA && incompat_zoned) || + (model == BLK_ZONED_NONE && incompat_zoned)) { + struct btrfs_zoned_device_info *zone_info = + device->zone_info; zone_info = device->zone_info; zoned_devices++; @@ -406,6 +570,15 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) fs_info->zone_size = zone_size; fs_info->max_zone_append_size = max_zone_append_size; + fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; + + /* + * Check mount options here, because we might change fs_info->zoned + * from fs_info->zone_size. 
+ */ + ret = btrfs_check_mountopts_zoned(fs_info); + if (ret) + goto out; btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); out: @@ -488,7 +661,6 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, unsigned int zone_sectors; u32 sb_zone; int ret; - u64 zone_size; u8 zone_sectors_shift; sector_t nr_sectors; u32 nr_zones; @@ -503,7 +675,6 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, zone_sectors = bdev_zone_sectors(bdev); if (!is_power_of_2(zone_sectors)) return -EINVAL; - zone_size = zone_sectors << SECTOR_SHIFT; zone_sectors_shift = ilog2(zone_sectors); nr_sectors = bdev_nr_sectors(bdev); nr_zones = nr_sectors >> zone_sectors_shift; @@ -529,7 +700,13 @@ int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, struct btrfs_zoned_device_info *zinfo = device->zone_info; u32 zone_num; - if (!zinfo) { + /* + * For a zoned filesystem on a non-zoned block device, use the same + * super block locations as regular filesystem. Doing so, the super + * block can always be retrieved and the zoned flag of the volume + * detected from the super block information. + */ + if (!bdev_is_zoned(device->bdev)) { *bytenr_ret = btrfs_sb_offset(mirror); return 0; } @@ -614,3 +791,671 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) sb_zone << zone_sectors_shift, zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); } + +/** + * btrfs_find_allocatable_zones - find allocatable zones within a given region + * + * @device: the device to allocate a region on + * @hole_start: the position of the hole to allocate the region + * @num_bytes: size of wanted region + * @hole_end: the end of the hole + * @return: position of allocatable zones + * + * Allocatable region should not contain any superblock locations. 
+ */ +u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, + u64 hole_end, u64 num_bytes) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + const u8 shift = zinfo->zone_size_shift; + u64 nzones = num_bytes >> shift; + u64 pos = hole_start; + u64 begin, end; + bool have_sb; + int i; + + ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size)); + ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size)); + + while (pos < hole_end) { + begin = pos >> shift; + end = begin + nzones; + + if (end > zinfo->nr_zones) + return hole_end; + + /* Check if zones in the region are all empty */ + if (btrfs_dev_is_sequential(device, pos) && + find_next_zero_bit(zinfo->empty_zones, end, begin) != end) { + pos += zinfo->zone_size; + continue; + } + + have_sb = false; + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + u32 sb_zone; + u64 sb_pos; + + sb_zone = sb_zone_number(shift, i); + if (!(end <= sb_zone || + sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) { + have_sb = true; + pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift; + break; + } + + /* We also need to exclude regular superblock positions */ + sb_pos = btrfs_sb_offset(i); + if (!(pos + num_bytes <= sb_pos || + sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) { + have_sb = true; + pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE, + zinfo->zone_size); + break; + } + } + if (!have_sb) + break; + } + + return pos; +} + +int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, + u64 length, u64 *bytes) +{ + int ret; + + *bytes = 0; + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, + GFP_NOFS); + if (ret) + return ret; + + *bytes = length; + while (length) { + btrfs_dev_set_zone_empty(device, physical); + physical += device->zone_info->zone_size; + length -= device->zone_info->zone_size; + } + + return 0; +} + +int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + const u8 shift = zinfo->zone_size_shift; + unsigned long begin = start >> shift; + unsigned long end = (start + size) >> shift; + u64 pos; + int ret; + + ASSERT(IS_ALIGNED(start, zinfo->zone_size)); + ASSERT(IS_ALIGNED(size, zinfo->zone_size)); + + if (end > zinfo->nr_zones) + return -ERANGE; + + /* All the zones are conventional */ + if (find_next_bit(zinfo->seq_zones, begin, end) == end) + return 0; + + /* All the zones are sequential and empty */ + if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end && + find_next_zero_bit(zinfo->empty_zones, begin, end) == end) + return 0; + + for (pos = start; pos < start + size; pos += zinfo->zone_size) { + u64 reset_bytes; + + if (!btrfs_dev_is_sequential(device, pos) || + btrfs_dev_is_empty_zone(device, pos)) + continue; + + /* Free regions should be empty */ + btrfs_warn_in_rcu( + device->fs_info, + "zoned: resetting device %s (devid %llu) zone %llu for allocation", + rcu_str_deref(device->name), device->devid, pos >> shift); + WARN_ON_ONCE(1); + + ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size, + &reset_bytes); + if (ret) + return ret; + } + + return 0; +} + +/* + * Calculate an allocation pointer from the extent allocation information + * for a block group consist of conventional zones. It is pointed to the + * end of the highest addressed extent in the block group as an allocation + * offset. 
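A worked example for calculate_alloc_pointer(), described above and implemented just below, with assumed numbers:

/*
 * Block group at start = 1G, length = 256M; the backwards search finds a
 * highest extent item with objectid = 1G + 200M and length = 16M:
 *
 *     alloc_offset = (1G + 200M) + 16M - 1G = 216M
 *
 * i.e. allocation in this conventional-zone block group resumes 216M into
 * the zone.  If no extent item precedes the block group, the offset is 0.
 */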
+ */ +static int calculate_alloc_pointer(struct btrfs_block_group *cache, + u64 *offset_ret) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + u64 length; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = cache->start + cache->length; + key.type = 0; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + /* We should not find the exact match */ + if (!ret) + ret = -EUCLEAN; + if (ret < 0) + goto out; + + ret = btrfs_previous_extent_item(root, path, cache->start); + if (ret) { + if (ret == 1) { + ret = 0; + *offset_ret = 0; + } + goto out; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + + if (found_key.type == BTRFS_EXTENT_ITEM_KEY) + length = found_key.offset; + else + length = fs_info->nodesize; + + if (!(found_key.objectid >= cache->start && + found_key.objectid + length <= cache->start + cache->length)) { + ret = -EUCLEAN; + goto out; + } + *offset_ret = found_key.objectid + length - cache->start; + ret = 0; + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + struct btrfs_device *device; + u64 logical = cache->start; + u64 length = cache->length; + u64 physical = 0; + int ret; + int i; + unsigned int nofs_flag; + u64 *alloc_offsets = NULL; + u64 last_alloc = 0; + u32 num_sequential = 0, num_conventional = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + /* Sanity check */ + if (!IS_ALIGNED(length, fs_info->zone_size)) { + btrfs_err(fs_info, + "zoned: block group %llu len %llu unaligned to zone size %llu", + logical, length, fs_info->zone_size); + return -EIO; + } + + /* Get the chunk mapping */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, length); + read_unlock(&em_tree->lock); + + if (!em) + return -EINVAL; + + map = em->map_lookup; + + alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); + if (!alloc_offsets) { + free_extent_map(em); + return -ENOMEM; + } + + for (i = 0; i < map->num_stripes; i++) { + bool is_sequential; + struct blk_zone zone; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int dev_replace_is_ongoing = 0; + + device = map->stripes[i].dev; + physical = map->stripes[i].physical; + + if (device->bdev == NULL) { + alloc_offsets[i] = WP_MISSING_DEV; + continue; + } + + is_sequential = btrfs_dev_is_sequential(device, physical); + if (is_sequential) + num_sequential++; + else + num_conventional++; + + if (!is_sequential) { + alloc_offsets[i] = WP_CONVENTIONAL; + continue; + } + + /* + * This zone will be used for allocation, so mark this zone + * non-empty. + */ + btrfs_dev_clear_zone_empty(device, physical); + + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical); + up_read(&dev_replace->rwsem); + + /* + * The group is mapped to a sequential zone. Get the zone write + * pointer to determine the allocation offset within the zone. 
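+ *
+ * Illustrative numbers: a zone report with zone.start == 0x100000 and
+ * zone.wp == 0x100800 (both in 512-byte sectors) yields an allocation
+ * offset of (0x800 << SECTOR_SHIFT) == 1 MiB into the block group.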
+ */ + WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size)); + nofs_flag = memalloc_nofs_save(); + ret = btrfs_get_dev_zone(device, physical, &zone); + memalloc_nofs_restore(nofs_flag); + if (ret == -EIO || ret == -EOPNOTSUPP) { + ret = 0; + alloc_offsets[i] = WP_MISSING_DEV; + continue; + } else if (ret) { + goto out; + } + + switch (zone.cond) { + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + btrfs_err(fs_info, + "zoned: offline/readonly zone %llu on device %s (devid %llu)", + physical >> device->zone_info->zone_size_shift, + rcu_str_deref(device->name), device->devid); + alloc_offsets[i] = WP_MISSING_DEV; + break; + case BLK_ZONE_COND_EMPTY: + alloc_offsets[i] = 0; + break; + case BLK_ZONE_COND_FULL: + alloc_offsets[i] = fs_info->zone_size; + break; + default: + /* Partially used zone */ + alloc_offsets[i] = + ((zone.wp - zone.start) << SECTOR_SHIFT); + break; + } + } + + if (num_sequential > 0) + cache->seq_zone = true; + + if (num_conventional > 0) { + /* + * Avoid calling calculate_alloc_pointer() for new BG. It + * is no use for new BG. It must be always 0. + * + * Also, we have a lock chain of extent buffer lock -> + * chunk mutex. For new BG, this function is called from + * btrfs_make_block_group() which is already taking the + * chunk mutex. Thus, we cannot call + * calculate_alloc_pointer() which takes extent buffer + * locks to avoid deadlock. + */ + if (new) { + cache->alloc_offset = 0; + goto out; + } + ret = calculate_alloc_pointer(cache, &last_alloc); + if (ret || map->num_stripes == num_conventional) { + if (!ret) + cache->alloc_offset = last_alloc; + else + btrfs_err(fs_info, + "zoned: failed to determine allocation offset of bg %llu", + cache->start); + goto out; + } + } + + switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + case 0: /* single */ + cache->alloc_offset = alloc_offsets[0]; + break; + case BTRFS_BLOCK_GROUP_DUP: + case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID0: + case BTRFS_BLOCK_GROUP_RAID10: + case BTRFS_BLOCK_GROUP_RAID5: + case BTRFS_BLOCK_GROUP_RAID6: + /* non-single profiles are not supported yet */ + default: + btrfs_err(fs_info, "zoned: profile %s not yet supported", + btrfs_bg_type_to_raid_name(map->type)); + ret = -EINVAL; + goto out; + } + +out: + /* An extent is allocated after the write pointer */ + if (!ret && num_conventional && last_alloc > cache->alloc_offset) { + btrfs_err(fs_info, + "zoned: got wrong write pointer in BG %llu: %llu > %llu", + logical, last_alloc, cache->alloc_offset); + ret = -EIO; + } + + if (!ret) + cache->meta_write_pointer = cache->alloc_offset + cache->start; + + kfree(alloc_offsets); + free_extent_map(em); + + return ret; +} + +void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) +{ + u64 unusable, free; + + if (!btrfs_is_zoned(cache->fs_info)) + return; + + WARN_ON(cache->bytes_super != 0); + unusable = cache->alloc_offset - cache->used; + free = cache->length - cache->alloc_offset; + + /* We only need ->free_space in ALLOC_SEQ block groups */ + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + cache->free_space_ctl->free_space = free; + cache->zone_unusable = unusable; + + /* Should not have any excluded extents. 
Just in case, though */ + btrfs_free_excluded_extents(cache); +} + +void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + + if (!btrfs_is_zoned(fs_info) || + btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) || + !list_empty(&eb->release_list)) + return; + + set_extent_buffer_dirty(eb); + set_extent_bits_nowait(&trans->dirty_pages, eb->start, + eb->start + eb->len - 1, EXTENT_DIRTY); + memzero_extent_buffer(eb, 0, eb->len); + set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); + + spin_lock(&trans->releasing_ebs_lock); + list_add_tail(&eb->release_list, &trans->releasing_ebs); + spin_unlock(&trans->releasing_ebs_lock); + atomic_inc(&eb->refs); +} + +void btrfs_free_redirty_list(struct btrfs_transaction *trans) +{ + spin_lock(&trans->releasing_ebs_lock); + while (!list_empty(&trans->releasing_ebs)) { + struct extent_buffer *eb; + + eb = list_first_entry(&trans->releasing_ebs, + struct extent_buffer, release_list); + list_del_init(&eb->release_list); + free_extent_buffer(eb); + } + spin_unlock(&trans->releasing_ebs_lock); +} + +bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_group *cache; + bool ret = false; + + if (!btrfs_is_zoned(fs_info)) + return false; + + if (!fs_info->max_zone_append_size) + return false; + + if (!is_data_inode(&inode->vfs_inode)) + return false; + + cache = btrfs_lookup_block_group(fs_info, em->block_start); + ASSERT(cache); + if (!cache) + return false; + + ret = cache->seq_zone; + btrfs_put_block_group(cache); + + return ret; +} + +void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, + struct bio *bio) +{ + struct btrfs_ordered_extent *ordered; + const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + + if (bio_op(bio) != REQ_OP_ZONE_APPEND) + return; + + ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); + if (WARN_ON(!ordered)) + return; + + ordered->physical = physical; + ordered->disk = bio->bi_disk; + ordered->partno = bio->bi_partno; + + btrfs_put_ordered_extent(ordered); +} + +void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_ordered_sum *sum; + struct block_device *bdev; + u64 orig_logical = ordered->disk_bytenr; + u64 *logical = NULL; + int nr, stripe_len; + + /* Zoned devices should not have partitions. 
So, we can assume it is 0 */ + ASSERT(ordered->partno == 0); + bdev = bdgrab(ordered->disk->part0); + if (WARN_ON(!bdev)) + return; + + if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev, + ordered->physical, &logical, &nr, + &stripe_len))) + goto out; + + WARN_ON(nr != 1); + + if (orig_logical == *logical) + goto out; + + ordered->disk_bytenr = *logical; + + em_tree = &inode->extent_tree; + write_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, ordered->file_offset, + ordered->num_bytes); + em->block_start = *logical; + free_extent_map(em); + write_unlock(&em_tree->lock); + + list_for_each_entry(sum, &ordered->list, list) { + if (*logical < orig_logical) + sum->bytenr -= orig_logical - *logical; + else + sum->bytenr += *logical - orig_logical; + } + +out: + kfree(logical); + bdput(bdev); +} + +bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_block_group **cache_ret) +{ + struct btrfs_block_group *cache; + bool ret = true; + + if (!btrfs_is_zoned(fs_info)) + return true; + + cache = *cache_ret; + + if (cache && (eb->start < cache->start || + cache->start + cache->length <= eb->start)) { + btrfs_put_block_group(cache); + cache = NULL; + *cache_ret = NULL; + } + + if (!cache) + cache = btrfs_lookup_block_group(fs_info, eb->start); + + if (cache) { + if (cache->meta_write_pointer != eb->start) { + btrfs_put_block_group(cache); + cache = NULL; + ret = false; + } else { + cache->meta_write_pointer = eb->start + eb->len; + } + + *cache_ret = cache; + } + + return ret; +} + +void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, + struct extent_buffer *eb) +{ + if (!btrfs_is_zoned(eb->fs_info) || !cache) + return; + + ASSERT(cache->meta_write_pointer == eb->start + eb->len); + cache->meta_write_pointer = eb->start; +} + +int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) +{ + if (!btrfs_dev_is_sequential(device, physical)) + return -EOPNOTSUPP; + + return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT, + length >> SECTOR_SHIFT, GFP_NOFS, 0); +} + +static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, + struct blk_zone *zone) +{ + struct btrfs_bio *bbio = NULL; + u64 mapped_length = PAGE_SIZE; + unsigned int nofs_flag; + int nmirrors; + int i, ret; + + ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, + &mapped_length, &bbio); + if (ret || !bbio || mapped_length < PAGE_SIZE) { + btrfs_put_bbio(bbio); + return -EIO; + } + + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) + return -EINVAL; + + nofs_flag = memalloc_nofs_save(); + nmirrors = (int)bbio->num_stripes; + for (i = 0; i < nmirrors; i++) { + u64 physical = bbio->stripes[i].physical; + struct btrfs_device *dev = bbio->stripes[i].dev; + + /* Missing device */ + if (!dev->bdev) + continue; + + ret = btrfs_get_dev_zone(dev, physical, zone); + /* Failing device */ + if (ret == -EIO || ret == -EOPNOTSUPP) + continue; + break; + } + memalloc_nofs_restore(nofs_flag); + + return ret; +} + +/* + * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by + * filling zeros between @physical_pos to a write pointer of dev-replace + * source device. 
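+ *
+ * Worked example (illustrative numbers): if the source zone's write
+ * pointer sits 8 MiB into the zone but only 5 MiB worth of data has been
+ * copied up to @physical_pos on the target, the remaining 3 MiB are
+ * zeroed out so the target zone's write pointer catches up; a
+ * @physical_pos beyond the source write pointer is reported as -EUCLEAN
+ * instead.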
+ */ +int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + u64 physical_start, u64 physical_pos) +{ + struct btrfs_fs_info *fs_info = tgt_dev->fs_info; + struct blk_zone zone; + u64 length; + u64 wp; + int ret; + + if (!btrfs_dev_is_sequential(tgt_dev, physical_pos)) + return 0; + + ret = read_zone_info(fs_info, logical, &zone); + if (ret) + return ret; + + wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT); + + if (physical_pos == wp) + return 0; + + if (physical_pos > wp) + return -EUCLEAN; + + length = wp - physical_pos; + return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 8abe2f83272b..61e969652fe1 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -7,6 +7,7 @@ #include <linux/blkdev.h> #include "volumes.h" #include "disk-io.h" +#include "block-group.h" struct btrfs_zoned_device_info { /* @@ -25,6 +26,7 @@ struct btrfs_zoned_device_info { #ifdef CONFIG_BLK_DEV_ZONED int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone); +int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); int btrfs_get_dev_zone_info(struct btrfs_device *device); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); @@ -35,6 +37,28 @@ int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, u64 *bytenr_ret); void btrfs_advance_sb_log(struct btrfs_device *device, int mirror); int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror); +u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, + u64 hole_end, u64 num_bytes); +int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, + u64 length, u64 *bytes); +int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); +void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); +void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb); +void btrfs_free_redirty_list(struct btrfs_transaction *trans); +bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em); +void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, + struct bio *bio); +void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); +bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_block_group **cache_ret); +void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, + struct extent_buffer *eb); +int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); +int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + u64 physical_start, u64 physical_pos); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -42,6 +66,11 @@ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, return 0; } +static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) +{ + return 0; +} + static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) { return 0; @@ -85,6 +114,78 @@ static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror return 0; } +static inline u64 btrfs_find_allocatable_zones(struct btrfs_device *device, + u64 hole_start, u64 hole_end, + u64 num_bytes) +{ + return hole_start; 
+} + +static inline int btrfs_reset_device_zone(struct btrfs_device *device, + u64 physical, u64 length, u64 *bytes) +{ + *bytes = 0; + return 0; +} + +static inline int btrfs_ensure_empty_zones(struct btrfs_device *device, + u64 start, u64 size) +{ + return 0; +} + +static inline int btrfs_load_block_group_zone_info( + struct btrfs_block_group *cache, bool new) +{ + return 0; +} + +static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { } + +static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb) { } +static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } + +static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, + struct extent_map *em) +{ + return false; +} + +static inline void btrfs_record_physical_zoned(struct inode *inode, + u64 file_offset, struct bio *bio) +{ +} + +static inline void btrfs_rewrite_logical_zoned( + struct btrfs_ordered_extent *ordered) { } + +static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_block_group **cache_ret) +{ + return true; +} + +static inline void btrfs_revert_meta_write_pointer( + struct btrfs_block_group *cache, + struct extent_buffer *eb) +{ +} + +static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device, + u64 physical, u64 length) +{ + return -EOPNOTSUPP; +} + +static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, + u64 logical, u64 physical_start, + u64 physical_pos) +{ + return -EOPNOTSUPP; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -136,12 +237,16 @@ static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 p static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info, struct block_device *bdev) { - u64 zone_size; - if (btrfs_is_zoned(fs_info)) { - zone_size = bdev_zone_sectors(bdev) << SECTOR_SHIFT; - /* Do not allow non-zoned device */ - return bdev_is_zoned(bdev) && fs_info->zone_size == zone_size; + /* + * We can allow a regular device on a zoned filesystem, because + * we will emulate the zoned capabilities. 
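+ * A zoned device, on the other hand, must still report a zone size that
+ * matches fs_info->zone_size: e.g. (illustrative numbers) a filesystem
+ * using 256 MiB zones accepts any regular device, but rejects a zoned
+ * device whose bdev_zone_sectors() corresponds to 128 MiB.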
+ */ + if (!bdev_is_zoned(bdev)) + return true; + + return fs_info->zone_size == + (bdev_zone_sectors(bdev) << SECTOR_SHIFT); } /* Do not allow Host Manged zoned device */ @@ -157,4 +262,46 @@ static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 p return device->zone_info == NULL || !btrfs_dev_is_sequential(device, pos); } +static inline bool btrfs_can_zone_reset(struct btrfs_device *device, + u64 physical, u64 length) +{ + u64 zone_size; + + if (!btrfs_dev_is_sequential(device, physical)) + return false; + + zone_size = device->zone_info->zone_size; + if (!IS_ALIGNED(physical, zone_size) || !IS_ALIGNED(length, zone_size)) + return false; + + return true; +} + +static inline void btrfs_zoned_meta_io_lock(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return; + mutex_lock(&fs_info->zoned_meta_io_lock); +} + +static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return; + mutex_unlock(&fs_info->zoned_meta_io_lock); +} + +static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if (!btrfs_is_zoned(fs_info)) + return; + + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg == bg->start) + fs_info->treelog_bg = 0; + spin_unlock(&fs_info->treelog_bg_lock); +} + #endif diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 933f234d5bec..2273120d8ed7 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -201,6 +201,34 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, iomap_dio_submit_bio(dio, iomap, bio, pos); } +/* + * Figure out the bio's operation flags from the dio request, the + * mapping, and whether or not we want FUA. Note that we can end up + * clearing the WRITE_FUA flag in the dio request. + */ +static inline unsigned int +iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua) +{ + unsigned int opflags = REQ_SYNC | REQ_IDLE; + + if (!(dio->flags & IOMAP_DIO_WRITE)) { + WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); + return REQ_OP_READ; + } + + if (iomap->flags & IOMAP_F_ZONE_APPEND) + opflags |= REQ_OP_ZONE_APPEND; + else + opflags |= REQ_OP_WRITE; + + if (use_fua) + opflags |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_FUA; + + return opflags; +} + static loff_t iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, struct iomap_dio *dio, struct iomap *iomap) @@ -208,6 +236,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); unsigned int fs_block_size = i_blocksize(inode), pad; unsigned int align = iov_iter_alignment(dio->submit.iter); + unsigned int bio_opf; struct bio *bio; bool need_zeroout = false; bool use_fua = false; @@ -263,6 +292,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, iomap_dio_zero(dio, iomap, pos - pad, pad); } + /* + * Set the operation flags early so that bio_iov_iter_get_pages + * can set up the page vector appropriately for a ZONE_APPEND + * operation. 
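+ * A REQ_OP_ZONE_APPEND bio must not be split, so page collection has to
+ * respect the device's zone-append size limit up front. The flag itself
+ * comes from the filesystem; as a hedged sketch (not part of this
+ * change), an ->iomap_begin implementation would opt in with:
+ *
+ *	iomap->flags |= IOMAP_F_ZONE_APPEND;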
+ */ + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); + do { size_t n; if (dio->error) { @@ -278,6 +314,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; + bio->bi_opf = bio_opf; ret = bio_iov_iter_get_pages(bio, dio->submit.iter); if (unlikely(ret)) { @@ -293,14 +330,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, n = bio->bi_iter.bi_size; if (dio->flags & IOMAP_DIO_WRITE) { - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; - if (use_fua) - bio->bi_opf |= REQ_FUA; - else - dio->flags &= ~IOMAP_DIO_WRITE_FUA; task_io_account_write(n); } else { - bio->bi_opf = REQ_OP_READ; if (dio->flags & IOMAP_DIO_DIRTY) bio_set_pages_dirty(bio); } |
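A note on the arithmetic used throughout the zoned allocator above: the
allocation offset of a sequential-zone block group is derived from the zone
report's write pointer, and bytes that were written but are no longer
referenced are tracked as zone_unusable rather than returned as free space.
The stand-alone sketch below shows that accounting with simplified stand-in
types; zone_report and block_group here are illustrative only, not the
kernel's struct blk_zone or struct btrfs_block_group.

	#include <stdint.h>
	#include <stdio.h>

	#define SECTOR_SHIFT 9

	/* Simplified stand-in for a device zone report (units: 512B sectors). */
	struct zone_report {
		uint64_t start;
		uint64_t wp;
	};

	/* Simplified stand-in for the per-block-group bookkeeping. */
	struct block_group {
		uint64_t length;	/* block group size, bytes */
		uint64_t used;		/* bytes still referenced */
		uint64_t alloc_offset;	/* derived from the write pointer */
		uint64_t zone_unusable;	/* written but unreferenced bytes */
	};

	/* Write pointer -> allocation offset, as done per stripe above. */
	static uint64_t alloc_offset_from_zone(const struct zone_report *z)
	{
		return (z->wp - z->start) << SECTOR_SHIFT;
	}

	/* Mirrors the unusable/used split done by btrfs_calc_zone_unusable(). */
	static void calc_zone_unusable(struct block_group *bg)
	{
		bg->zone_unusable = bg->alloc_offset - bg->used;
	}

	int main(void)
	{
		struct zone_report z = { .start = 0, .wp = 16384 };	/* wp is 8 MiB in */
		struct block_group bg = { .length = 256ULL << 20, .used = 5ULL << 20 };

		bg.alloc_offset = alloc_offset_from_zone(&z);
		calc_zone_unusable(&bg);

		/* Prints alloc_offset=8 MiB, zone_unusable=3 MiB, free=248 MiB. */
		printf("alloc_offset=%llu zone_unusable=%llu free=%llu\n",
		       (unsigned long long)bg.alloc_offset,
		       (unsigned long long)bg.zone_unusable,
		       (unsigned long long)(bg.length - bg.alloc_offset));
		return 0;
	}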