Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/backref.c | 85
-rw-r--r-- | fs/btrfs/bio.c | 2
-rw-r--r-- | fs/btrfs/block-group.c | 86
-rw-r--r-- | fs/btrfs/delayed-inode.c | 2
-rw-r--r-- | fs/btrfs/discard.c | 21
-rw-r--r-- | fs/btrfs/disk-io.c | 14
-rw-r--r-- | fs/btrfs/extent_map.c | 7
-rw-r--r-- | fs/btrfs/file.c | 11
-rw-r--r-- | fs/btrfs/free-space-cache.c | 8
-rw-r--r-- | fs/btrfs/fs.h | 7
-rw-r--r-- | fs/btrfs/inode.c | 7
-rw-r--r-- | fs/btrfs/ioctl.c | 3
-rw-r--r-- | fs/btrfs/qgroup.c | 11
-rw-r--r-- | fs/btrfs/space-info.c | 42
-rw-r--r-- | fs/btrfs/space-info.h | 2
-rw-r--r-- | fs/btrfs/super.c | 4
-rw-r--r-- | fs/btrfs/sysfs.c | 42
-rw-r--r-- | fs/btrfs/transaction.c | 15
-rw-r--r-- | fs/btrfs/volumes.c | 23
-rw-r--r-- | fs/btrfs/zoned.c | 45
20 files changed, 296 insertions, 141 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 90e40d5ceccd..e54f0884802a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1921,8 +1921,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
 	level = -1;
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
-		bool is_shared;
-		bool cached;
+		const unsigned long prev_ref_count = ctx->refs.nnodes;
 
 		walk_ctx.bytenr = bytenr;
 		ret = find_parent_nodes(&walk_ctx, &shared);
@@ -1940,21 +1939,36 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
 		ret = 0;
 
 		/*
-		 * If our data extent was not directly shared (without multiple
-		 * reference items), than it might have a single reference item
-		 * with a count > 1 for the same offset, which means there are 2
-		 * (or more) file extent items that point to the data extent -
-		 * this happens when a file extent item needs to be split and
-		 * then one item gets moved to another leaf due to a b+tree leaf
-		 * split when inserting some item. In this case the file extent
-		 * items may be located in different leaves and therefore some
-		 * of the leaves may be referenced through shared subtrees while
-		 * others are not. Since our extent buffer cache only works for
-		 * a single path (by far the most common case and simpler to
-		 * deal with), we can not use it if we have multiple leaves
-		 * (which implies multiple paths).
+		 * More than one extent buffer (bytenr) may have been added to
+		 * the ctx->refs ulist, in which case we have to check multiple
+		 * tree paths in case the first one is not shared, so we can not
+		 * use the path cache which is made for a single path. Multiple
+		 * extent buffers at the current level happen when:
+		 *
+		 * 1) level -1, the data extent: If our data extent was not
+		 *    directly shared (without multiple reference items), then
+		 *    it might have a single reference item with a count > 1 for
+		 *    the same offset, which means there are 2 (or more) file
+		 *    extent items that point to the data extent - this happens
+		 *    when a file extent item needs to be split and then one
+		 *    item gets moved to another leaf due to a b+tree leaf split
+		 *    when inserting some item. In this case the file extent
+		 *    items may be located in different leaves and therefore
+		 *    some of the leaves may be referenced through shared
+		 *    subtrees while others are not. Since our extent buffer
+		 *    cache only works for a single path (by far the most common
+		 *    case and simpler to deal with), we can not use it if we
+		 *    have multiple leaves (which implies multiple paths).
+		 *
+		 * 2) level >= 0, a tree node/leaf: We can have a mix of direct
+		 *    and indirect references on a b+tree node/leaf, so we have
+		 *    to check multiple paths, and the extent buffer (the
+		 *    current bytenr) may be shared or not. One example is
+		 *    during relocation as we may get a shared tree block ref
+		 *    (direct ref) and a non-shared tree block ref (indirect
+		 *    ref) for the same node/leaf.
		 */
-		if (level == -1 && ctx->refs.nnodes > 1)
+		if ((ctx->refs.nnodes - prev_ref_count) > 1)
 			ctx->use_path_cache = false;
 
 		if (level >= 0)
@@ -1964,12 +1978,17 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
 		if (!node)
 			break;
 		bytenr = node->val;
-		level++;
-		cached = lookup_backref_shared_cache(ctx, root, bytenr, level,
-						     &is_shared);
-		if (cached) {
-			ret = (is_shared ? 1 : 0);
-			break;
+		if (ctx->use_path_cache) {
+			bool is_shared;
+			bool cached;
+
+			level++;
+			cached = lookup_backref_shared_cache(ctx, root, bytenr,
+							     level, &is_shared);
+			if (cached) {
+				ret = (is_shared ? 1 : 0);
+				break;
+			}
 		}
 		shared.share_count = 0;
 		shared.have_delayed_delete_refs = false;
@@ -1977,6 +1996,28 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
 	}
 
 	/*
+	 * If the path cache is disabled, then it means at some tree level we
+	 * got multiple parents due to a mix of direct and indirect backrefs or
+	 * multiple leaves with file extent items pointing to the same data
+	 * extent. We have to invalidate the cache and cache only the sharedness
+	 * result for the levels where we got only one node/reference.
+	 */
+	if (!ctx->use_path_cache) {
+		int i = 0;
+
+		level--;
+		if (ret >= 0 && level >= 0) {
+			bytenr = ctx->path_cache_entries[level].bytenr;
+			ctx->use_path_cache = true;
+			store_backref_shared_cache(ctx, root, bytenr, level, ret);
+			i = level + 1;
+		}
+
+		for ( ; i < BTRFS_MAX_LEVEL; i++)
+			ctx->path_cache_entries[i].bytenr = 0;
+	}
+
+	/*
 	 * Cache the sharedness result for the data extent if we know our inode
 	 * has more than 1 file extent item that refers to the data extent.
 	 */
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index d8b90f95b157..726592868e9c 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -287,7 +287,7 @@ static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
 
 	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
-	if (!(bio->bi_opf & REQ_RAHEAD))
+	else if (!(bio->bi_opf & REQ_RAHEAD))
 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 	if (bio->bi_opf & REQ_PREFLUSH)
 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 5b10401d803b..5fc670c27f86 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -558,14 +558,15 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
 static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
 					  struct btrfs_block_group *block_group,
 					  int index, int max_index,
-					  struct btrfs_key *key)
+					  struct btrfs_key *found_key)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct btrfs_root *extent_root;
-	int ret = 0;
 	u64 search_offset;
 	u64 search_end = block_group->start + block_group->length;
 	struct btrfs_path *path;
+	struct btrfs_key search_key;
+	int ret = 0;
 
 	ASSERT(index >= 0);
 	ASSERT(index <= max_index);
@@ -585,37 +586,24 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
 	path->reada = READA_FORWARD;
 
 	search_offset = index * div_u64(block_group->length, max_index);
-	key->objectid = block_group->start + search_offset;
-	key->type = BTRFS_EXTENT_ITEM_KEY;
-	key->offset = 0;
+	search_key.objectid = block_group->start + search_offset;
+	search_key.type = BTRFS_EXTENT_ITEM_KEY;
+	search_key.offset = 0;
 
-	while (1) {
-		ret = btrfs_search_forward(extent_root, key, path, 0);
-		if (ret != 0)
-			goto out;
+	btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
 		/* Success; sampled an extent item in the block group */
-		if (key->type == BTRFS_EXTENT_ITEM_KEY &&
-		    key->objectid >= block_group->start &&
-		    key->objectid + key->offset <= search_end)
-			goto out;
+		if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
+		    found_key->objectid >= block_group->start &&
+		    found_key->objectid + found_key->offset <= search_end)
+			break;
 
 		/* We can't possibly find a valid extent item anymore */
-		if (key->objectid >= search_end) {
+		if (found_key->objectid >= search_end) {
 			ret = 1;
 			break;
 		}
-		if (key->type < BTRFS_EXTENT_ITEM_KEY)
-			key->type = BTRFS_EXTENT_ITEM_KEY;
-		else
-			key->objectid++;
-		btrfs_release_path(path);
-		up_read(&fs_info->commit_root_sem);
-		mutex_unlock(&caching_ctl->mutex);
-		cond_resched();
-		mutex_lock(&caching_ctl->mutex);
-		down_read(&fs_info->commit_root_sem);
 	}
-out:
+
 	lockdep_assert_held(&caching_ctl->mutex);
 	lockdep_assert_held_read(&fs_info->commit_root_sem);
 	btrfs_free_path(path);
@@ -659,6 +647,7 @@
 static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
 				       struct btrfs_block_group *block_group)
 {
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct btrfs_key key;
 	int i;
 	u64 min_size = block_group->length;
@@ -668,6 +657,8 @@ static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl
 	if (!btrfs_block_group_should_use_size_class(block_group))
 		return 0;
 
+	lockdep_assert_held(&caching_ctl->mutex);
+	lockdep_assert_held_read(&fs_info->commit_root_sem);
 	for (i = 0; i < 5; ++i) {
 		ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
 		if (ret < 0)
@@ -682,7 +673,6 @@ static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl
 		block_group->size_class = size_class;
 		spin_unlock(&block_group->lock);
 	}
-
 out:
 	return ret;
 }
@@ -1185,14 +1175,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			< block_group->zone_unusable);
 		WARN_ON(block_group->space_info->disk_total
 			< block_group->length * factor);
-		WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
-				 &block_group->runtime_flags) &&
-			block_group->space_info->active_total_bytes
-			< block_group->length);
 	}
 	block_group->space_info->total_bytes -= block_group->length;
-	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
-		block_group->space_info->active_total_bytes -= block_group->length;
 	block_group->space_info->bytes_readonly -=
 			(block_group->length - block_group->zone_unusable);
 	block_group->space_info->bytes_zone_unusable -=
@@ -1836,7 +1820,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 
 		btrfs_info(fs_info,
 			"reclaiming chunk %llu with %llu%% used %llu%% unusable",
-				bg->start, div_u64(bg->used * 100, bg->length),
+				bg->start,
+				div64_u64(bg->used * 100, bg->length),
 				div64_u64(zone_unusable * 100, bg->length));
 		trace_btrfs_reclaim_block_group(bg);
 		ret = btrfs_relocate_chunk(fs_info, bg->start);
@@ -2493,18 +2478,29 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_item bgi;
 	struct btrfs_root *root = btrfs_block_group_root(fs_info);
 	struct btrfs_key key;
+	u64 old_commit_used;
+	int ret;
 
 	spin_lock(&block_group->lock);
 	btrfs_set_stack_block_group_used(&bgi, block_group->used);
 	btrfs_set_stack_block_group_chunk_objectid(&bgi,
 						   block_group->global_root_id);
 	btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
+	old_commit_used = block_group->commit_used;
+	block_group->commit_used = block_group->used;
 	key.objectid = block_group->start;
 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	key.offset = block_group->length;
 	spin_unlock(&block_group->lock);
 
-	return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+	ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+	if (ret < 0) {
+		spin_lock(&block_group->lock);
+		block_group->commit_used = old_commit_used;
+		spin_unlock(&block_group->lock);
+	}
+
+	return ret;
 }
 
 static int insert_dev_extent(struct btrfs_trans_handle *trans,
@@ -3474,6 +3470,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 	spin_unlock(&info->delalloc_root_lock);
 
 	while (total) {
+		struct btrfs_space_info *space_info;
 		bool reclaim = false;
 
 		cache = btrfs_lookup_block_group(info, bytenr);
@@ -3481,6 +3478,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 			ret = -ENOENT;
 			break;
 		}
+		space_info = cache->space_info;
 		factor = btrfs_bg_type_to_factor(cache->flags);
 
 		/*
@@ -3495,7 +3493,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 		byte_in_group = bytenr - cache->start;
 		WARN_ON(byte_in_group > cache->length);
 
-		spin_lock(&cache->space_info->lock);
+		spin_lock(&space_info->lock);
 		spin_lock(&cache->lock);
 
 		if (btrfs_test_opt(info, SPACE_CACHE) &&
@@ -3508,24 +3506,24 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 			old_val += num_bytes;
 			cache->used = old_val;
 			cache->reserved -= num_bytes;
-			cache->space_info->bytes_reserved -= num_bytes;
-			cache->space_info->bytes_used += num_bytes;
-			cache->space_info->disk_used += num_bytes * factor;
+			space_info->bytes_reserved -= num_bytes;
+			space_info->bytes_used += num_bytes;
+			space_info->disk_used += num_bytes * factor;
 			spin_unlock(&cache->lock);
-			spin_unlock(&cache->space_info->lock);
+			spin_unlock(&space_info->lock);
 		} else {
 			old_val -= num_bytes;
 			cache->used = old_val;
 			cache->pinned += num_bytes;
-			btrfs_space_info_update_bytes_pinned(info,
-					cache->space_info, num_bytes);
-			cache->space_info->bytes_used -= num_bytes;
-			cache->space_info->disk_used -= num_bytes * factor;
+			btrfs_space_info_update_bytes_pinned(info, space_info,
+							     num_bytes);
+			space_info->bytes_used -= num_bytes;
+			space_info->disk_used -= num_bytes * factor;
 
 			reclaim = should_reclaim_block_group(cache, num_bytes);
 
 			spin_unlock(&cache->lock);
-			spin_unlock(&cache->space_info->lock);
+			spin_unlock(&space_info->lock);
 
 			set_extent_dirty(&trans->transaction->pinned_extents,
 					 bytenr, bytenr + num_bytes - 1,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0095c6e4c3d1..6b457b010cbc 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1048,7 +1048,7 @@ again:
 	 * so there is only one iref. The case that several irefs are
 	 * in the same item doesn't exist.
 	 */
-	btrfs_del_item(trans, root, path);
+	ret = btrfs_del_item(trans, root, path);
 out:
 	btrfs_release_delayed_iref(node);
 	btrfs_release_path(path);
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 317aeff6c1da..a6d77fe41e1a 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -56,11 +56,9 @@
 #define BTRFS_DISCARD_DELAY		(120ULL * NSEC_PER_SEC)
 #define BTRFS_DISCARD_UNUSED_DELAY	(10ULL * NSEC_PER_SEC)
 
-/* Target completion latency of discarding all discardable extents */
-#define BTRFS_DISCARD_TARGET_MSEC	(6 * 60 * 60UL * MSEC_PER_SEC)
 #define BTRFS_DISCARD_MIN_DELAY_MSEC	(1UL)
 #define BTRFS_DISCARD_MAX_DELAY_MSEC	(1000UL)
-#define BTRFS_DISCARD_MAX_IOPS		(10U)
+#define BTRFS_DISCARD_MAX_IOPS		(1000U)
 
 /* Monotonically decreasing minimum length filters after index 0 */
 static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
@@ -577,6 +575,7 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
 	s32 discardable_extents;
 	s64 discardable_bytes;
 	u32 iops_limit;
+	unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
 	unsigned long delay;
 
 	discardable_extents = atomic_read(&discard_ctl->discardable_extents);
@@ -607,13 +606,19 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
 	}
 
 	iops_limit = READ_ONCE(discard_ctl->iops_limit);
-	if (iops_limit)
+
+	if (iops_limit) {
 		delay = MSEC_PER_SEC / iops_limit;
-	else
-		delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;
+	} else {
+		/*
+		 * Unset iops_limit means go as fast as possible, so allow a
+		 * delay of 0.
+		 */
+		delay = 0;
+		min_delay = 0;
+	}
 
-	delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC,
-		      BTRFS_DISCARD_MAX_DELAY_MSEC);
+	delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
 	discard_ctl->delay_ms = delay;
 
 	spin_unlock(&discard_ctl->lock);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b53f0e30ce2b..9e1596bb208d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2250,6 +2250,20 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
 
 	fs_info->csum_shash = csum_shash;
 
+	/*
+	 * Check if the checksum implementation is a fast accelerated one.
+	 * As-is this is a bit of a hack and should be replaced once the csum
+	 * implementations provide that information themselves.
+	 */
+	switch (csum_type) {
+	case BTRFS_CSUM_TYPE_CRC32:
+		if (!strstr(crypto_shash_driver_name(csum_shash), "generic"))
+			set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
+		break;
+	default:
+		break;
+	}
+
 	btrfs_info(fs_info, "using %s (%s) checksum algorithm",
 			btrfs_super_csum_name(csum_type),
 			crypto_shash_driver_name(csum_shash));
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index be94030e1dfb..138afa955370 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -763,7 +763,13 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
 			goto next;
 		}
 
+		flags = em->flags;
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+		/*
+		 * In case we split the extent map, we want to preserve the
+		 * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
+		 * it on the new extent maps.
+		 */
 		clear_bit(EXTENT_FLAG_LOGGING, &flags);
 		modified = !list_empty(&em->list);
 
@@ -774,7 +780,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
 		if (em->start >= start && em_end <= end)
 			goto remove_em;
 
-		flags = em->flags;
 		gen = em->generation;
 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5cc5a1faaef5..f649647392e0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3730,10 +3730,15 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
 	if (!iter_is_iovec(iter))
 		return 0;
 
-	for (seg = 0; seg < iter->nr_segs; seg++)
-		for (i = seg + 1; i < iter->nr_segs; i++)
-			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+	for (seg = 0; seg < iter->nr_segs; seg++) {
+		for (i = seg + 1; i < iter->nr_segs; i++) {
+			const struct iovec *iov1 = iter_iov(iter) + seg;
+			const struct iovec *iov2 = iter_iov(iter) + i;
+
+			if (iov1->iov_base == iov2->iov_base)
 				return -EINVAL;
+		}
+	}
 
 	return 0;
 }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0d250d052487..d84cef89cdff 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2693,8 +2693,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
 	bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
 
 	spin_lock(&ctl->tree_lock);
+	/* Count initial region as zone_unusable until it gets activated. */
 	if (!used)
 		to_free = size;
+	else if (initial &&
+		 test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) &&
+		 (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
+		to_free = 0;
 	else if (initial)
 		to_free = block_group->zone_capacity;
 	else if (offset >= block_group->alloc_offset)
@@ -2722,7 +2727,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
 	reclaimable_unusable = block_group->zone_unusable -
 			       (block_group->length - block_group->zone_capacity);
 	/* All the region is now unusable. Mark it as unused and reclaim */
-	if (block_group->zone_unusable == block_group->length) {
+	if (block_group->zone_unusable == block_group->length &&
+	    block_group->alloc_offset) {
 		btrfs_mark_bg_unused(block_group);
 	} else if (bg_reclaim_threshold &&
 		   reclaimable_unusable >=
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 4c477eae6891..24cd49229408 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -120,11 +120,8 @@ enum {
 	/* Indicate that we want to commit the transaction. */
 	BTRFS_FS_NEED_TRANS_COMMIT,
 
-	/*
-	 * Indicate metadata over-commit is disabled. This is set when active
-	 * zone tracking is needed.
-	 */
-	BTRFS_FS_NO_OVERCOMMIT,
+	/* This is set when active zone tracking is needed. */
+	BTRFS_FS_ACTIVE_ZONE_TRACKING,
 
 	/*
 	 * Indicate if we have some features changed, this is mostly for
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6c18dc9a1831..957e4d76a7b6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5421,8 +5421,13 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
 		return -ENOMEM;
 
 	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
-	if (ret)
+	if (ret < 0)
 		goto out;
+	/*
+	 * fscrypt_setup_filename() should never return a positive value, but
+	 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
+	 */
+	ASSERT(ret == 0);
 
 	/* This needs to handle no-key deletions later on */
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 84626c8ad5bf..ba769a1eb87a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2859,6 +2859,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
 	di_args->bytes_used = btrfs_device_get_bytes_used(dev);
 	di_args->total_bytes = btrfs_device_get_total_bytes(dev);
 	memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
+	memcpy(di_args->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
 	if (dev->name)
 		strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path));
 	else
@@ -3731,7 +3732,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
 	}
 
 	/* update qgroup status and info */
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	err = btrfs_run_qgroups(trans);
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	if (err < 0)
 		btrfs_handle_fs_error(fs_info, err,
 				      "failed to update qgroup status and info");
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 52a7d2fa2284..f41da7ac360d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2828,13 +2828,22 @@ cleanup:
 }
 
 /*
- * called from commit_transaction. Writes all changed qgroups to disk.
+ * Writes all changed qgroups to disk.
+ * Called by the transaction commit path and the qgroup assign ioctl.
 */
 int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	int ret = 0;
 
+	/*
+	 * In case we are called from the qgroup assign ioctl, assert that we
+	 * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
+	 * disable operation (ioctl) and access a freed quota root.
+	 */
+	if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
+		lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
+
 	if (!fs_info->quota_root)
 		return ret;
 
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 69c09508afb5..3eecce86f63f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -308,8 +308,6 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
 	ASSERT(found);
 	spin_lock(&found->lock);
 	found->total_bytes += block_group->length;
-	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
-		found->active_total_bytes += block_group->length;
 	found->disk_total += block_group->length * factor;
 	found->bytes_used += block_group->used;
 	found->disk_used += block_group->used * factor;
@@ -379,22 +377,6 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
 	return avail;
 }
 
-static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info,
-				       struct btrfs_space_info *space_info)
-{
-	/*
-	 * On regular filesystem, all total_bytes are always writable. On zoned
-	 * filesystem, there may be a limitation imposed by max_active_zones.
-	 * For metadata allocation, we cannot finish an existing active block
-	 * group to avoid a deadlock. Thus, we need to consider only the active
-	 * groups to be writable for metadata space.
-	 */
-	if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
-		return space_info->total_bytes;
-
-	return space_info->active_total_bytes;
-}
-
 int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
 			 struct btrfs_space_info *space_info, u64 bytes,
 			 enum btrfs_reserve_flush_enum flush)
@@ -407,13 +389,13 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
 		return 0;
 
 	used = btrfs_space_info_used(space_info, true);
-	if (test_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags) &&
+	if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) &&
 	    (space_info->flags & BTRFS_BLOCK_GROUP_METADATA))
 		avail = 0;
 	else
 		avail = calc_available_free_space(fs_info, space_info, flush);
 
-	if (used + bytes < writable_total_bytes(fs_info, space_info) + avail)
+	if (used + bytes < space_info->total_bytes + avail)
 		return 1;
 	return 0;
 }
@@ -449,7 +431,7 @@ again:
 		ticket = list_first_entry(head, struct reserve_ticket, list);
 
 		/* Check and see if our ticket can be satisfied now. */
-		if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) ||
+		if ((used + ticket->bytes <= space_info->total_bytes) ||
 		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
 					 flush)) {
 			btrfs_space_info_update_bytes_may_use(fs_info,
@@ -829,7 +811,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
 {
 	u64 used;
 	u64 avail;
-	u64 total;
 	u64 to_reclaim = space_info->reclaim_size;
 
 	lockdep_assert_held(&space_info->lock);
@@ -844,9 +825,8 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
 	 * space. If that's the case add in our overage so we make sure to put
 	 * appropriate pressure on the flushing state machine.
 	 */
-	total = writable_total_bytes(fs_info, space_info);
-	if (total + avail < used)
-		to_reclaim += used - (total + avail);
+	if (space_info->total_bytes + avail < used)
+		to_reclaim += used - (space_info->total_bytes + avail);
 
 	return to_reclaim;
 }
@@ -856,11 +836,10 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
 {
 	u64 global_rsv_size = fs_info->global_block_rsv.reserved;
 	u64 ordered, delalloc;
-	u64 total = writable_total_bytes(fs_info, space_info);
 	u64 thresh;
 	u64 used;
 
-	thresh = mult_perc(total, 90);
+	thresh = mult_perc(space_info->total_bytes, 90);
 
 	lockdep_assert_held(&space_info->lock);
 
@@ -923,8 +902,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
 					   BTRFS_RESERVE_FLUSH_ALL);
 	used = space_info->bytes_used + space_info->bytes_reserved +
 	       space_info->bytes_readonly + global_rsv_size;
-	if (used < total)
-		thresh += total - used;
+	if (used < space_info->total_bytes)
+		thresh += space_info->total_bytes - used;
 	thresh >>= space_info->clamp;
 
 	used = space_info->bytes_pinned;
@@ -1651,7 +1630,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
 	 * can_overcommit() to ensure we can overcommit to continue.
 	 */
 	if (!pending_tickets &&
-	    ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) ||
+	    ((used + orig_bytes <= space_info->total_bytes) ||
 	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
 		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
 						      orig_bytes);
@@ -1665,8 +1644,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
 	 */
 	if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
 		used = btrfs_space_info_used(space_info, false);
-		if (used + orig_bytes <=
-		    writable_total_bytes(fs_info, space_info)) {
+		if (used + orig_bytes <= space_info->total_bytes) {
 			btrfs_space_info_update_bytes_may_use(fs_info, space_info,
 							      orig_bytes);
 			ret = 0;
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index fc99ea2b0c34..2033b71b18ce 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -96,8 +96,6 @@ struct btrfs_space_info {
 	u64 bytes_may_use;	/* number of bytes that may be used for
 				   delalloc/allocations */
 	u64 bytes_readonly;	/* total bytes that are read only */
-	/* Total bytes in the space, but only accounts active block groups. */
-	u64 active_total_bytes;
 	u64 bytes_zone_unusable;	/* total bytes that are unusable until
 					   resetting the device zone */
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 581845bc206a..366fb4cde145 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1516,8 +1516,6 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
 		shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
 					s->s_id);
 		btrfs_sb(s)->bdev_holder = fs_type;
-		if (!strstr(crc32c_impl(), "generic"))
-			set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
 		error = btrfs_fill_super(s, fs_devices, data);
 	}
 	if (!error)
@@ -1631,6 +1629,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 	btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
+	workqueue_set_max_active(fs_info->endio_workers, new_pool_size);
+	workqueue_set_max_active(fs_info->endio_meta_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 8c5efa5813b3..37fc58a7f27e 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -9,6 +9,7 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/bug.h>
+#include <linux/list.h>
 #include <crypto/hash.h>
 #include "messages.h"
 #include "ctree.h"
@@ -778,6 +779,45 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
 	return len;
 }
 
+static ssize_t btrfs_size_classes_show(struct kobject *kobj,
+				       struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_space_info *sinfo = to_space_info(kobj);
+	struct btrfs_block_group *bg;
+	u32 none = 0;
+	u32 small = 0;
+	u32 medium = 0;
+	u32 large = 0;
+
+	for (int i = 0; i < BTRFS_NR_RAID_TYPES; ++i) {
+		down_read(&sinfo->groups_sem);
+		list_for_each_entry(bg, &sinfo->block_groups[i], list) {
+			if (!btrfs_block_group_should_use_size_class(bg))
+				continue;
+			switch (bg->size_class) {
+			case BTRFS_BG_SZ_NONE:
+				none++;
+				break;
+			case BTRFS_BG_SZ_SMALL:
+				small++;
+				break;
+			case BTRFS_BG_SZ_MEDIUM:
+				medium++;
+				break;
+			case BTRFS_BG_SZ_LARGE:
+				large++;
+				break;
+			}
+		}
+		up_read(&sinfo->groups_sem);
+	}
+	return sysfs_emit(buf, "none %u\n"
+			       "small %u\n"
+			       "medium %u\n"
+			       "large %u\n",
+			  none, small, medium, large);
+}
+
 #ifdef CONFIG_BTRFS_DEBUG
 /*
  * Request chunk allocation with current chunk size.
@@ -835,6 +875,7 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
 SPACE_INFO_ATTR(disk_used);
 SPACE_INFO_ATTR(disk_total);
 BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
+BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show);
 
 static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
 						     struct kobj_attribute *a,
@@ -887,6 +928,7 @@ static struct attribute *space_info_attrs[] = {
 	BTRFS_ATTR_PTR(space_info, disk_total),
 	BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
 	BTRFS_ATTR_PTR(space_info, chunk_size),
+	BTRFS_ATTR_PTR(space_info, size_classes),
 #ifdef CONFIG_BTRFS_DEBUG
 	BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 18329ebcb1cb..b8d5b1fa9a03 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2035,7 +2035,20 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
 
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
-	btrfs_scrub_cancel(fs_info);
+
+	/*
+	 * If relocation is running, we can't cancel scrub because that will
+	 * result in a deadlock. Before relocating a block group, relocation
+	 * pauses scrub, then starts and commits a transaction before unpausing
+	 * scrub. If the transaction commit is being done by the relocation
+	 * task or triggered by another task and the relocation task is waiting
+	 * for the commit, and we end up here due to an error in the commit
+	 * path, then calling btrfs_scrub_cancel() will deadlock, as we are
+	 * asking for scrub to stop while having it asked to be paused higher
+	 * above in relocation code.
+	 */
+	if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+		btrfs_scrub_cancel(fs_info);
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7823168c08a6..c6d592870400 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1366,8 +1366,17 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
 	 * So, we need to add a special mount option to scan for
 	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
 	 */
-	flags |= FMODE_EXCL;
 
+	/*
+	 * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may
+	 * initiate the device scan which may race with the user's mount
+	 * or mkfs command, resulting in failure.
+	 * Since the device scan is solely for reading purposes, there is
+	 * no need for FMODE_EXCL. Additionally, the devices are read again
+	 * during the mount process. It is ok to get some inconsistent
+	 * values temporarily, as the device paths of the fsid are the only
+	 * required information for assembling the volume.
+	 */
 	bdev = blkdev_get_by_path(path, flags, holder);
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
@@ -3266,8 +3275,15 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 	btrfs_scrub_pause(fs_info);
 	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
 	btrfs_scrub_continue(fs_info);
-	if (ret)
+	if (ret) {
+		/*
+		 * If we had a transaction abort, stop all running scrubs.
+		 * See transaction.c:cleanup_transaction() why we do it here.
+		 */
+		if (BTRFS_FS_ERROR(fs_info))
+			btrfs_scrub_cancel(fs_info);
 		return ret;
+	}
 
 	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
 	if (!block_group)
@@ -6363,7 +6379,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	ASSERT(op != BTRFS_MAP_DISCARD);
 
 	em = btrfs_get_chunk_map(fs_info, logical, *length);
-	ASSERT(!IS_ERR(em));
+	if (IS_ERR(em))
+		return PTR_ERR(em);
 
 	map = em->map_lookup;
 	data_stripes = nr_data_stripes(map);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index f95b2c94d619..45d04092f2f8 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -524,8 +524,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 		}
 		atomic_set(&zone_info->active_zones_left,
 			   max_active_zones - nactive);
-		/* Overcommit does not work well with active zone tacking. */
-		set_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags);
+		set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
 	}
 
 	/* Validate superblock log */
@@ -1581,9 +1580,19 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
 		return;
 
 	WARN_ON(cache->bytes_super != 0);
-	unusable = (cache->alloc_offset - cache->used) +
-		   (cache->length - cache->zone_capacity);
-	free = cache->zone_capacity - cache->alloc_offset;
+
+	/* Check for block groups never get activated */
+	if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) &&
+	    cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) &&
+	    !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) &&
+	    cache->alloc_offset == 0) {
+		unusable = cache->length;
+		free = 0;
+	} else {
+		unusable = (cache->alloc_offset - cache->used) +
+			   (cache->length - cache->zone_capacity);
+		free = cache->zone_capacity - cache->alloc_offset;
+	}
 
 	/* We only need ->free_space in ALLOC_SEQ block groups */
 	cache->cached = BTRFS_CACHE_FINISHED;
@@ -1902,7 +1911,11 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 
 	/* Successfully activated all the zones */
 	set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
-	space_info->active_total_bytes += block_group->length;
+	WARN_ON(block_group->alloc_offset != 0);
+	if (block_group->zone_unusable == block_group->length) {
+		block_group->zone_unusable = block_group->length - block_group->zone_capacity;
+		space_info->bytes_zone_unusable -= block_group->zone_capacity;
+	}
 	spin_unlock(&block_group->lock);
 	btrfs_try_granting_tickets(fs_info, space_info);
 	spin_unlock(&space_info->lock);
@@ -2086,11 +2099,21 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
 		if (!device->bdev)
 			continue;
 
-		if (!zinfo->max_active_zones ||
-		    atomic_read(&zinfo->active_zones_left)) {
+		if (!zinfo->max_active_zones) {
 			ret = true;
 			break;
 		}
+
+		switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+		case 0: /* single */
+			ret = (atomic_read(&zinfo->active_zones_left) >= 1);
+			break;
+		case BTRFS_BLOCK_GROUP_DUP:
+			ret = (atomic_read(&zinfo->active_zones_left) >= 2);
+			break;
+		}
+		if (ret)
+			break;
 	}
 	mutex_unlock(&fs_info->chunk_mutex);
 
@@ -2256,7 +2279,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
 		u64 avail;
 
 		spin_lock(&block_group->lock);
-		if (block_group->reserved ||
+		if (block_group->reserved || block_group->alloc_offset == 0 ||
 		    (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
 			spin_unlock(&block_group->lock);
 			continue;
@@ -2293,10 +2316,6 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
 	if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
 		return 0;
 
-	/* No more block groups to activate */
-	if (space_info->active_total_bytes == space_info->total_bytes)
-		return 0;
-
 	for (;;) {
 		int ret;
 		bool need_finish = false;