diff options
Diffstat (limited to 'fs/btrfs/space-info.c')
-rw-r--r-- | fs/btrfs/space-info.c | 322 |
1 files changed, 278 insertions, 44 deletions
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 581bdd709ee0..d5a9cd8a4fd8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include "linux/spinlock.h" +#include <linux/minmax.h> #include "misc.h" #include "ctree.h" #include "space-info.h" @@ -9,7 +11,6 @@ #include "ordered-data.h" #include "transaction.h" #include "block-group.h" -#include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" @@ -162,7 +163,7 @@ * thing with or without extra unallocated space. */ -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included) { ASSERT(s_info); @@ -191,6 +192,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) */ #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) +#define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL) + /* * Calculate chunk size depending on volume type (regular or zoned). */ @@ -233,6 +236,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (!space_info) return -ENOMEM; + space_info->fs_info = info; for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); @@ -341,12 +345,35 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, return NULL; } +static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *data_sinfo; + u64 data_chunk_size; + + /* + * Calculate the data_chunk_size, space_info->chunk_size is the + * "optimal" chunk size based on the fs size. However when we actually + * allocate the chunk we will strip this down further, making it no + * more than 10% of the disk or 1G, whichever is smaller. + * + * On the zoned mode, we need to use zone_size (= data_sinfo->chunk_size) + * as it is. + */ + data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + if (btrfs_is_zoned(fs_info)) + return data_sinfo->chunk_size; + data_chunk_size = min(data_sinfo->chunk_size, + mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); + return min_t(u64, data_chunk_size, SZ_1G); +} + static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, + const struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) { u64 profile; u64 avail; + u64 data_chunk_size; int factor; if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) @@ -364,6 +391,27 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, */ factor = btrfs_bg_type_to_factor(profile); avail = div_u64(avail, factor); + if (avail == 0) + return 0; + + data_chunk_size = calc_effective_data_chunk_size(fs_info); + + /* + * Since data allocations immediately use block groups as part of the + * reservation, because we assume that data reservations will == actual + * usage, we could potentially overcommit and then immediately have that + * available space used by a data allocation, which could put us in a + * bind when we get close to filling the file system. + * + * To handle this simply remove the data_chunk_size from the available + * space. If we are relatively empty this won't affect our ability to + * overcommit much, and if we're very close to full it'll keep us from + * getting into a position where we've given ourselves very little + * metadata wiggle room. + */ + if (avail <= data_chunk_size) + return 0; + avail -= data_chunk_size; /* * If we aren't flushing all things, let us overcommit up to @@ -374,11 +422,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, avail >>= 3; else avail >>= 1; + + /* + * On the zoned mode, we always allocate one zone as one chunk. + * Returning non-zone size alingned bytes here will result in + * less pressure for the async metadata reclaim process, and it + * will over-commit too much leading to ENOSPC. Align down to the + * zone size to avoid that. + */ + if (btrfs_is_zoned(fs_info)) + avail = ALIGN_DOWN(avail, fs_info->zone_size); + return avail; } int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { u64 avail; @@ -483,8 +542,8 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } -static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info) +static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *info) { const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); @@ -555,20 +614,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, return nr; } -static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info, - u64 to_reclaim) -{ - const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1); - u64 nr; - - nr = div64_u64(to_reclaim, bytes); - if (!nr) - nr = 1; - return nr; -} - -#define EXTENT_SIZE_PER_ITEM SZ_256K - /* * shrink metadata reservation for delalloc */ @@ -668,7 +713,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, skip_async: loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, items, NULL); } else { time_left = schedule_timeout_killable(1); if (time_left) @@ -748,10 +793,9 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; } if (state == FLUSH_DELAYED_REFS_NR) - nr = calc_delayed_refs_nr(fs_info, num_bytes); + btrfs_run_delayed_refs(trans, num_bytes); else - nr = 0; - btrfs_run_delayed_refs(trans, nr); + btrfs_run_delayed_refs(trans, 0); btrfs_end_transaction(trans); break; case ALLOC_CHUNK: @@ -788,14 +832,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, * because that does not wait for a transaction to fully commit * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED). */ - trans = btrfs_attach_transaction_barrier(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - if (ret == -ENOENT) - ret = 0; - break; - } - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(root); break; default: ret = -ENOSPC; @@ -807,9 +844,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, return; } -static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *space_info) { u64 used; u64 avail; @@ -834,7 +870,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, } static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) + const struct btrfs_space_info *space_info) { const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); u64 ordered, delalloc; @@ -977,7 +1013,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, } /* - * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets + * We've exhausted our flushing, start failing tickets. + * * @fs_info - fs_info for this fs * @space_info - the space info we were flushing * @@ -1741,7 +1778,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * Try to reserve metadata bytes from the block_rsv's space. * * @fs_info: the filesystem - * @block_rsv: block_rsv we're allocating for + * @space_info: the space_info we're allocating for * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation * @@ -1753,21 +1790,19 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * space already. */ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, + struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { int ret; - ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); + ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", - block_rsv->space_info->flags, - orig_bytes, 1); + space_info->flags, orig_bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, block_rsv->space_info, - orig_bytes, 0); + btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0); } return ret; } @@ -1850,3 +1885,202 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } + +static u64 calc_pct_ratio(u64 x, u64 y) +{ + int err; + + if (!y) + return 0; +again: + err = check_mul_overflow(100, x, &x); + if (err) + goto lose_precision; + return div64_u64(x, y); +lose_precision: + x >>= 10; + y >>= 10; + if (!y) + y = 1; + goto again; +} + +/* + * A reasonable buffer for unallocated space is 10 data block_groups. + * If we claw this back repeatedly, we can still achieve efficient + * utilization when near full, and not do too much reclaim while + * always maintaining a solid buffer for workloads that quickly + * allocate and pressure the unallocated space. + */ +static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info) +{ + u64 chunk_sz = calc_effective_data_chunk_size(fs_info); + + return BTRFS_UNALLOC_BLOCK_GROUP_TARGET * chunk_sz; +} + +/* + * The fundamental goal of automatic reclaim is to protect the filesystem's + * unallocated space and thus minimize the probability of the filesystem going + * read only when a metadata allocation failure causes a transaction abort. + * + * However, relocations happen into the space_info's unused space, therefore + * automatic reclaim must also back off as that space runs low. There is no + * value in doing trivial "relocations" of re-writing the same block group + * into a fresh one. + * + * Furthermore, we want to avoid doing too much reclaim even if there are good + * candidates. This is because the allocator is pretty good at filling up the + * holes with writes. So we want to do just enough reclaim to try and stay + * safe from running out of unallocated space but not be wasteful about it. + * + * Therefore, the dynamic reclaim threshold is calculated as follows: + * - calculate a target unallocated amount of 5 block group sized chunks + * - ratchet up the intensity of reclaim depending on how far we are from + * that target by using a formula of unalloc / target to set the threshold. + * + * Typically with 10 block groups as the target, the discrete values this comes + * out to are 0, 10, 20, ... , 80, 90, and 99. + */ +static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + u64 unalloc = atomic64_read(&fs_info->free_chunk_space); + u64 target = calc_unalloc_target(fs_info); + u64 alloc = space_info->total_bytes; + u64 used = btrfs_space_info_used(space_info, false); + u64 unused = alloc - used; + u64 want = target > unalloc ? target - unalloc : 0; + u64 data_chunk_size = calc_effective_data_chunk_size(fs_info); + + /* If we have no unused space, don't bother, it won't work anyway. */ + if (unused < data_chunk_size) + return 0; + + /* Cast to int is OK because want <= target. */ + return calc_pct_ratio(want, target); +} + +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info) +{ + lockdep_assert_held(&space_info->lock); + + if (READ_ONCE(space_info->dynamic_reclaim)) + return calc_dynamic_reclaim_threshold(space_info); + return READ_ONCE(space_info->bg_reclaim_threshold); +} + +/* + * Under "urgent" reclaim, we will reclaim even fresh block groups that have + * recently seen successful allocations, as we are desperate to reclaim + * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs. + */ +static bool is_reclaim_urgent(struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + u64 unalloc = atomic64_read(&fs_info->free_chunk_space); + u64 data_chunk_size = calc_effective_data_chunk_size(fs_info); + + return unalloc < data_chunk_size; +} + +static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, int raid) +{ + struct btrfs_block_group *bg; + int thresh_pct; + bool try_again = true; + bool urgent; + + spin_lock(&space_info->lock); + urgent = is_reclaim_urgent(space_info); + thresh_pct = btrfs_calc_reclaim_threshold(space_info); + spin_unlock(&space_info->lock); + + down_read(&space_info->groups_sem); +again: + list_for_each_entry(bg, &space_info->block_groups[raid], list) { + u64 thresh; + bool reclaim = false; + + btrfs_get_block_group(bg); + spin_lock(&bg->lock); + thresh = mult_perc(bg->length, thresh_pct); + if (bg->used < thresh && bg->reclaim_mark) { + try_again = false; + reclaim = true; + } + bg->reclaim_mark++; + spin_unlock(&bg->lock); + if (reclaim) + btrfs_mark_bg_to_reclaim(bg); + btrfs_put_block_group(bg); + } + + /* + * In situations where we are very motivated to reclaim (low unalloc) + * use two passes to make the reclaim mark check best effort. + * + * If we have any staler groups, we don't touch the fresher ones, but if we + * really need a block group, do take a fresh one. + */ + if (try_again && urgent) { + try_again = false; + goto again; + } + + up_read(&space_info->groups_sem); +} + +void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes) +{ + u64 chunk_sz = calc_effective_data_chunk_size(space_info->fs_info); + + lockdep_assert_held(&space_info->lock); + space_info->reclaimable_bytes += bytes; + + if (space_info->reclaimable_bytes >= chunk_sz) + btrfs_set_periodic_reclaim_ready(space_info, true); +} + +void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready) +{ + lockdep_assert_held(&space_info->lock); + if (!READ_ONCE(space_info->periodic_reclaim)) + return; + if (ready != space_info->periodic_reclaim_ready) { + space_info->periodic_reclaim_ready = ready; + if (!ready) + space_info->reclaimable_bytes = 0; + } +} + +bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) +{ + bool ret; + + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) + return false; + if (!READ_ONCE(space_info->periodic_reclaim)) + return false; + + spin_lock(&space_info->lock); + ret = space_info->periodic_reclaim_ready; + btrfs_set_periodic_reclaim_ready(space_info, false); + spin_unlock(&space_info->lock); + + return ret; +} + +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) +{ + int raid; + struct btrfs_space_info *space_info; + + list_for_each_entry(space_info, &fs_info->space_info, list) { + if (!btrfs_should_periodic_reclaim(space_info)) + continue; + for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) + do_reclaim_sweep(fs_info, space_info, raid); + } +} |