summaryrefslogtreecommitdiff
path: root/fs/btrfs/space-info.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/space-info.c')
-rw-r--r--fs/btrfs/space-info.c322
1 files changed, 278 insertions, 44 deletions
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 581bdd709ee0..d5a9cd8a4fd8 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#include "linux/spinlock.h"
+#include <linux/minmax.h>
#include "misc.h"
#include "ctree.h"
#include "space-info.h"
@@ -9,7 +11,6 @@
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"
-#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
@@ -162,7 +163,7 @@
* thing with or without extra unallocated space.
*/
-u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
+u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info,
bool may_use_included)
{
ASSERT(s_info);
@@ -191,6 +192,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
*/
#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+#define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL)
+
/*
* Calculate chunk size depending on volume type (regular or zoned).
*/
@@ -233,6 +236,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
if (!space_info)
return -ENOMEM;
+ space_info->fs_info = info;
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&space_info->block_groups[i]);
init_rwsem(&space_info->groups_sem);
@@ -341,12 +345,35 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
return NULL;
}
+static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *data_sinfo;
+ u64 data_chunk_size;
+
+ /*
+ * Calculate the data_chunk_size, space_info->chunk_size is the
+ * "optimal" chunk size based on the fs size. However when we actually
+ * allocate the chunk we will strip this down further, making it no
+ * more than 10% of the disk or 1G, whichever is smaller.
+ *
+ * On the zoned mode, we need to use zone_size (= data_sinfo->chunk_size)
+ * as it is.
+ */
+ data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+ if (btrfs_is_zoned(fs_info))
+ return data_sinfo->chunk_size;
+ data_chunk_size = min(data_sinfo->chunk_size,
+ mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+ return min_t(u64, data_chunk_size, SZ_1G);
+}
+
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+ const struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
{
u64 profile;
u64 avail;
+ u64 data_chunk_size;
int factor;
if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
@@ -364,6 +391,27 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
*/
factor = btrfs_bg_type_to_factor(profile);
avail = div_u64(avail, factor);
+ if (avail == 0)
+ return 0;
+
+ data_chunk_size = calc_effective_data_chunk_size(fs_info);
+
+ /*
+ * Since data allocations immediately use block groups as part of the
+ * reservation, because we assume that data reservations will == actual
+ * usage, we could potentially overcommit and then immediately have that
+ * available space used by a data allocation, which could put us in a
+ * bind when we get close to filling the file system.
+ *
+ * To handle this simply remove the data_chunk_size from the available
+ * space. If we are relatively empty this won't affect our ability to
+ * overcommit much, and if we're very close to full it'll keep us from
+ * getting into a position where we've given ourselves very little
+ * metadata wiggle room.
+ */
+ if (avail <= data_chunk_size)
+ return 0;
+ avail -= data_chunk_size;
/*
* If we aren't flushing all things, let us overcommit up to
@@ -374,11 +422,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
avail >>= 3;
else
avail >>= 1;
+
+ /*
+ * On the zoned mode, we always allocate one zone as one chunk.
+ * Returning non-zone size alingned bytes here will result in
+ * less pressure for the async metadata reclaim process, and it
+ * will over-commit too much leading to ENOSPC. Align down to the
+ * zone size to avoid that.
+ */
+ if (btrfs_is_zoned(fs_info))
+ avail = ALIGN_DOWN(avail, fs_info->zone_size);
+
return avail;
}
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 bytes,
+ const struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
u64 avail;
@@ -483,8 +542,8 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}
-static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *info)
+static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_space_info *info)
{
const char *flag_str = space_info_flag_to_str(info);
lockdep_assert_held(&info->lock);
@@ -555,20 +614,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
return nr;
}
-static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info,
- u64 to_reclaim)
-{
- const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1);
- u64 nr;
-
- nr = div64_u64(to_reclaim, bytes);
- if (!nr)
- nr = 1;
- return nr;
-}
-
-#define EXTENT_SIZE_PER_ITEM SZ_256K
-
/*
* shrink metadata reservation for delalloc
*/
@@ -668,7 +713,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
skip_async:
loops++;
if (wait_ordered && !trans) {
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, items, NULL);
} else {
time_left = schedule_timeout_killable(1);
if (time_left)
@@ -748,10 +793,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
}
if (state == FLUSH_DELAYED_REFS_NR)
- nr = calc_delayed_refs_nr(fs_info, num_bytes);
+ btrfs_run_delayed_refs(trans, num_bytes);
else
- nr = 0;
- btrfs_run_delayed_refs(trans, nr);
+ btrfs_run_delayed_refs(trans, 0);
btrfs_end_transaction(trans);
break;
case ALLOC_CHUNK:
@@ -788,14 +832,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
* because that does not wait for a transaction to fully commit
* (only for it to be unblocked, state TRANS_STATE_UNBLOCKED).
*/
- trans = btrfs_attach_transaction_barrier(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- if (ret == -ENOENT)
- ret = 0;
- break;
- }
- ret = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_current_transaction(root);
break;
default:
ret = -ENOSPC;
@@ -807,9 +844,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
return;
}
-static inline u64
-btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
+ const struct btrfs_space_info *space_info)
{
u64 used;
u64 avail;
@@ -834,7 +870,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
}
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+ const struct btrfs_space_info *space_info)
{
const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
u64 ordered, delalloc;
@@ -977,7 +1013,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
}
/*
- * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
+ * We've exhausted our flushing, start failing tickets.
+ *
* @fs_info - fs_info for this fs
* @space_info - the space info we were flushing
*
@@ -1741,7 +1778,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* Try to reserve metadata bytes from the block_rsv's space.
*
* @fs_info: the filesystem
- * @block_rsv: block_rsv we're allocating for
+ * @space_info: the space_info we're allocating for
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
*
@@ -1753,21 +1790,19 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* space already.
*/
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
+ struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
int ret;
- ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
+ ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush);
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
- block_rsv->space_info->flags,
- orig_bytes, 1);
+ space_info->flags, orig_bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, block_rsv->space_info,
- orig_bytes, 0);
+ btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0);
}
return ret;
}
@@ -1850,3 +1885,202 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
+
+static u64 calc_pct_ratio(u64 x, u64 y)
+{
+ int err;
+
+ if (!y)
+ return 0;
+again:
+ err = check_mul_overflow(100, x, &x);
+ if (err)
+ goto lose_precision;
+ return div64_u64(x, y);
+lose_precision:
+ x >>= 10;
+ y >>= 10;
+ if (!y)
+ y = 1;
+ goto again;
+}
+
+/*
+ * A reasonable buffer for unallocated space is 10 data block_groups.
+ * If we claw this back repeatedly, we can still achieve efficient
+ * utilization when near full, and not do too much reclaim while
+ * always maintaining a solid buffer for workloads that quickly
+ * allocate and pressure the unallocated space.
+ */
+static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info)
+{
+ u64 chunk_sz = calc_effective_data_chunk_size(fs_info);
+
+ return BTRFS_UNALLOC_BLOCK_GROUP_TARGET * chunk_sz;
+}
+
+/*
+ * The fundamental goal of automatic reclaim is to protect the filesystem's
+ * unallocated space and thus minimize the probability of the filesystem going
+ * read only when a metadata allocation failure causes a transaction abort.
+ *
+ * However, relocations happen into the space_info's unused space, therefore
+ * automatic reclaim must also back off as that space runs low. There is no
+ * value in doing trivial "relocations" of re-writing the same block group
+ * into a fresh one.
+ *
+ * Furthermore, we want to avoid doing too much reclaim even if there are good
+ * candidates. This is because the allocator is pretty good at filling up the
+ * holes with writes. So we want to do just enough reclaim to try and stay
+ * safe from running out of unallocated space but not be wasteful about it.
+ *
+ * Therefore, the dynamic reclaim threshold is calculated as follows:
+ * - calculate a target unallocated amount of 5 block group sized chunks
+ * - ratchet up the intensity of reclaim depending on how far we are from
+ * that target by using a formula of unalloc / target to set the threshold.
+ *
+ * Typically with 10 block groups as the target, the discrete values this comes
+ * out to are 0, 10, 20, ... , 80, 90, and 99.
+ */
+static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info *space_info)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ u64 unalloc = atomic64_read(&fs_info->free_chunk_space);
+ u64 target = calc_unalloc_target(fs_info);
+ u64 alloc = space_info->total_bytes;
+ u64 used = btrfs_space_info_used(space_info, false);
+ u64 unused = alloc - used;
+ u64 want = target > unalloc ? target - unalloc : 0;
+ u64 data_chunk_size = calc_effective_data_chunk_size(fs_info);
+
+ /* If we have no unused space, don't bother, it won't work anyway. */
+ if (unused < data_chunk_size)
+ return 0;
+
+ /* Cast to int is OK because want <= target. */
+ return calc_pct_ratio(want, target);
+}
+
+int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info)
+{
+ lockdep_assert_held(&space_info->lock);
+
+ if (READ_ONCE(space_info->dynamic_reclaim))
+ return calc_dynamic_reclaim_threshold(space_info);
+ return READ_ONCE(space_info->bg_reclaim_threshold);
+}
+
+/*
+ * Under "urgent" reclaim, we will reclaim even fresh block groups that have
+ * recently seen successful allocations, as we are desperate to reclaim
+ * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs.
+ */
+static bool is_reclaim_urgent(struct btrfs_space_info *space_info)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ u64 unalloc = atomic64_read(&fs_info->free_chunk_space);
+ u64 data_chunk_size = calc_effective_data_chunk_size(fs_info);
+
+ return unalloc < data_chunk_size;
+}
+
+static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, int raid)
+{
+ struct btrfs_block_group *bg;
+ int thresh_pct;
+ bool try_again = true;
+ bool urgent;
+
+ spin_lock(&space_info->lock);
+ urgent = is_reclaim_urgent(space_info);
+ thresh_pct = btrfs_calc_reclaim_threshold(space_info);
+ spin_unlock(&space_info->lock);
+
+ down_read(&space_info->groups_sem);
+again:
+ list_for_each_entry(bg, &space_info->block_groups[raid], list) {
+ u64 thresh;
+ bool reclaim = false;
+
+ btrfs_get_block_group(bg);
+ spin_lock(&bg->lock);
+ thresh = mult_perc(bg->length, thresh_pct);
+ if (bg->used < thresh && bg->reclaim_mark) {
+ try_again = false;
+ reclaim = true;
+ }
+ bg->reclaim_mark++;
+ spin_unlock(&bg->lock);
+ if (reclaim)
+ btrfs_mark_bg_to_reclaim(bg);
+ btrfs_put_block_group(bg);
+ }
+
+ /*
+ * In situations where we are very motivated to reclaim (low unalloc)
+ * use two passes to make the reclaim mark check best effort.
+ *
+ * If we have any staler groups, we don't touch the fresher ones, but if we
+ * really need a block group, do take a fresh one.
+ */
+ if (try_again && urgent) {
+ try_again = false;
+ goto again;
+ }
+
+ up_read(&space_info->groups_sem);
+}
+
+void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes)
+{
+ u64 chunk_sz = calc_effective_data_chunk_size(space_info->fs_info);
+
+ lockdep_assert_held(&space_info->lock);
+ space_info->reclaimable_bytes += bytes;
+
+ if (space_info->reclaimable_bytes >= chunk_sz)
+ btrfs_set_periodic_reclaim_ready(space_info, true);
+}
+
+void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready)
+{
+ lockdep_assert_held(&space_info->lock);
+ if (!READ_ONCE(space_info->periodic_reclaim))
+ return;
+ if (ready != space_info->periodic_reclaim_ready) {
+ space_info->periodic_reclaim_ready = ready;
+ if (!ready)
+ space_info->reclaimable_bytes = 0;
+ }
+}
+
+bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
+{
+ bool ret;
+
+ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return false;
+ if (!READ_ONCE(space_info->periodic_reclaim))
+ return false;
+
+ spin_lock(&space_info->lock);
+ ret = space_info->periodic_reclaim_ready;
+ btrfs_set_periodic_reclaim_ready(space_info, false);
+ spin_unlock(&space_info->lock);
+
+ return ret;
+}
+
+void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
+{
+ int raid;
+ struct btrfs_space_info *space_info;
+
+ list_for_each_entry(space_info, &fs_info->space_info, list) {
+ if (!btrfs_should_periodic_reclaim(space_info))
+ continue;
+ for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++)
+ do_reclaim_sweep(fs_info, space_info, raid);
+ }
+}