summaryrefslogtreecommitdiff
path: root/fs/btrfs/space-info.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/space-info.c')
-rw-r--r--fs/btrfs/space-info.c255
1 files changed, 184 insertions, 71 deletions
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d5a9cd8a4fd8..d9087aa81b21 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
-#include "linux/spinlock.h"
+#include <linux/spinlock.h>
#include <linux/minmax.h>
#include "misc.h"
#include "ctree.h"
@@ -14,6 +14,7 @@
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
+#include "zoned.h"
/*
* HOW DOES SPACE RESERVATION WORK
@@ -49,11 +50,11 @@
* num_bytes we want to reserve.
*
* ->reserve
- * space_info->bytes_may_reserve += num_bytes
+ * space_info->bytes_may_use += num_bytes
*
* ->extent allocation
* Call btrfs_add_reserved_bytes() which does
- * space_info->bytes_may_reserve -= num_bytes
+ * space_info->bytes_may_use -= num_bytes
* space_info->bytes_reserved += extent_bytes
*
* ->insert reference
@@ -127,6 +128,14 @@
* churn a lot and we can avoid making some extent tree modifications if we
* are able to delay for as long as possible.
*
+ * RESET_ZONES
+ * This state works only for the zoned mode. On the zoned mode, we cannot
+ * reuse once allocated then freed region until we reset the zone, due to
+ * the sequential write zone requirement. The RESET_ZONES state resets the
+ * zones of an unused block group and let us reuse the space. The reusing
+ * is faster than removing the block group and allocating another block
+ * group on the zones.
+ *
* ALLOC_CHUNK
* We will skip this the first time through space reservation, because of
* overcommit and we don't want to have a lot of useless metadata space when
@@ -225,19 +234,11 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
WRITE_ONCE(space_info->chunk_size, chunk_size);
}
-static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+static void init_space_info(struct btrfs_fs_info *info,
+ struct btrfs_space_info *space_info, u64 flags)
{
-
- struct btrfs_space_info *space_info;
- int i;
- int ret;
-
- space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
- if (!space_info)
- return -ENOMEM;
-
space_info->fs_info = info;
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&space_info->block_groups[i]);
init_rwsem(&space_info->groups_sem);
spin_lock_init(&space_info->lock);
@@ -248,9 +249,64 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->priority_tickets);
space_info->clamp = 1;
btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
+ space_info->subgroup_id = BTRFS_SUB_GROUP_PRIMARY;
if (btrfs_is_zoned(info))
space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
+}
+
+static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flags,
+ enum btrfs_space_info_sub_group id, int index)
+{
+ struct btrfs_fs_info *fs_info = parent->fs_info;
+ struct btrfs_space_info *sub_group;
+ int ret;
+
+ ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY);
+ ASSERT(id != BTRFS_SUB_GROUP_PRIMARY);
+
+ sub_group = kzalloc(sizeof(*sub_group), GFP_NOFS);
+ if (!sub_group)
+ return -ENOMEM;
+
+ init_space_info(fs_info, sub_group, flags);
+ parent->sub_group[index] = sub_group;
+ sub_group->parent = parent;
+ sub_group->subgroup_id = id;
+
+ ret = btrfs_sysfs_add_space_info_type(fs_info, sub_group);
+ if (ret) {
+ kfree(sub_group);
+ parent->sub_group[index] = NULL;
+ }
+ return ret;
+}
+
+static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+{
+
+ struct btrfs_space_info *space_info;
+ int ret = 0;
+
+ space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
+ if (!space_info)
+ return -ENOMEM;
+
+ init_space_info(info, space_info, flags);
+
+ if (btrfs_is_zoned(info)) {
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ ret = create_space_info_sub_group(space_info, flags,
+ BTRFS_SUB_GROUP_DATA_RELOC,
+ 0);
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ ret = create_space_info_sub_group(space_info, flags,
+ BTRFS_SUB_GROUP_TREELOG,
+ 0);
+
+ if (ret)
+ return ret;
+ }
ret = btrfs_sysfs_add_space_info_type(info, space_info);
if (ret)
@@ -303,31 +359,29 @@ out:
void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
struct btrfs_block_group *block_group)
{
- struct btrfs_space_info *found;
+ struct btrfs_space_info *space_info = block_group->space_info;
int factor, index;
factor = btrfs_bg_type_to_factor(block_group->flags);
- found = btrfs_find_space_info(info, block_group->flags);
- ASSERT(found);
- spin_lock(&found->lock);
- found->total_bytes += block_group->length;
- found->disk_total += block_group->length * factor;
- found->bytes_used += block_group->used;
- found->disk_used += block_group->used * factor;
- found->bytes_readonly += block_group->bytes_super;
- btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable);
+ spin_lock(&space_info->lock);
+ space_info->total_bytes += block_group->length;
+ space_info->disk_total += block_group->length * factor;
+ space_info->bytes_used += block_group->used;
+ space_info->disk_used += block_group->used * factor;
+ space_info->bytes_readonly += block_group->bytes_super;
+ btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable);
if (block_group->length > 0)
- found->full = 0;
- btrfs_try_granting_tickets(info, found);
- spin_unlock(&found->lock);
+ space_info->full = 0;
+ btrfs_try_granting_tickets(info, space_info);
+ spin_unlock(&space_info->lock);
- block_group->space_info = found;
+ block_group->space_info = space_info;
index = btrfs_bg_flags_to_raid_index(block_group->flags);
- down_write(&found->groups_sem);
- list_add_tail(&block_group->list, &found->block_groups[index]);
- up_write(&found->groups_sem);
+ down_write(&space_info->groups_sem);
+ list_add_tail(&block_group->list, &space_info->block_groups[index]);
+ up_write(&space_info->groups_sem);
}
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
@@ -489,9 +543,7 @@ again:
if ((used + ticket->bytes <= space_info->total_bytes) ||
btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
flush)) {
- btrfs_space_info_update_bytes_may_use(fs_info,
- space_info,
- ticket->bytes);
+ btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes);
remove_ticket(space_info, ticket);
ticket->bytes = 0;
space_info->tickets_id++;
@@ -549,8 +601,9 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info,
lockdep_assert_held(&info->lock);
/* The free space could be negative in case of overcommit */
- btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
- flag_str,
+ btrfs_info(fs_info,
+ "space_info %s (sub-group id %d) has %lld free, is %sfull",
+ flag_str, info->subgroup_id,
(s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
@@ -805,7 +858,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
ret = PTR_ERR(trans);
break;
}
- ret = btrfs_chunk_alloc(trans,
+ ret = btrfs_chunk_alloc(trans, space_info,
btrfs_get_alloc_profile(fs_info, space_info->flags),
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
@@ -834,6 +887,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
*/
ret = btrfs_commit_current_transaction(root);
break;
+ case RESET_ZONES:
+ ret = btrfs_reset_unused_block_groups(space_info, num_bytes);
+ break;
default:
ret = -ENOSPC;
break;
@@ -1073,22 +1129,19 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
return (tickets_id != space_info->tickets_id);
}
-/*
- * This is for normal flushers, we can wait all goddamned day if we want to. We
- * will loop and continuously try to flush as long as we are making progress.
- * We count progress as clearing off tickets each time we have to loop.
- */
-static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info)
{
- struct btrfs_fs_info *fs_info;
- struct btrfs_space_info *space_info;
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 to_reclaim;
enum btrfs_flush_state flush_state;
int commit_cycles = 0;
u64 last_tickets_id;
+ enum btrfs_flush_state final_state;
- fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
- space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ if (btrfs_is_zoned(fs_info))
+ final_state = RESET_ZONES;
+ else
+ final_state = COMMIT_TRANS;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
@@ -1141,7 +1194,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
flush_state++;
- if (flush_state > COMMIT_TRANS) {
+ if (flush_state > final_state) {
commit_cycles++;
if (commit_cycles > 2) {
if (maybe_fail_all_tickets(fs_info, space_info)) {
@@ -1155,7 +1208,26 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
}
}
spin_unlock(&space_info->lock);
- } while (flush_state <= COMMIT_TRANS);
+ } while (flush_state <= final_state);
+}
+
+/*
+ * This is for normal flushers, it can wait as much time as needed. We will
+ * loop and continuously try to flush as long as we are making progress. We
+ * count progress as clearing off tickets each time we have to loop.
+ */
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ do_async_reclaim_metadata_space(space_info);
+ for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
+ if (space_info->sub_group[i])
+ do_async_reclaim_metadata_space(space_info->sub_group[i]);
+ }
}
/*
@@ -1279,13 +1351,17 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* If we are freeing inodes, we want to make sure all delayed iputs have
* completed, because they could have been on an inode with i_nlink == 0, and
* thus have been truncated and freed up space. But again this space is not
- * immediately re-usable, it comes in the form of a delayed ref, which must be
+ * immediately reusable, it comes in the form of a delayed ref, which must be
* run and then the transaction must be committed.
*
* COMMIT_TRANS
* This is where we reclaim all of the pinned space generated by running the
* iputs
*
+ * RESET_ZONES
+ * This state works only for the zoned mode. We scan the unused block group
+ * list and reset the zones and reuse the block group.
+ *
* ALLOC_CHUNK_FORCE
* For data we start with alloc chunk force, however we could have been full
* before, and then the transaction commit could have freed new block groups,
@@ -1295,19 +1371,16 @@ static const enum btrfs_flush_state data_flush_states[] = {
FLUSH_DELALLOC_FULL,
RUN_DELAYED_IPUTS,
COMMIT_TRANS,
+ RESET_ZONES,
ALLOC_CHUNK_FORCE,
};
-static void btrfs_async_reclaim_data_space(struct work_struct *work)
+static void do_async_reclaim_data_space(struct btrfs_space_info *space_info)
{
- struct btrfs_fs_info *fs_info;
- struct btrfs_space_info *space_info;
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 last_tickets_id;
enum btrfs_flush_state flush_state = 0;
- fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
- space_info = fs_info->data_sinfo;
-
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
space_info->flush = 0;
@@ -1375,6 +1448,19 @@ aborted_fs:
spin_unlock(&space_info->lock);
}
+static void btrfs_async_reclaim_data_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
+ space_info = fs_info->data_sinfo;
+ do_async_reclaim_data_space(space_info);
+ for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++)
+ if (space_info->sub_group[i])
+ do_async_reclaim_data_space(space_info->sub_group[i]);
+}
+
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
@@ -1386,6 +1472,7 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
static const enum btrfs_flush_state priority_flush_states[] = {
FLUSH_DELAYED_ITEMS_NR,
FLUSH_DELAYED_ITEMS,
+ RESET_ZONES,
ALLOC_CHUNK,
};
@@ -1399,6 +1486,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
FLUSH_DELALLOC_FULL,
ALLOC_CHUNK,
COMMIT_TRANS,
+ RESET_ZONES,
};
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
@@ -1488,8 +1576,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
spin_unlock(&space_info->lock);
}
-static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static void wait_reserve_ticket(struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
@@ -1547,7 +1634,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
case BTRFS_RESERVE_FLUSH_DATA:
case BTRFS_RESERVE_FLUSH_ALL:
case BTRFS_RESERVE_FLUSH_ALL_STEAL:
- wait_reserve_ticket(fs_info, space_info, ticket);
+ wait_reserve_ticket(space_info, ticket);
break;
case BTRFS_RESERVE_FLUSH_LIMIT:
priority_reclaim_metadata_space(fs_info, space_info, ticket,
@@ -1691,8 +1778,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
if (!pending_tickets &&
((used + orig_bytes <= space_info->total_bytes) ||
btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
- btrfs_space_info_update_bytes_may_use(fs_info, space_info,
- orig_bytes);
+ btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
ret = 0;
}
@@ -1704,8 +1790,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
used = btrfs_space_info_used(space_info, false);
if (used + orig_bytes <= space_info->total_bytes) {
- btrfs_space_info_update_bytes_may_use(fs_info, space_info,
- orig_bytes);
+ btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
ret = 0;
}
}
@@ -1817,10 +1902,10 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* This will reserve bytes from the data space info. If there is not enough
* space then we will attempt to flush space as specified by flush.
*/
-int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
int ret;
ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
@@ -1828,12 +1913,12 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
flush == BTRFS_RESERVE_NO_FLUSH);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
- ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
+ ret = __reserve_bytes(fs_info, space_info, bytes, flush);
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
- data_sinfo->flags, bytes, 1);
+ space_info->flags, bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
+ btrfs_dump_space_info(fs_info, space_info, bytes, 0);
}
return ret;
}
@@ -1984,8 +2069,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info)
return unalloc < data_chunk_size;
}
-static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, int raid)
+static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid)
{
struct btrfs_block_group *bg;
int thresh_pct;
@@ -2081,6 +2165,35 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
if (!btrfs_should_periodic_reclaim(space_info))
continue;
for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++)
- do_reclaim_sweep(fs_info, space_info, raid);
+ do_reclaim_sweep(space_info, raid);
+ }
+}
+
+void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+
+ lockdep_assert_held(&space_info->lock);
+
+ /* Prioritize the global reservation to receive the freed space. */
+ if (global_rsv->space_info != space_info)
+ goto grant;
+
+ spin_lock(&global_rsv->lock);
+ if (!global_rsv->full) {
+ u64 to_add = min(len, global_rsv->size - global_rsv->reserved);
+
+ global_rsv->reserved += to_add;
+ btrfs_space_info_update_bytes_may_use(space_info, to_add);
+ if (global_rsv->reserved >= global_rsv->size)
+ global_rsv->full = 1;
+ len -= to_add;
}
+ spin_unlock(&global_rsv->lock);
+
+grant:
+ /* Add to any tickets we may have. */
+ if (len)
+ btrfs_try_granting_tickets(fs_info, space_info);
}