Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/backref.h                |   4
-rw-r--r--  fs/btrfs/block-group.c            |  48
-rw-r--r--  fs/btrfs/compression.c            |   2
-rw-r--r--  fs/btrfs/ctree.c                  |   1
-rw-r--r--  fs/btrfs/direct-io.c              |   4
-rw-r--r--  fs/btrfs/discard.c                |  51
-rw-r--r--  fs/btrfs/disk-io.c                |  33
-rw-r--r--  fs/btrfs/extent-io-tree.c         |   6
-rw-r--r--  fs/btrfs/extent-tree.c            |  33
-rw-r--r--  fs/btrfs/extent_io.c              |   9
-rw-r--r--  fs/btrfs/extent_io.h              |   4
-rw-r--r--  fs/btrfs/extent_map.c             | 132
-rw-r--r--  fs/btrfs/extent_map.h             |   3
-rw-r--r--  fs/btrfs/file.c                   |  21
-rw-r--r--  fs/btrfs/free-space-tree.c        |  16
-rw-r--r--  fs/btrfs/fs.h                     |   3
-rw-r--r--  fs/btrfs/inode.c                  | 356
-rw-r--r--  fs/btrfs/ioctl.c                  |   4
-rw-r--r--  fs/btrfs/ordered-data.c           |  14
-rw-r--r--  fs/btrfs/qgroup.c                 |  13
-rw-r--r--  fs/btrfs/raid56.c                 |   5
-rw-r--r--  fs/btrfs/relocation.c             |  19
-rw-r--r--  fs/btrfs/scrub.c                  |  38
-rw-r--r--  fs/btrfs/send.c                   |   6
-rw-r--r--  fs/btrfs/super.c                  |  17
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c  |   6
-rw-r--r--  fs/btrfs/transaction.c            |   6
-rw-r--r--  fs/btrfs/tree-log.c               | 484
-rw-r--r--  fs/btrfs/volumes.c                |  97
-rw-r--r--  fs/btrfs/zoned.c                  |   5
-rw-r--r--  fs/btrfs/zstd.c                   |   2
31 files changed, 819 insertions(+), 623 deletions(-)
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index e8c22cccb5c1..7dfcc9351bce 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -427,8 +427,8 @@ struct btrfs_backref_node *btrfs_backref_alloc_node(
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
struct btrfs_backref_cache *cache);
-#define LINK_LOWER (1 << 0)
-#define LINK_UPPER (1 << 1)
+#define LINK_LOWER (1U << 0)
+#define LINK_UPPER (1U << 1)
void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
struct btrfs_backref_node *lower,
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 4423d8b716a5..7eef79ece5b3 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -34,6 +34,19 @@ int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group
}
#endif
+static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group)
+{
+ /* The meta_write_pointer is available only on the zoned setup. */
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return false;
+
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ return block_group->start + block_group->alloc_offset >
+ block_group->meta_write_pointer;
+}
+
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
* is not in progress
@@ -1249,6 +1262,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
spin_lock(&block_group->lock);
+ /*
+ * Hitting this WARN means we removed a block group with an unwritten
+ * region. It will cause "unable to find chunk map for logical" errors.
+ */
+ if (WARN_ON(has_unwritten_metadata(block_group)))
+ btrfs_warn(fs_info,
+ "block group %llu is removed before metadata write out",
+ block_group->start);
+
set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
/*
@@ -1567,8 +1589,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* needing to allocate extents from the block group.
*/
used = btrfs_space_info_used(space_info, true);
- if (space_info->total_bytes - block_group->length < used &&
- block_group->zone_unusable < block_group->length) {
+ if ((space_info->total_bytes - block_group->length < used &&
+ block_group->zone_unusable < block_group->length) ||
+ has_unwritten_metadata(block_group)) {
/*
* Add a reference for the list, compensate for the ref
* drop under the "next" label for the
@@ -1891,6 +1914,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
up_write(&space_info->groups_sem);
goto next;
}
+
+ /*
+ * Cache the zone_unusable value before turning the block group
+ * to read only. As soon as the block group is read only it's
+ * zone_unusable value gets moved to the block group's read-only
+ * bytes and isn't available for calculations anymore. We also
+ * cache it before unlocking the block group, to prevent races
+ * (reports from KCSAN and such tools) with tasks updating it.
+ */
+ zone_unusable = bg->zone_unusable;
+
spin_unlock(&bg->lock);
spin_unlock(&space_info->lock);
@@ -1907,13 +1941,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
goto next;
}
- /*
- * Cache the zone_unusable value before turning the block group
- * to read only. As soon as the blog group is read only it's
- * zone_unusable value gets moved to the block group's read-only
- * bytes and isn't available for calculations anymore.
- */
- zone_unusable = bg->zone_unusable;
ret = inc_block_group_ro(bg, 0);
up_write(&space_info->groups_sem);
if (ret < 0)
@@ -2776,8 +2803,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
/* Already aborted the transaction if it failed. */
next:
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+ spin_unlock(&fs_info->unused_bgs_lock);
/*
* If the block group is still unused, add it to the list of
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 40332ab62f10..65d883da86c6 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -606,7 +606,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
free_extent_map(em);
cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
- cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS);
+ cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS);
if (!cb->compressed_folios) {
ret = BLK_STS_RESOURCE;
goto out_free_bio;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 29c164597401..3ba15d9c3e88 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2901,6 +2901,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
if (ret < 0) {
int ret2;
+ btrfs_clear_buffer_dirty(trans, c);
ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
if (ret2 < 0)
btrfs_abort_transaction(trans, ret2);
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index bd38df5647e3..71984d7db839 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -151,8 +151,8 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
}
ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
- (1 << type) |
- (1 << BTRFS_ORDERED_DIRECT));
+ (1U << type) |
+ (1U << BTRFS_ORDERED_DIRECT));
if (IS_ERR(ordered)) {
if (em) {
free_extent_map(em);
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index e815d165cccc..de23c4b3515e 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -94,8 +94,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
lockdep_assert_held(&discard_ctl->lock);
- if (!btrfs_run_discard_work(discard_ctl))
- return;
if (list_empty(&block_group->discard_list) ||
block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
@@ -118,6 +116,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
if (!btrfs_is_block_group_data_only(block_group))
return;
+ if (!btrfs_run_discard_work(discard_ctl))
+ return;
+
spin_lock(&discard_ctl->lock);
__add_to_discard_list(discard_ctl, block_group);
spin_unlock(&discard_ctl->lock);
@@ -167,13 +168,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
block_group->discard_eligible_time = 0;
queued = !list_empty(&block_group->discard_list);
list_del_init(&block_group->discard_list);
- /*
- * If the block group is currently running in the discard workfn, we
- * don't want to deref it, since it's still being used by the workfn.
- * The workfn will notice this case and deref the block group when it is
- * finished.
- */
- if (queued && !running)
+ if (queued)
btrfs_put_block_group(block_group);
spin_unlock(&discard_ctl->lock);
@@ -250,6 +245,18 @@ again:
block_group->used != 0) {
if (btrfs_is_block_group_data_only(block_group)) {
__add_to_discard_list(discard_ctl, block_group);
+ /*
+ * The block group must have been moved to other
+ * discard list even if discard was disabled in
+ * the meantime or a transaction abort happened,
+ * otherwise we can end up in an infinite loop,
+ * always jumping into the 'again' label and
+ * keep getting this block group over and over
+ * in case there are no other block groups in
+ * the discard lists.
+ */
+ ASSERT(block_group->discard_index !=
+ BTRFS_DISCARD_INDEX_UNUSED);
} else {
list_del_init(&block_group->discard_list);
btrfs_put_block_group(block_group);
@@ -260,9 +267,10 @@ again:
block_group->discard_cursor = block_group->start;
block_group->discard_state = BTRFS_DISCARD_EXTENTS;
}
- discard_ctl->block_group = block_group;
}
if (block_group) {
+ btrfs_get_block_group(block_group);
+ discard_ctl->block_group = block_group;
*discard_state = block_group->discard_state;
*discard_index = block_group->discard_index;
}
@@ -493,9 +501,20 @@ static void btrfs_discard_workfn(struct work_struct *work)
block_group = peek_discard_list(discard_ctl, &discard_state,
&discard_index, now);
- if (!block_group || !btrfs_run_discard_work(discard_ctl))
+ if (!block_group)
+ return;
+ if (!btrfs_run_discard_work(discard_ctl)) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
return;
+ }
if (now < block_group->discard_eligible_time) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
btrfs_discard_schedule_work(discard_ctl, false);
return;
}
@@ -547,15 +566,7 @@ static void btrfs_discard_workfn(struct work_struct *work)
spin_lock(&discard_ctl->lock);
discard_ctl->prev_discard = trimmed;
discard_ctl->prev_discard_time = now;
- /*
- * If the block group was removed from the discard list while it was
- * running in this workfn, then we didn't deref it, since this function
- * still owned that reference. But we set the discard_ctl->block_group
- * back to NULL, so we can use that condition to know that now we need
- * to deref the block_group.
- */
- if (discard_ctl->block_group == NULL)
- btrfs_put_block_group(block_group);
+ btrfs_put_block_group(block_group);
discard_ctl->block_group = NULL;
__btrfs_discard_schedule_work(discard_ctl, now, false);
spin_unlock(&discard_ctl->lock);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 19e5f8eaae77..e655fa3bfd9b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2168,8 +2168,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
found = true;
root = read_tree_root_path(tree_root, path, &key);
if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
- ret = PTR_ERR(root);
+ ret = PTR_ERR(root);
break;
}
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
@@ -2786,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
btrfs_init_scrub(fs_info);
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
+ btrfs_init_extent_map_shrinker_work(fs_info);
rwlock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
@@ -4255,6 +4255,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_cleanup_defrag_inodes(fs_info);
/*
+ * Handle the error fs first, as it will flush and wait for all ordered
+ * extents. This will generate delayed iputs, thus we want to handle
+ * it first.
+ */
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
+ btrfs_error_commit_super(fs_info);
+
+ /*
* Wait for any fixup workers to complete.
* If we don't wait for them here and they are still running by the time
* we call kthread_stop() against the cleaner kthread further below, we
@@ -4275,6 +4283,19 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_flush_workqueue(fs_info->delalloc_workers);
/*
+ * We can have ordered extents getting their last reference dropped from
+ * the fs_info->workers queue because for async writes for data bios we
+ * queue a work for that queue, at btrfs_wq_submit_bio(), that runs
+ * run_one_async_done() which calls btrfs_bio_end_io() in case the bio
+ * has an error, and that later function can do the final
+ * btrfs_put_ordered_extent() on the ordered extent attached to the bio,
+ * which adds a delayed iput for the inode. So we must flush the queue
+ * so that we don't have delayed iputs after committing the current
+ * transaction below and stopping the cleaner and transaction kthreads.
+ */
+ btrfs_flush_workqueue(fs_info->workers);
+
+ /*
* When finishing a compressed write bio we schedule a work queue item
* to finish an ordered extent - btrfs_finish_compressed_write_work()
* calls btrfs_finish_ordered_extent() which in turns does a call to
@@ -4314,6 +4335,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
+ cancel_work_sync(&fs_info->extent_map_shrinker_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4343,9 +4365,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (BTRFS_FS_ERROR(fs_info))
- btrfs_error_commit_super(fs_info);
-
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
@@ -4468,10 +4487,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
/* cleanup FS via transaction */
btrfs_cleanup_transaction(fs_info);
- mutex_lock(&fs_info->cleaner_mutex);
- btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_mutex);
-
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
}
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index 6d08c100b01d..5f9a43734812 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -1252,8 +1252,11 @@ hit_next:
if (!prealloc)
goto search_again;
ret = split_state(tree, state, prealloc, end + 1);
- if (ret)
+ if (ret) {
extent_io_tree_panic(tree, state, "split", ret);
+ prealloc = NULL;
+ goto out;
+ }
set_state_bits(tree, prealloc, bits, changeset);
cache_state(prealloc, cached_state);
@@ -1456,6 +1459,7 @@ hit_next:
if (IS_ERR(inserted_state)) {
ret = PTR_ERR(inserted_state);
extent_io_tree_panic(tree, prealloc, "insert", ret);
+ goto out;
}
cache_state(inserted_state, cached_state);
if (inserted_state == prealloc)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4ceffbef3298..bb3602059906 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3654,6 +3654,21 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
btrfs_put_block_group(cache);
}
+static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl,
+ const struct btrfs_block_group *bg)
+{
+ if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
+ return true;
+ if (!btrfs_block_group_should_use_size_class(bg))
+ return true;
+ if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
+ return true;
+ if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
+ bg->size_class == BTRFS_BG_SZ_NONE)
+ return true;
+ return ffe_ctl->size_class == bg->size_class;
+}
+
/*
* Helper function for find_free_extent().
*
@@ -3675,7 +3690,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
if (!cluster_bg)
goto refill_cluster;
if (cluster_bg != bg && (cluster_bg->ro ||
- !block_group_bits(cluster_bg, ffe_ctl->flags)))
+ !block_group_bits(cluster_bg, ffe_ctl->flags) ||
+ !find_free_extent_check_size_class(ffe_ctl, cluster_bg)))
goto release_cluster;
offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
@@ -4231,21 +4247,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
-static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
- struct btrfs_block_group *bg)
-{
- if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
- return true;
- if (!btrfs_block_group_should_use_size_class(bg))
- return true;
- if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
- return true;
- if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
- bg->size_class == BTRFS_BG_SZ_NONE)
- return true;
- return ffe_ctl->size_class == bg->size_class;
-}
-
static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6551fb003eed..d322cf82783f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1870,7 +1870,7 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&folio->mapping->i_private_lock);
- bit_start++;
+ bit_start += sectors_per_node;
continue;
}
@@ -2826,10 +2826,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
}
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -2865,8 +2865,11 @@ again:
free_eb:
btrfs_release_extent_buffer(eb);
return exists;
-}
+#else
+ /* Stub to avoid linker error when compiled with optimizations turned off. */
+ return NULL;
#endif
+}
static struct extent_buffer *grab_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 8a36117ed453..039a73731135 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -79,7 +79,7 @@ enum {
* single word in a bitmap may straddle two pages in the extent buffer.
*/
#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
-#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
+#define BYTE_MASK ((1U << BITS_PER_BYTE) - 1)
#define BITMAP_FIRST_BYTE_MASK(start) \
((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
#define BITMAP_LAST_BYTE_MASK(nbits) \
@@ -293,6 +293,8 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
*/
static inline int num_extent_folios(const struct extent_buffer *eb)
{
+ if (!eb->folios[0])
+ return 0;
if (folio_order(eb->folios[0]))
return 1;
return num_extent_pages(eb);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1d93e1202c33..36af9aa9aab1 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1128,11 +1128,14 @@ struct btrfs_em_shrink_ctx {
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
- const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
struct extent_map_tree *tree = &inode->extent_tree;
long nr_dropped = 0;
struct rb_node *node;
+ lockdep_assert_held_write(&tree->lock);
+
/*
* Take the mmap lock so that we serialize with the inode logging phase
* of fsync because we may need to set the full sync flag on the inode,
@@ -1144,28 +1147,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
* to find new extents, which may not be there yet because ordered
* extents haven't completed yet.
*
- * We also do a try lock because otherwise we could deadlock. This is
- * because the shrinker for this filesystem may be invoked while we are
- * in a path that is holding the mmap lock in write mode. For example in
- * a reflink operation while COWing an extent buffer, when allocating
- * pages for a new extent buffer and under memory pressure, the shrinker
- * may be invoked, and therefore we would deadlock by attempting to read
- * lock the mmap lock while we are holding already a write lock on it.
+ * We also do a try lock because we don't want to block for too long and
+ * we are holding the extent map tree's lock in write mode.
*/
if (!down_read_trylock(&inode->i_mmap_lock))
return 0;
- /*
- * We want to be fast so if the lock is busy we don't want to spend time
- * waiting for it - either some task is about to do IO for the inode or
- * we may have another task shrinking extent maps, here in this code, so
- * skip this inode.
- */
- if (!write_trylock(&tree->lock)) {
- up_read(&inode->i_mmap_lock);
- return 0;
- }
-
node = rb_first(&tree->root);
while (node) {
struct rb_node *next = rb_next(node);
@@ -1201,36 +1188,89 @@ next:
* lock. This is to avoid slowing other tasks trying to take the
* lock.
*/
- if (need_resched() || rwlock_needbreak(&tree->lock))
+ if (need_resched() || rwlock_needbreak(&tree->lock) ||
+ btrfs_fs_closing(fs_info))
break;
node = next;
}
- write_unlock(&tree->lock);
up_read(&inode->i_mmap_lock);
return nr_dropped;
}
+static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
+ u64 min_ino)
+{
+ struct btrfs_inode *inode;
+ unsigned long from = min_ino;
+
+ xa_lock(&root->inodes);
+ while (true) {
+ struct extent_map_tree *tree;
+
+ inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+ if (!inode)
+ break;
+
+ tree = &inode->extent_tree;
+
+ /*
+ * We want to be fast so if the lock is busy we don't want to
+ * spend time waiting for it (some task is about to do IO for
+ * the inode).
+ */
+ if (!write_trylock(&tree->lock))
+ goto next;
+
+ /*
+ * Skip inode if it doesn't have loaded extent maps, so we avoid
+ * getting a reference and doing an iput later. This includes
+ * cases like files that were opened for things like stat(2), or
+ * files with all extent maps previously released through the
+ * release folio callback (btrfs_release_folio()) or released in
+ * a previous run, or directories which never have extent maps.
+ */
+ if (RB_EMPTY_ROOT(&tree->root)) {
+ write_unlock(&tree->lock);
+ goto next;
+ }
+
+ if (igrab(&inode->vfs_inode))
+ break;
+
+ write_unlock(&tree->lock);
+next:
+ from = btrfs_ino(inode) + 1;
+ cond_resched_lock(&root->inodes.xa_lock);
+ }
+ xa_unlock(&root->inodes);
+
+ return inode;
+}
+
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_inode *inode;
long nr_dropped = 0;
u64 min_ino = ctx->last_ino + 1;
- inode = btrfs_find_first_inode(root, min_ino);
+ inode = find_first_inode_to_shrink(root, min_ino);
while (inode) {
nr_dropped += btrfs_scan_inode(inode, ctx);
+ write_unlock(&inode->extent_tree.lock);
min_ino = btrfs_ino(inode) + 1;
ctx->last_ino = btrfs_ino(inode);
- btrfs_add_delayed_iput(inode);
+ iput(&inode->vfs_inode);
- if (ctx->scanned >= ctx->nr_to_scan)
+ if (ctx->scanned >= ctx->nr_to_scan ||
+ btrfs_fs_closing(fs_info))
break;
cond_resched();
- inode = btrfs_find_first_inode(root, min_ino);
+ inode = find_first_inode_to_shrink(root, min_ino);
}
if (inode) {
@@ -1254,16 +1294,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
return nr_dropped;
}
-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
{
+ struct btrfs_fs_info *fs_info;
struct btrfs_em_shrink_ctx ctx;
u64 start_root_id;
u64 next_root_id;
bool cycled = false;
long nr_dropped = 0;
+ fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work);
+
ctx.scanned = 0;
- ctx.nr_to_scan = nr_to_scan;
+ ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan);
/*
* In case we have multiple tasks running this shrinker, make the next
@@ -1281,12 +1324,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan,
nr, ctx.last_root,
ctx.last_ino);
}
- while (ctx.scanned < ctx.nr_to_scan) {
+ while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
struct btrfs_root *root;
unsigned long count;
@@ -1344,5 +1387,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
ctx.last_ino);
}
- return nr_dropped;
+ atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
+}
+
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+{
+ /*
+ * Do nothing if the shrinker is already running. In case of high memory
+ * pressure we can have a lot of tasks calling us and all passing the
+ * same nr_to_scan value, but in reality we may need only to free
+ * nr_to_scan extent maps (or less). In case we need to free more than
+ * that, we will be called again by the fs shrinker, so no worries about
+ * not doing enough work to reclaim memory from extent maps.
+ * We can also be repeatedly called with the same nr_to_scan value
+ * simply because the shrinker runs asynchronously and multiple calls
+ * to this function are made before the shrinker does enough progress.
+ *
+ * That's why we set the atomic counter to nr_to_scan only if its
+ * current value is zero, instead of incrementing the counter by
+ * nr_to_scan.
+ */
+ if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
+ return;
+
+ queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work);
+}
+
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
+{
+ atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
+ INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker);
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 5154a8f1d26c..cd123b266b64 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
struct extent_map *new_em,
bool modified);
-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index eaa991e69804..0e63603ac5c7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1912,6 +1912,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
struct extent_changeset *data_reserved = NULL;
unsigned long zero_start;
loff_t size;
+ size_t fsize = folio_size(folio);
vm_fault_t ret;
int ret2;
int reserved = 0;
@@ -1922,7 +1923,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
ASSERT(folio_order(folio) == 0);
- reserved_space = PAGE_SIZE;
+ reserved_space = fsize;
sb_start_pagefault(inode->i_sb);
page_start = folio_pos(folio);
@@ -1976,7 +1977,7 @@ again:
* We can't set the delalloc bits if there are pending ordered
* extents. Drop our locks and wait for them to finish.
*/
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize);
if (ordered) {
unlock_extent(io_tree, page_start, page_end, &cached_state);
folio_unlock(folio);
@@ -1988,11 +1989,11 @@ again:
if (folio->index == ((size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start, fs_info->sectorsize);
- if (reserved_space < PAGE_SIZE) {
+ if (reserved_space < fsize) {
end = page_start + reserved_space - 1;
btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, page_start,
- PAGE_SIZE - reserved_space, true);
+ data_reserved, end + 1,
+ fsize - reserved_space, true);
}
}
@@ -2019,12 +2020,12 @@ again:
if (page_start + folio_size(folio) > size)
zero_start = offset_in_folio(folio, size);
else
- zero_start = PAGE_SIZE;
+ zero_start = fsize;
- if (zero_start != PAGE_SIZE)
+ if (zero_start != fsize)
folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
- btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
@@ -2033,7 +2034,7 @@ again:
unlock_extent(io_tree, page_start, page_end, &cached_state);
up_read(&BTRFS_I(inode)->i_mmap_lock);
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
@@ -2042,7 +2043,7 @@ out_unlock:
folio_unlock(folio);
up_read(&BTRFS_I(inode)->i_mmap_lock);
out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
reserved_space, (ret != 0));
out_noreserve:
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 7ba50e133921..308abbf8855b 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1104,11 +1104,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
if (ret < 0)
goto out_locked;
- ASSERT(ret == 0);
+ /*
+ * If ret is 1 (no key found), it means this is an empty block group,
+ * without any extents allocated from it and there's no block group
+ * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree
+ * because we are using the block group tree feature, so block group
+ * items are stored in the block group tree. It also means there are no
+ * extents allocated for block groups with a start offset beyond this
+ * block group's end offset (this is the last, highest, block group).
+ */
+ if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE))
+ ASSERT(ret == 0);
start = block_group->start;
end = block_group->start + block_group->length;
- while (1) {
+ while (ret == 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
@@ -1138,8 +1148,6 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_next_item(extent_root, path);
if (ret < 0)
goto out_locked;
- if (ret)
- break;
}
if (start < end) {
ret = __add_to_free_space_tree(trans, block_group, path2,
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index cbfb225858a5..374843aca60d 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -285,6 +285,7 @@ enum {
#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
+#define BTRFS_WARNING_COMMIT_INTERVAL (300)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
struct btrfs_dev_replace {
@@ -638,6 +639,8 @@ struct btrfs_fs_info {
spinlock_t extent_map_shrinker_lock;
u64 extent_map_shrinker_last_root;
u64 extent_map_shrinker_last_ino;
+ atomic64_t extent_map_shrinker_nr_to_scan;
+ struct work_struct extent_map_shrinker_work;
/* Protected by 'trans_lock'. */
struct list_head dirty_cowonly_roots;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b842276573e..f84e3f9fad84 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1187,6 +1187,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct extent_state *cached = NULL;
struct extent_map *em;
int ret = 0;
+ bool free_pages = false;
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
@@ -1207,7 +1208,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
}
if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
+ ASSERT(!async_extent->folios);
+ ASSERT(async_extent->nr_folios == 0);
submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
@@ -1223,6 +1227,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
* fall back to uncompressed.
*/
submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
@@ -1244,7 +1249,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- 1 << BTRFS_ORDERED_COMPRESSED);
+ 1U << BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -1264,6 +1269,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
done:
if (async_chunk->blkcg_css)
kthread_associate_blkcg(NULL);
+ if (free_pages)
+ free_async_extent_pages(async_extent);
kfree(async_extent);
return;
@@ -1402,6 +1409,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
/*
+ * We're not doing compressed IO, don't unlock the first page (which
+ * the caller expects to stay locked), don't clear any dirty bits and
+ * don't set any writeback bits.
+ *
+ * Do set the Ordered (Private2) bit so we know this page was properly
+ * setup for writepage.
+ */
+ page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
+ page_ops |= PAGE_SET_ORDERED;
+
+ /*
* Relocation relies on the relocated extents to have exactly the same
* size as the original extents. Normally writeback for relocation data
* extents follows a NOCOW path because relocation preallocates the
@@ -1445,8 +1463,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
continue;
}
if (done_offset) {
- *done_offset = start - 1;
- return 0;
+ /*
+ * Move @end to the end of the processed range,
+ * and exit the loop to unlock the processed extents.
+ */
+ end = start - 1;
+ ret = 0;
+ break;
}
ret = -ENOSPC;
}
@@ -1463,6 +1486,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
file_extent.offset = 0;
file_extent.compression = BTRFS_COMPRESS_NONE;
+ /*
+ * Locked range will be released either during error clean up or
+ * after the whole range is finished.
+ */
lock_extent(&inode->io_tree, start, start + ram_size - 1,
&cached);
@@ -1477,7 +1504,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- 1 << BTRFS_ORDERED_REGULAR);
+ 1U << BTRFS_ORDERED_REGULAR);
if (IS_ERR(ordered)) {
unlock_extent(&inode->io_tree, start,
start + ram_size - 1, &cached);
@@ -1508,27 +1535,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- /*
- * We're not doing compressed IO, don't unlock the first page
- * (which the caller expects to stay locked), don't clear any
- * dirty bits and don't set any writeback bits
- *
- * Do set the Ordered (Private2) bit so we know this page was
- * properly setup for writepage.
- */
- page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
- page_ops |= PAGE_SET_ORDERED;
-
- extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
- locked_folio, &cached,
- EXTENT_LOCKED | EXTENT_DELALLOC,
- page_ops);
- if (num_bytes < cur_alloc_size)
+ if (num_bytes < ram_size)
num_bytes = 0;
else
- num_bytes -= cur_alloc_size;
+ num_bytes -= ram_size;
alloc_hint = ins.objectid + ins.offset;
- start += cur_alloc_size;
+ start += ram_size;
extent_reserved = false;
/*
@@ -1539,6 +1551,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret)
goto out_unlock;
}
+ extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
done:
if (done_offset)
*done_offset = end;
@@ -1554,40 +1568,35 @@ out_unlock:
* Now, we have three regions to clean up:
*
* |-------(1)----|---(2)---|-------------(3)----------|
- * `- orig_start `- start `- start + cur_alloc_size `- end
+ * `- orig_start `- start `- start + ram_size `- end
*
* We process each region below.
*/
- clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
- page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
-
/*
* For the range (1). We have already instantiated the ordered extents
* for this region. They are cleaned up by
* btrfs_cleanup_ordered_extents() in e.g,
- * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
- * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
- * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
- * function.
+ * btrfs_run_delalloc_range().
+ * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
+ * are also handled by the cleanup function.
*
- * However, in case of @keep_locked, we still need to unlock the pages
- * (except @locked_folio) to ensure all the pages are unlocked.
+ * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and
+ * finish the writeback of the involved folios, which will be never submitted.
*/
- if (keep_locked && orig_start < start) {
+ if (orig_start < start) {
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
- locked_folio, NULL, 0, page_ops);
+ locked_folio, NULL, clear_bits, page_ops);
}
- /*
- * At this point we're unlocked, we want to make sure we're only
- * clearing these flags under the extent lock, so lock the rest of the
- * range and clear everything up.
- */
- lock_extent(&inode->io_tree, start, end, NULL);
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
/*
* For the range (2). If we reserved an extent for our delalloc range
@@ -1601,11 +1610,11 @@ out_unlock:
*/
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
- start + cur_alloc_size - 1,
+ start + ram_size - 1,
locked_folio, &cached, clear_bits,
page_ops);
- btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
- start += cur_alloc_size;
+ btrfs_qgroup_free_data(inode, NULL, start, ram_size, NULL);
+ start += ram_size;
}
/*
@@ -2048,6 +2057,63 @@ static void cleanup_dirty_folios(struct btrfs_inode *inode,
mapping_set_error(mapping, error);
}
+static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
+ struct extent_state **cached,
+ struct can_nocow_file_extent_args *nocow_args,
+ u64 file_pos, bool is_prealloc)
+{
+ struct btrfs_ordered_extent *ordered;
+ u64 len = nocow_args->file_extent.num_bytes;
+ u64 end = file_pos + len - 1;
+ int ret = 0;
+
+ lock_extent(&inode->io_tree, file_pos, end, cached);
+
+ if (is_prealloc) {
+ struct extent_map *em;
+
+ em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
+ BTRFS_ORDERED_PREALLOC);
+ if (IS_ERR(em)) {
+ unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(em);
+ }
+ free_extent_map(em);
+ }
+
+ ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
+ is_prealloc
+ ? (1U << BTRFS_ORDERED_PREALLOC)
+ : (1U << BTRFS_ORDERED_NOCOW));
+ if (IS_ERR(ordered)) {
+ if (is_prealloc)
+ btrfs_drop_extent_map_range(inode, file_pos, end, false);
+ unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(ordered);
+ }
+
+ if (btrfs_is_data_reloc_root(inode->root))
+ /*
+ * Errors are handled later, as we must prevent
+ * extent_clear_unlock_delalloc() in error handler from freeing
+ * metadata of the created ordered extent.
+ */
+ ret = btrfs_reloc_clone_csums(ordered);
+ btrfs_put_ordered_extent(ordered);
+
+ extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_SET_ORDERED);
+
+ /*
+ * btrfs_reloc_clone_csums() error, now we're OK to call error handler,
+ * as metadata for created ordered extent will only be freed by
+ * btrfs_finish_ordered_io().
+ */
+ return ret;
+}
+
/*
* when nowcow writeback call back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
@@ -2092,15 +2158,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
while (cur_offset <= end) {
struct btrfs_block_group *nocow_bg = NULL;
- struct btrfs_ordered_extent *ordered;
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
struct extent_state *cached_state = NULL;
u64 extent_end;
- u64 nocow_end;
int extent_type;
- bool is_prealloc;
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0);
@@ -2155,12 +2218,13 @@ next_slot:
/*
* If the found extent starts after requested offset, then
- * adjust extent_end to be right before this extent begins
+ * adjust cur_offset to be right before this extent begins.
*/
if (found_key.offset > cur_offset) {
- extent_end = found_key.offset;
- extent_type = 0;
- goto must_cow;
+ if (cow_start == (u64)-1)
+ cow_start = cur_offset;
+ cur_offset = found_key.offset;
+ goto next_slot;
}
/*
@@ -2234,67 +2298,13 @@ must_cow:
}
}
- nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
- lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);
-
- is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
- if (is_prealloc) {
- struct extent_map *em;
-
- em = btrfs_create_io_em(inode, cur_offset,
- &nocow_args.file_extent,
- BTRFS_ORDERED_PREALLOC);
- if (IS_ERR(em)) {
- unlock_extent(&inode->io_tree, cur_offset,
- nocow_end, &cached_state);
- btrfs_dec_nocow_writers(nocow_bg);
- ret = PTR_ERR(em);
- goto error;
- }
- free_extent_map(em);
- }
-
- ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
- &nocow_args.file_extent,
- is_prealloc
- ? (1 << BTRFS_ORDERED_PREALLOC)
- : (1 << BTRFS_ORDERED_NOCOW));
+ ret = nocow_one_range(inode, locked_folio, &cached_state,
+ &nocow_args, cur_offset,
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC);
btrfs_dec_nocow_writers(nocow_bg);
- if (IS_ERR(ordered)) {
- if (is_prealloc) {
- btrfs_drop_extent_map_range(inode, cur_offset,
- nocow_end, false);
- }
- unlock_extent(&inode->io_tree, cur_offset,
- nocow_end, &cached_state);
- ret = PTR_ERR(ordered);
+ if (ret < 0)
goto error;
- }
-
- if (btrfs_is_data_reloc_root(root))
- /*
- * Error handled later, as we must prevent
- * extent_clear_unlock_delalloc() in error handler
- * from freeing metadata of created ordered extent.
- */
- ret = btrfs_reloc_clone_csums(ordered);
- btrfs_put_ordered_extent(ordered);
-
- extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
- locked_folio, &cached_state,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_ORDERED);
-
cur_offset = extent_end;
-
- /*
- * btrfs_reloc_clone_csums() error, now we're OK to call error
- * handler, as metadata for created ordered extent will only
- * be freed by btrfs_finish_ordered_io().
- */
- if (ret)
- goto error;
}
btrfs_release_path(path);
@@ -4724,7 +4734,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
int ret = 0;
struct btrfs_trans_handle *trans;
- u64 last_unlink_trans;
struct fscrypt_name fname;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
@@ -4750,6 +4759,23 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
goto out_notrans;
}
+ /*
+ * Propagate the last_unlink_trans value of the deleted dir to its
+ * parent directory. This is to prevent an unrecoverable log tree in the
+ * case we do something like this:
+ * 1) create dir foo
+ * 2) create snapshot under dir foo
+ * 3) delete the snapshot
+ * 4) rmdir foo
+ * 5) mkdir foo
+ * 6) fsync foo or some file inside foo
+ *
+ * This is because we can't unlink other roots when replaying the dir
+ * deletes for directory foo.
+ */
+ if (BTRFS_I(inode)->last_unlink_trans >= trans->transid)
+ btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
+
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
goto out;
@@ -4759,27 +4785,11 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (ret)
goto out;
- last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
-
/* now the directory is empty */
ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
&fname.disk_name);
- if (!ret) {
+ if (!ret)
btrfs_i_size_write(BTRFS_I(inode), 0);
- /*
- * Propagate the last_unlink_trans value of the deleted dir to
- * its parent directory. This is to prevent an unrecoverable
- * log tree in the case we do something like this:
- * 1) create dir foo
- * 2) create snapshot under dir foo
- * 3) delete the snapshot
- * 4) rmdir foo
- * 5) mkdir foo
- * 6) fsync foo or some file inside foo
- */
- if (last_unlink_trans >= trans->transid)
- BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
- }
out:
btrfs_end_transaction(trans);
out_notrans:
@@ -4849,8 +4859,11 @@ again:
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
if (IS_ERR(folio)) {
- btrfs_delalloc_release_space(inode, data_reserved, block_start,
- blocksize, true);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, blocksize, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ block_start, blocksize, true);
btrfs_delalloc_release_extents(inode, blocksize);
ret = -ENOMEM;
goto out;
@@ -7986,6 +7999,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret;
int ret2;
bool need_abort = false;
+ bool logs_pinned = false;
struct fscrypt_name old_fname, new_fname;
struct fscrypt_str *old_name, *new_name;
@@ -8109,6 +8123,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
inode_inc_iversion(new_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
+ new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not for
+ * root entries) pin the log early to prevent any concurrent
+ * task from logging the directory after we removed the old
+ * entries and before we add the new entries, otherwise that
+ * task can sync a log without any entry for the inodes we are
+ * renaming and therefore replaying that log, if a power failure
+ * happens after syncing the log, would result in deleting the
+ * inodes.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure the that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent) {
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
@@ -8166,30 +8205,23 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
/*
- * Now pin the logs of the roots. We do it to ensure that no other task
- * can sync the logs while we are in progress with the rename, because
- * that could result in an inconsistency in case any of the inodes that
- * are part of this rename operation were logged before.
+ * Do the log updates for all inodes.
+ *
+ * If either entry is for a root we don't need to update the logs since
+ * we've called btrfs_set_log_full_commit() before.
*/
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(dest);
-
- /* Do the log updates for all inodes. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned) {
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
old_rename_ctx.index, new_dentry->d_parent);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
new_rename_ctx.index, old_dentry->d_parent);
+ }
- /* Now unpin the logs. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+out_fail:
+ if (logs_pinned) {
btrfs_end_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_end_log_trans(dest);
-out_fail:
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -8239,6 +8271,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
struct fscrypt_name old_fname, new_fname;
+ bool logs_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -8373,6 +8406,29 @@ static int btrfs_rename(struct mnt_idmap *idmap,
inode_inc_iversion(old_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not a
+ * root entry) pin the log to prevent any concurrent task from
+ * logging the directory after we removed the old entry and
+ * before we add the new entry, otherwise that task can sync
+ * a log without any entry for the inode we are renaming and
+ * therefore replaying that log, if a power failure happens
+ * after syncing the log, would result in deleting the inode.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure the that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
@@ -8421,7 +8477,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned)
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
rename_ctx.index, new_dentry->d_parent);
@@ -8437,6 +8493,10 @@ static int btrfs_rename(struct mnt_idmap *idmap,
}
}
out_fail:
+ if (logs_pinned) {
+ btrfs_end_log_trans(root);
+ btrfs_end_log_trans(dest);
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -9672,8 +9732,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- (1 << BTRFS_ORDERED_ENCODED) |
- (1 << BTRFS_ORDERED_COMPRESSED));
+ (1U << BTRFS_ORDERED_ENCODED) |
+ (1U << BTRFS_ORDERED_COMPRESSED));
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3e3722a73239..1706f6d9b12e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -758,14 +758,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
goto out;
}
+ btrfs_record_new_subvolume(trans, BTRFS_I(dir));
+
ret = btrfs_create_new_inode(trans, &new_inode_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- btrfs_record_new_subvolume(trans, BTRFS_I(dir));
-
d_instantiate_new(dentry, new_inode_args.inode);
new_inode_args.inode = NULL;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 4ed11b089ea9..880f9553d79d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -153,9 +153,10 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
struct btrfs_ordered_extent *entry;
int ret;
u64 qgroup_rsv = 0;
+ const bool is_nocow = (flags &
+ ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));
- if (flags &
- ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
+ if (is_nocow) {
/* For nocow write, we can release the qgroup rsv right now */
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
if (ret < 0)
@@ -170,8 +171,13 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
return ERR_PTR(ret);
}
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
- if (!entry)
+ if (!entry) {
+ if (!is_nocow)
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ btrfs_root_id(inode->root),
+ qgroup_rsv, BTRFS_QGROUP_RSV_DATA);
return ERR_PTR(-ENOMEM);
+ }
entry->file_offset = file_offset;
entry->num_bytes = num_bytes;
@@ -253,7 +259,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
* @disk_bytenr: Offset of extent on disk.
* @disk_num_bytes: Size of extent on disk.
* @offset: Offset into unencoded data where file data starts.
- * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*).
* @compress_type: Compression algorithm used for data.
*
* Most of these parameters correspond to &struct btrfs_file_extent_item. The
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e9f58cdeeb5f..6b181bf9f156 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1373,11 +1373,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
/*
* We have nothing held here and no trans handle, just return the error
- * if there is one.
+ * if there is one and set back the quota enabled bit since we didn't
+ * actually disable quotas.
*/
ret = flush_reservations(fs_info);
- if (ret)
+ if (ret) {
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
return ret;
+ }
/*
* 1 For the root item
@@ -1489,7 +1492,6 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup *src, int sign)
{
struct btrfs_qgroup *qgroup;
- struct btrfs_qgroup *cur;
LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
int ret = 0;
@@ -1499,7 +1501,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
goto out;
qgroup_iterator_add(&qgroup_list, qgroup);
- list_for_each_entry(cur, &qgroup_list, iterator) {
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
qgroup->rfer += sign * num_bytes;
@@ -1698,9 +1700,6 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
struct btrfs_qgroup *prealloc = NULL;
int ret = 0;
- if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
- return 0;
-
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 39bec672df0c..8afadf994b8c 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -200,8 +200,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
struct btrfs_stripe_hash_table *x;
struct btrfs_stripe_hash *cur;
struct btrfs_stripe_hash *h;
- int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
- int i;
+ unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;
if (info->stripe_hash_table)
return 0;
@@ -222,7 +221,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
h = table->table;
- for (i = 0; i < num_entries; i++) {
+ for (unsigned int i = 0; i < num_entries; i++) {
cur = h + i;
INIT_LIST_HEAD(&cur->hash_list);
spin_lock_init(&cur->lock);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f24a80857cd6..79eb984041dd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -687,6 +687,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
if (btrfs_root_id(root) == objectid) {
u64 commit_root_gen;
+ /*
+ * Relocation will wait for cleaner thread, and any half-dropped
+ * subvolume will be fully cleaned up at mount time.
+ * So here we shouldn't hit a subvolume with non-zero drop_progress.
+ *
+ * If this isn't the case, error out since it can make us attempt to
+ * drop references for extents that were already dropped before.
+ */
+ if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) {
+ struct btrfs_key cpu_key;
+
+ btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
+ btrfs_err(fs_info,
+ "cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)",
+ objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset);
+ ret = -EUCLEAN;
+ goto fail;
+ }
+
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index c73a41b1ad56..3fcc7c092c5e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -153,12 +153,14 @@ struct scrub_stripe {
unsigned int init_nr_io_errors;
unsigned int init_nr_csum_errors;
unsigned int init_nr_meta_errors;
+ unsigned int init_nr_meta_gen_errors;
/*
* The following error bitmaps are all for the current status.
* Every time we submit a new read, these bitmaps may be updated.
*
- * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
+ * error_bitmap = io_error_bitmap | csum_error_bitmap |
+ * meta_error_bitmap | meta_generation_bitmap;
*
* IO and csum errors can happen for both metadata and data.
*/
@@ -166,6 +168,7 @@ struct scrub_stripe {
unsigned long io_error_bitmap;
unsigned long csum_error_bitmap;
unsigned long meta_error_bitmap;
+ unsigned long meta_gen_error_bitmap;
/* For writeback (repair or replace) error reporting. */
unsigned long write_error_bitmap;
@@ -616,7 +619,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
memcpy(on_disk_csum, header->csum, fs_info->csum_size);
if (logical != btrfs_stack_header_bytenr(header)) {
- bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
"tree block %llu mirror %u has bad bytenr, has %llu want %llu",
@@ -672,7 +675,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
}
if (stripe->sectors[sector_nr].generation !=
btrfs_stack_header_generation(header)) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->meta_gen_error_bitmap, sector_nr, sectors_per_tree);
bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
"tree block %llu mirror %u has bad generation, has %llu want %llu",
@@ -684,6 +687,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_clear(&stripe->meta_gen_error_bitmap, sector_nr, sectors_per_tree);
}
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
@@ -972,8 +976,22 @@ skip:
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("header error", dev, false,
stripe->logical, physical);
+ if (test_bit(sector_nr, &stripe->meta_gen_error_bitmap))
+ if (__ratelimit(&rs) && dev)
+ scrub_print_common_warning("generation error", dev, false,
+ stripe->logical, physical);
}
+ /* Update the device stats. */
+ for (int i = 0; i < stripe->init_nr_io_errors; i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS);
+ for (int i = 0; i < stripe->init_nr_csum_errors; i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ /* Generation mismatch errors are counted once per metadata tree block, not per sector. */
+ for (int i = 0; i < stripe->init_nr_meta_gen_errors;
+ i += (fs_info->nodesize >> fs_info->sectorsize_bits))
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS);
+
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
@@ -982,7 +1000,8 @@ skip:
sctx->stat.no_csum += nr_nodatacsum_sectors;
sctx->stat.read_errors += stripe->init_nr_io_errors;
sctx->stat.csum_errors += stripe->init_nr_csum_errors;
- sctx->stat.verify_errors += stripe->init_nr_meta_errors;
+ sctx->stat.verify_errors += stripe->init_nr_meta_errors +
+ stripe->init_nr_meta_gen_errors;
sctx->stat.uncorrectable_errors +=
bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
sctx->stat.corrected_errors += nr_repaired_sectors;
@@ -1028,6 +1047,8 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
stripe->nr_sectors);
stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap,
stripe->nr_sectors);
+ stripe->init_nr_meta_gen_errors = bitmap_weight(&stripe->meta_gen_error_bitmap,
+ stripe->nr_sectors);
if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
goto out;
@@ -1142,6 +1163,9 @@ static void scrub_write_endio(struct btrfs_bio *bbio)
bitmap_set(&stripe->write_error_bitmap, sector_nr,
bio_size >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&stripe->write_error_lock, flags);
+ for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
}
bio_put(&bbio->bio);
@@ -1508,10 +1532,12 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
stripe->init_nr_io_errors = 0;
stripe->init_nr_csum_errors = 0;
stripe->init_nr_meta_errors = 0;
+ stripe->init_nr_meta_gen_errors = 0;
stripe->error_bitmap = 0;
stripe->io_error_bitmap = 0;
stripe->csum_error_bitmap = 0;
stripe->meta_error_bitmap = 0;
+ stripe->meta_gen_error_bitmap = 0;
}
/*
@@ -1541,8 +1567,8 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
u64 extent_gen;
int ret;
- if (unlikely(!extent_root)) {
- btrfs_err(fs_info, "no valid extent root for scrub");
+ if (unlikely(!extent_root || !csum_root)) {
+ btrfs_err(fs_info, "no valid extent or csum root for scrub");
return -EUCLEAN;
}
memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
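An illustration of the generation-error accounting added above, as a standalone sketch with assumed geometry (16 KiB tree blocks, 4 KiB sectors; the numbers are examples, not from the patch): the bitmaps and init_nr_meta_gen_errors are tracked per sector, but a generation mismatch is a property of a whole metadata tree block, so the device-stat loop steps by nodesize >> sectorsize_bits and records one BTRFS_DEV_STAT_GENERATION_ERRS event per tree block rather than one per sector.

#include <stdio.h>

int main(void)
{
	/* Assumed example geometry: 16 KiB tree blocks, 4 KiB sectors. */
	unsigned int nodesize = 16384;
	unsigned int sectorsize_bits = 12;
	unsigned int sectors_per_tree = nodesize >> sectorsize_bits;	/* 4 */

	/* Two bad tree blocks appear as 8 bad sectors in the bitmap. */
	unsigned int init_nr_meta_gen_errors = 8;
	unsigned int gen_err_events = 0;

	/* Same stepping as the hunk above: one event per tree block. */
	for (unsigned int i = 0; i < init_nr_meta_gen_errors; i += sectors_per_tree)
		gen_err_events++;

	printf("generation error events: %u\n", gen_err_events);	/* prints 2 */
	return 0;
}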
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b1015f383f75..c843b4aefb8a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -487,10 +487,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
if (p->buf_len >= len)
return 0;
- if (len > PATH_MAX) {
- WARN_ON(1);
- return -ENOMEM;
- }
+ if (WARN_ON(len > PATH_MAX))
+ return -ENAMETOOLONG;
path_len = p->end - p->start;
old_buf_len = p->buf_len;
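A short note on the send.c hunk, with a kernel-context sketch (the demo_ name is illustrative): WARN_ON() evaluates to the truth value of its condition, so the warning and the early return collapse into a single branch, and -ENAMETOOLONG describes an over-long path more precisely than the old -ENOMEM.

#include <linux/bug.h>
#include <linux/errno.h>
#include <linux/limits.h>
#include <linux/types.h>

/* Sketch of the idiom: warn once and bail out with a descriptive errno. */
static int demo_check_path_len(size_t len)
{
	if (WARN_ON(len > PATH_MAX))
		return -ENAMETOOLONG;
	return 0;
}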
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 08ccf5d5e144..6119a06b0569 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -28,7 +28,6 @@
#include <linux/btrfs.h>
#include <linux/security.h>
#include <linux/fs_parser.h>
-#include <linux/swap.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -570,6 +569,10 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_commit_interval:
ctx->commit_interval = result.uint_32;
+ if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) {
+ btrfs_warn(NULL, "excessive commit interval %u, use with care",
+ ctx->commit_interval);
+ }
if (ctx->commit_interval == 0)
ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
break;
@@ -2395,16 +2398,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- /*
- * We may be called from any task trying to allocate memory and we don't
- * want to slow it down with scanning and dropping extent maps. It would
- * also cause heavy lock contention if many tasks concurrently enter
- * here. Therefore only allow kswapd tasks to scan and drop extent maps.
- */
- if (!current_is_kswapd())
- return 0;
+ btrfs_free_extent_maps(fs_info, nr_to_scan);
- return btrfs_free_extent_maps(fs_info, nr_to_scan);
+ /* The extent map shrinker runs asynchronously, so always return 0. */
+ return 0;
}
static const struct super_operations btrfs_super_ops = {
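Two notes on the super.c hunks, with a kernel-context sketch of the first (the helper name is illustrative; BTRFS_WARNING_COMMIT_INTERVAL is the threshold referenced in the hunk): an unusually large commit interval is accepted but warned about, and a zero interval still falls back to the default, so the warning only fires for explicit non-zero values. The second hunk hands the scan target straight to the now-asynchronous extent map shrinker and reports zero freed objects back to the VFS.

/* Assumes fs/btrfs context for the interval macros and pr_warn(). */
static unsigned int demo_commit_interval(unsigned int requested)
{
	if (requested > BTRFS_WARNING_COMMIT_INTERVAL)
		pr_warn("excessive commit interval %u, use with care\n", requested);
	if (requested == 0)
		return BTRFS_DEFAULT_COMMIT_INTERVAL;
	return requested;
}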
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 0a2dbfaaf49e..de226209220f 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -14,9 +14,9 @@
#include "../disk-io.h"
#include "../btrfs_inode.h"
-#define PROCESS_UNLOCK (1 << 0)
-#define PROCESS_RELEASE (1 << 1)
-#define PROCESS_TEST_LOCKED (1 << 2)
+#define PROCESS_UNLOCK (1U << 0)
+#define PROCESS_RELEASE (1U << 1)
+#define PROCESS_TEST_LOCKED (1U << 2)
static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
unsigned long flags)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 24806e19c7c4..dbef80cd5a9f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1739,8 +1739,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_create_qgroup(trans, objectid);
if (ret && ret != -EEXIST) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
+ if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
}
/*
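This hunk pairs with the qgroup.c change earlier in the diff: since btrfs_create_qgroup() no longer short-circuits to success when quotas are disabled, snapshot creation can now see -ENOTCONN from it, which is harmless unless qgroups are actually enabled. A kernel-context sketch of the filtering (the demo_ name is illustrative):

/* Assumes fs/btrfs context (btrfs_qgroup_enabled(), btrfs_abort_transaction()). */
static int demo_handle_create_qgroup_ret(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info, int ret)
{
	if (ret && ret != -EEXIST) {
		if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}
	return 0;
}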
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9637c7cdc0cf..31adea5b0b96 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -138,11 +138,14 @@ static void wait_log_commit(struct btrfs_root *root, int transid);
* and once to do all the other items.
*/
-static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
+static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
{
unsigned int nofs_flag;
struct inode *inode;
+ /* Only meant to be called for subvolume roots and not for log roots. */
+ ASSERT(is_fstree(btrfs_root_id(root)));
+
/*
* We're holding a transaction handle whether we are logging or
* replaying a log tree, so we must make sure NOFS semantics apply
@@ -154,7 +157,10 @@ static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
inode = btrfs_iget(objectid, root);
memalloc_nofs_restore(nofs_flag);
- return inode;
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return BTRFS_I(inode);
}
/*
@@ -318,8 +324,7 @@ struct walk_control {
/*
* Ignore any items from the inode currently being processed. Needs
- * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
- * the LOG_WALK_REPLAY_INODES stage.
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY.
*/
bool ignore_cur_inode;
@@ -610,21 +615,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
return 0;
}
-/*
- * simple helper to read an inode off the disk from a given root
- * This can only be called for subvolume roots and not for the log
- */
-static noinline struct inode *read_one_inode(struct btrfs_root *root,
- u64 objectid)
-{
- struct inode *inode;
-
- inode = btrfs_iget_logging(objectid, root);
- if (IS_ERR(inode))
- inode = NULL;
- return inode;
-}
-
/* replays a single extent in 'eb' at 'slot' with 'key' into the
* subvolume 'root'. path is released on entry and should be released
* on exit.
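With read_one_inode() removed, the tree-log.c hunks below switch every caller to btrfs_iget_logging(), which now returns a struct btrfs_inode * or an ERR_PTR(); errors propagate as real errnos instead of being flattened to NULL and remapped to -EIO, and the reference is dropped through the embedded VFS inode. A kernel-context sketch of the resulting caller shape (the demo_ name is illustrative):

/* Assumes tree-log.c context, where btrfs_iget_logging() is defined. */
static int demo_with_logged_inode(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_inode *inode;
	int ret = 0;

	inode = btrfs_iget_logging(objectid, root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	/* ... use btrfs_ino(inode), &inode->vfs_inode, and so on ... */

	iput(&inode->vfs_inode);
	return ret;
}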
@@ -650,7 +640,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
u64 start = key->offset;
u64 nbytes = 0;
struct btrfs_file_extent_item *item;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
unsigned long size;
int ret = 0;
@@ -674,23 +664,19 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
extent_end = ALIGN(start + size,
fs_info->sectorsize);
} else {
- ret = 0;
- goto out;
+ return 0;
}
- inode = read_one_inode(root, key->objectid);
- if (!inode) {
- ret = -EIO;
- goto out;
- }
+ inode = btrfs_iget_logging(key->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
/*
* first check to see if we already have this extent in the
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
- ret = btrfs_lookup_file_extent(trans, root, path,
- btrfs_ino(BTRFS_I(inode)), start, 0);
+ ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0);
if (ret == 0 &&
(found_type == BTRFS_FILE_EXTENT_REG ||
@@ -724,7 +710,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
drop_args.start = start;
drop_args.end = extent_end;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret)
goto out;
@@ -902,16 +888,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
goto out;
}
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
- extent_end - start);
+ ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
if (ret)
goto out;
update_inode:
- btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, inode);
out:
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -948,7 +933,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_dir_item *di)
{
struct btrfs_root *root = dir->root;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fscrypt_str name;
struct extent_buffer *leaf;
struct btrfs_key location;
@@ -963,9 +948,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
@@ -973,10 +959,11 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
out:
kfree(name.name);
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -1087,7 +1074,9 @@ again:
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = parent_objectid;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret == 0) {
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
struct btrfs_inode_ref *victim_ref;
unsigned long ptr;
unsigned long ptr_end;
@@ -1149,7 +1138,7 @@ again:
u32 item_size;
u32 cur_offset = 0;
unsigned long base;
- struct inode *victim_parent;
+ struct btrfs_inode *victim_parent;
leaf = path->nodes[0];
@@ -1160,13 +1149,13 @@ again:
struct fscrypt_str victim_name;
extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
goto next;
ret = read_alloc_one_name(leaf, &extref->name,
- btrfs_inode_extref_name_len(leaf, extref),
- &victim_name);
+ victim_name.len, &victim_name);
if (ret)
return ret;
@@ -1181,18 +1170,18 @@ again:
kfree(victim_name.name);
return ret;
} else if (!ret) {
- ret = -ENOENT;
- victim_parent = read_one_inode(root,
- parent_objectid);
- if (victim_parent) {
+ victim_parent = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(victim_parent)) {
+ ret = PTR_ERR(victim_parent);
+ } else {
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
ret = unlink_inode_for_log_replay(trans,
- BTRFS_I(victim_parent),
+ victim_parent,
inode, &victim_name);
+ iput(&victim_parent->vfs_inode);
}
- iput(victim_parent);
kfree(victim_name.name);
if (ret)
return ret;
@@ -1326,19 +1315,18 @@ again:
ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
if (!ret) {
- struct inode *dir;
+ struct btrfs_inode *dir;
btrfs_release_path(path);
- dir = read_one_inode(root, parent_id);
- if (!dir) {
- ret = -ENOENT;
+ dir = btrfs_iget_logging(parent_id, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
kfree(name.name);
goto out;
}
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
- inode, &name);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
kfree(name.name);
- iput(dir);
+ iput(&dir->vfs_inode);
if (ret)
goto out;
goto again;
@@ -1370,8 +1358,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- struct inode *dir = NULL;
- struct inode *inode = NULL;
+ struct btrfs_inode *dir = NULL;
+ struct btrfs_inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
struct fscrypt_str name = { 0 };
@@ -1404,15 +1392,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* copy the back ref in. The link count fixup code will take
* care of the rest
*/
- dir = read_one_inode(root, parent_objectid);
- if (!dir) {
- ret = -ENOENT;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ dir = NULL;
goto out;
}
- inode = read_one_inode(root, inode_objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(inode_objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
@@ -1420,24 +1412,44 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (log_ref_ver) {
ret = extref_get_fields(eb, ref_ptr, &name,
&ref_index, &parent_objectid);
+ if (ret)
+ goto out;
/*
* parent object can change from one array
* item to another.
*/
- if (!dir)
- dir = read_one_inode(root, parent_objectid);
if (!dir) {
- ret = -ENOENT;
- goto out;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ dir = NULL;
+ /*
+ * A new parent dir may not have been
+ * logged and may not exist in the
+ * subvolume tree, see the comment above
+ * before the loop when getting the
+ * first parent dir.
+ */
+ if (ret == -ENOENT) {
+ /*
+ * The next extref may refer to
+ * another parent dir that
+ * exists, so continue.
+ */
+ ret = 0;
+ goto next;
+ }
+ goto out;
+ }
}
} else {
ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
+ if (ret)
+ goto out;
}
- if (ret)
- goto out;
- ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index, &name);
+ ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+ ref_index, &name);
if (ret < 0) {
goto out;
} else if (ret == 0) {
@@ -1448,8 +1460,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
- ret = __add_inode_ref(trans, root, path, log,
- BTRFS_I(dir), BTRFS_I(inode),
+ ret = __add_inode_ref(trans, root, path, log, dir, inode,
inode_objectid, parent_objectid,
ref_index, &name);
if (ret) {
@@ -1459,22 +1470,22 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
}
/* insert our name */
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- &name, 0, ref_index);
+ ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
if (ret)
goto out;
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, inode);
if (ret)
goto out;
}
/* Else, ret == 1, we already have a perfect match, we're done. */
+next:
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
kfree(name.name);
name.name = NULL;
- if (log_ref_ver) {
- iput(dir);
+ if (log_ref_ver && dir) {
+ iput(&dir->vfs_inode);
dir = NULL;
}
}
@@ -1487,8 +1498,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* dir index entries exist for a name but there is no inode reference
* item with the same name.
*/
- ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
- key);
+ ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key);
if (ret)
goto out;
@@ -1497,8 +1507,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
kfree(name.name);
- iput(dir);
- iput(inode);
+ if (dir)
+ iput(&dir->vfs_inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -1670,12 +1682,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_key key;
- struct inode *inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *inode;
+
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
break;
@@ -1697,14 +1710,14 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
btrfs_release_path(path);
- inode = read_one_inode(root, key.offset);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(key.offset, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
break;
}
- ret = fixup_inode_link_count(trans, inode);
- iput(inode);
+ ret = fixup_inode_link_count(trans, &inode->vfs_inode);
+ iput(&inode->vfs_inode);
if (ret)
break;
@@ -1732,12 +1745,14 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
{
struct btrfs_key key;
int ret = 0;
- struct inode *inode;
+ struct btrfs_inode *inode;
+ struct inode *vfs_inode;
- inode = read_one_inode(root, objectid);
- if (!inode)
- return -EIO;
+ inode = btrfs_iget_logging(objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
@@ -1746,15 +1761,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
if (ret == 0) {
- if (!inode->i_nlink)
- set_nlink(inode, 1);
+ if (!vfs_inode->i_nlink)
+ set_nlink(vfs_inode, 1);
else
- inc_nlink(inode);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ inc_nlink(vfs_inode);
+ ret = btrfs_update_inode(trans, inode);
} else if (ret == -EEXIST) {
ret = 0;
}
- iput(inode);
+ iput(vfs_inode);
return ret;
}
@@ -1770,27 +1785,26 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name,
struct btrfs_key *location)
{
- struct inode *inode;
- struct inode *dir;
+ struct btrfs_inode *inode;
+ struct btrfs_inode *dir;
int ret;
- inode = read_one_inode(root, location->objectid);
- if (!inode)
- return -ENOENT;
+ inode = btrfs_iget_logging(location->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- dir = read_one_inode(root, dirid);
- if (!dir) {
- iput(inode);
- return -EIO;
+ dir = btrfs_iget_logging(dirid, root);
+ if (IS_ERR(dir)) {
+ iput(&inode->vfs_inode);
+ return PTR_ERR(dir);
}
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
- 1, index);
+ ret = btrfs_add_link(trans, dir, inode, name, 1, index);
/* FIXME, put inode into FIXUP list */
- iput(inode);
- iput(dir);
+ iput(&inode->vfs_inode);
+ iput(&dir->vfs_inode);
return ret;
}
@@ -1852,16 +1866,16 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
bool index_dst_matches = false;
struct btrfs_key log_key;
struct btrfs_key search_key;
- struct inode *dir;
+ struct btrfs_inode *dir;
u8 log_flags;
bool exists;
int ret;
bool update_size = true;
bool name_added = false;
- dir = read_one_inode(root, key->objectid);
- if (!dir)
- return -EIO;
+ dir = btrfs_iget_logging(key->objectid, root);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
if (ret)
@@ -1882,9 +1896,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = PTR_ERR(dir_dst_di);
goto out;
} else if (dir_dst_di) {
- ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- dir_dst_di, &log_key,
- log_flags, exists);
+ ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di,
+ &log_key, log_flags, exists);
if (ret < 0)
goto out;
dir_dst_matches = (ret == 1);
@@ -1899,9 +1912,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = PTR_ERR(index_dst_di);
goto out;
} else if (index_dst_di) {
- ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- index_dst_di, &log_key,
- log_flags, exists);
+ ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di,
+ &log_key, log_flags, exists);
if (ret < 0)
goto out;
index_dst_matches = (ret == 1);
@@ -1956,11 +1968,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
out:
if (!ret && update_size) {
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
- ret = btrfs_update_inode(trans, BTRFS_I(dir));
+ btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
+ ret = btrfs_update_inode(trans, dir);
}
kfree(name.name);
- iput(dir);
+ iput(&dir->vfs_inode);
if (!ret && name_added)
ret = 1;
return ret;
@@ -2117,16 +2129,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
struct btrfs_path *log_path,
- struct inode *dir,
+ struct btrfs_inode *dir,
struct btrfs_key *dir_key)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *root = dir->root;
int ret;
struct extent_buffer *eb;
int slot;
struct btrfs_dir_item *di;
struct fscrypt_str name = { 0 };
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
struct btrfs_key location;
/*
@@ -2163,9 +2175,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
@@ -2173,9 +2186,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- inc_nlink(inode);
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
- &name);
+ inc_nlink(&inode->vfs_inode);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
@@ -2185,7 +2197,8 @@ out:
btrfs_release_path(path);
btrfs_release_path(log_path);
kfree(name.name);
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -2309,7 +2322,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_key dir_key;
struct btrfs_key found_key;
struct btrfs_path *log_path;
- struct inode *dir;
+ struct btrfs_inode *dir;
dir_key.objectid = dirid;
dir_key.type = BTRFS_DIR_INDEX_KEY;
@@ -2317,14 +2330,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (!log_path)
return -ENOMEM;
- dir = read_one_inode(root, dirid);
- /* it isn't an error if the inode isn't there, that can happen
- * because we replay the deletes before we copy in the inode item
- * from the log
+ dir = btrfs_iget_logging(dirid, root);
+ /*
+ * It isn't an error if the inode isn't there, that can happen because
+ * we replay the deletes before we copy in the inode item from the log.
*/
- if (!dir) {
+ if (IS_ERR(dir)) {
btrfs_free_path(log_path);
- return 0;
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ return ret;
}
range_start = 0;
@@ -2386,7 +2402,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
btrfs_free_path(log_path);
- iput(dir);
+ iput(&dir->vfs_inode);
return ret;
}
@@ -2430,23 +2446,30 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
nritems = btrfs_header_nritems(eb);
for (i = 0; i < nritems; i++) {
- btrfs_item_key_to_cpu(eb, &key, i);
+ struct btrfs_inode_item *inode_item;
- /* inode keys are done during the first stage */
- if (key.type == BTRFS_INODE_ITEM_KEY &&
- wc->stage == LOG_WALK_REPLAY_INODES) {
- struct btrfs_inode_item *inode_item;
- u32 mode;
+ btrfs_item_key_to_cpu(eb, &key, i);
- inode_item = btrfs_item_ptr(eb, i,
- struct btrfs_inode_item);
+ if (key.type == BTRFS_INODE_ITEM_KEY) {
+ inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item);
/*
- * If we have a tmpfile (O_TMPFILE) that got fsync'ed
- * and never got linked before the fsync, skip it, as
- * replaying it is pointless since it would be deleted
- * later. We skip logging tmpfiles, but it's always
- * possible we are replaying a log created with a kernel
- * that used to log tmpfiles.
+ * An inode with no links is either:
+ *
+ * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never
+ * got linked before the fsync, skip it, as replaying
+ * it is pointless since it would be deleted later.
+ * We skip logging tmpfiles, but it's always possible
+ * we are replaying a log created with a kernel that
+ * used to log tmpfiles;
+ *
+ * 2) A non-tmpfile which got its last link deleted
+ * while holding an open fd on it and later got
+ * fsynced through that fd. We always log the
+ * parent inodes when inode->last_unlink_trans is
+ * set to the current transaction, so ignore all the
+ * inode items for this inode. We will delete the
+ * inode when processing the parent directory with
+ * replay_dir_deletes().
*/
if (btrfs_inode_nlink(eb, inode_item) == 0) {
wc->ignore_cur_inode = true;
@@ -2454,8 +2477,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
} else {
wc->ignore_cur_inode = false;
}
- ret = replay_xattr_deletes(wc->trans, root, log,
- path, key.objectid);
+ }
+
+ /* Inode keys are done during the first stage. */
+ if (key.type == BTRFS_INODE_ITEM_KEY &&
+ wc->stage == LOG_WALK_REPLAY_INODES) {
+ u32 mode;
+
+ ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid);
if (ret)
break;
mode = btrfs_inode_mode(eb, inode_item);
@@ -2480,30 +2509,28 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
*/
if (S_ISREG(mode)) {
struct btrfs_drop_extents_args drop_args = { 0 };
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 from;
- inode = read_one_inode(root, key.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
break;
}
- from = ALIGN(i_size_read(inode),
+ from = ALIGN(i_size_read(&inode->vfs_inode),
root->fs_info->sectorsize);
drop_args.start = from;
drop_args.end = (u64)-1;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(wc->trans, root,
- BTRFS_I(inode),
+ ret = btrfs_drop_extents(wc->trans, root, inode,
&drop_args);
if (!ret) {
- inode_sub_bytes(inode,
+ inode_sub_bytes(&inode->vfs_inode,
drop_args.bytes_found);
/* Update the inode's nbytes. */
- ret = btrfs_update_inode(wc->trans,
- BTRFS_I(inode));
+ ret = btrfs_update_inode(wc->trans, inode);
}
- iput(inode);
+ iput(&inode->vfs_inode);
if (ret)
break;
}
@@ -2538,9 +2565,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
key.type == BTRFS_INODE_EXTREF_KEY) {
ret = add_inode_ref(wc->trans, root, log, path,
eb, i, &key);
- if (ret && ret != -ENOENT)
+ if (ret)
break;
- ret = 0;
} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
ret = replay_one_extent(wc->trans, root, path,
eb, i, &key);
@@ -2561,14 +2587,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
/*
* Correctly adjust the reserved bytes occupied by a log tree extent buffer
*/
-static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
+static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
{
struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, start);
if (!cache) {
btrfs_err(fs_info, "unable to find block group for %llu", start);
- return;
+ return -ENOENT;
}
spin_lock(&cache->space_info->lock);
@@ -2579,27 +2605,22 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
spin_unlock(&cache->space_info->lock);
btrfs_put_block_group(cache);
+
+ return 0;
}
static int clean_log_buffer(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
- int ret;
-
btrfs_tree_lock(eb);
btrfs_clear_buffer_dirty(trans, eb);
wait_on_extent_buffer_writeback(eb);
btrfs_tree_unlock(eb);
- if (trans) {
- ret = btrfs_pin_reserved_extent(trans, eb);
- if (ret)
- return ret;
- } else {
- unaccount_log_buffer(eb->fs_info, eb->start);
- }
+ if (trans)
+ return btrfs_pin_reserved_extent(trans, eb);
- return 0;
+ return unaccount_log_buffer(eb->fs_info, eb->start);
}
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
@@ -4231,6 +4252,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_timespec_nsec(&token, &item->ctime,
inode_get_ctime_nsec(inode));
+ btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
+
/*
* We do not need to set the nbytes field, in fact during a fast fsync
* its value may not even be correct, since a fast fsync does not wait
@@ -5485,7 +5509,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
ihold(&curr_inode->vfs_inode);
while (true) {
- struct inode *vfs_inode;
struct btrfs_key key;
struct btrfs_key found_key;
u64 next_index;
@@ -5501,7 +5524,7 @@ again:
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_dir_item *di;
struct btrfs_key di_key;
- struct inode *di_inode;
+ struct btrfs_inode *di_inode;
int log_mode = LOG_INODE_EXISTS;
int type;
@@ -5528,17 +5551,16 @@ again:
goto out;
}
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
break;
}
ctx->log_new_dentries = false;
if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
- log_mode, ctx);
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
+ btrfs_add_delayed_iput(di_inode);
if (ret)
goto out;
if (ctx->log_new_dentries) {
@@ -5580,14 +5602,13 @@ again:
kfree(dir_elem);
btrfs_add_delayed_iput(curr_inode);
- curr_inode = NULL;
- vfs_inode = btrfs_iget_logging(ino, root);
- if (IS_ERR(vfs_inode)) {
- ret = PTR_ERR(vfs_inode);
+ curr_inode = btrfs_iget_logging(ino, root);
+ if (IS_ERR(curr_inode)) {
+ ret = PTR_ERR(curr_inode);
+ curr_inode = NULL;
break;
}
- curr_inode = BTRFS_I(vfs_inode);
}
out:
btrfs_free_path(path);
@@ -5665,7 +5686,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_ino_list *ino_elem;
- struct inode *inode;
+ struct btrfs_inode *inode;
/*
* It's rare to have a lot of conflicting inodes, in practice it is not
@@ -5756,12 +5777,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
* inode in LOG_INODE_EXISTS mode and rename operations update the log,
* so that the log ends up with the new name and without the old name.
*/
- if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
return 0;
}
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ btrfs_add_delayed_iput(inode);
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
@@ -5797,7 +5818,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
*/
while (!list_empty(&ctx->conflict_inodes)) {
struct btrfs_ino_list *curr;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 ino;
u64 parent;
@@ -5833,9 +5854,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* dir index key range logged for the directory. So we
* must make sure the deletion is recorded.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode),
- LOG_INODE_ALL, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
break;
continue;
@@ -5851,8 +5871,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* it again because if some other task logged the inode after
* that, we can avoid doing it again.
*/
- if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
continue;
}
@@ -5863,8 +5883,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
break;
}
@@ -6356,7 +6376,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
list_for_each_entry(item, delayed_ins_list, log_list) {
struct btrfs_dir_item *dir_item;
- struct inode *di_inode;
+ struct btrfs_inode *di_inode;
struct btrfs_key key;
int log_mode = LOG_INODE_EXISTS;
@@ -6372,8 +6392,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
break;
}
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
continue;
}
@@ -6381,12 +6401,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
log_mode = LOG_INODE_ALL;
ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
+ ret = log_new_dir_dentries(trans, di_inode, ctx);
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ btrfs_add_delayed_iput(di_inode);
if (ret)
break;
@@ -6794,7 +6814,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
struct btrfs_key inode_key;
- struct inode *dir_inode;
+ struct btrfs_inode *dir_inode;
inode_key.type = BTRFS_INODE_ITEM_KEY;
inode_key.offset = 0;
@@ -6843,18 +6863,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
goto out;
}
- if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(dir_inode));
+ if (!need_log_inode(trans, dir_inode)) {
+ btrfs_add_delayed_iput(dir_inode);
continue;
}
ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
- LOG_INODE_ALL, ctx);
+ ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans,
- BTRFS_I(dir_inode), ctx);
- btrfs_add_delayed_iput(BTRFS_I(dir_inode));
+ ret = log_new_dir_dentries(trans, dir_inode, ctx);
+ btrfs_add_delayed_iput(dir_inode);
if (ret)
goto out;
}
@@ -6879,7 +6897,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int slot;
struct btrfs_key search_key;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 ino;
int ret = 0;
@@ -6894,11 +6912,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (BTRFS_I(inode)->generation >= trans->transid &&
- need_log_inode(trans, BTRFS_I(inode)))
- ret = btrfs_log_inode(trans, BTRFS_I(inode),
- LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode))
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -7308,11 +7325,14 @@ again:
wc.replay_dest->log_root = log;
ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
- if (ret)
+ if (ret) {
/* The loop needs to continue due to the root refs */
btrfs_abort_transaction(trans, ret);
- else
+ } else {
ret = walk_log_tree(trans, log, &wc);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
@@ -7476,6 +7496,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
* full log sync.
* Also we don't need to worry with renames, since btrfs_rename() marks the log
* for full commit when renaming a subvolume.
+ *
+ * Must be called before creating the subvolume entry in its parent directory.
*/
void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
struct btrfs_inode *dir)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 587ac07cd194..58e0cac5779d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -732,82 +732,6 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
}
-/*
- * We can have very weird soft links passed in.
- * One example is "/proc/self/fd/<fd>", which can be a soft link to
- * a block device.
- *
- * But it's never a good idea to use those weird names.
- * Here we check if the path (not following symlinks) is a good one inside
- * "/dev/".
- */
-static bool is_good_dev_path(const char *dev_path)
-{
- struct path path = { .mnt = NULL, .dentry = NULL };
- char *path_buf = NULL;
- char *resolved_path;
- bool is_good = false;
- int ret;
-
- if (!dev_path)
- goto out;
-
- path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!path_buf)
- goto out;
-
- /*
- * Do not follow soft link, just check if the original path is inside
- * "/dev/".
- */
- ret = kern_path(dev_path, 0, &path);
- if (ret)
- goto out;
- resolved_path = d_path(&path, path_buf, PATH_MAX);
- if (IS_ERR(resolved_path))
- goto out;
- if (strncmp(resolved_path, "/dev/", strlen("/dev/")))
- goto out;
- is_good = true;
-out:
- kfree(path_buf);
- path_put(&path);
- return is_good;
-}
-
-static int get_canonical_dev_path(const char *dev_path, char *canonical)
-{
- struct path path = { .mnt = NULL, .dentry = NULL };
- char *path_buf = NULL;
- char *resolved_path;
- int ret;
-
- if (!dev_path) {
- ret = -EINVAL;
- goto out;
- }
-
- path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!path_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = kern_path(dev_path, LOOKUP_FOLLOW, &path);
- if (ret)
- goto out;
- resolved_path = d_path(&path, path_buf, PATH_MAX);
- if (IS_ERR(resolved_path)) {
- ret = PTR_ERR(resolved_path);
- goto out;
- }
- ret = strscpy(canonical, resolved_path, PATH_MAX);
-out:
- kfree(path_buf);
- path_put(&path);
- return ret;
-}
-
static bool is_same_device(struct btrfs_device *device, const char *new_path)
{
struct path old = { .mnt = NULL, .dentry = NULL };
@@ -1495,23 +1419,12 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct file *bdev_file;
- char *canonical_path = NULL;
u64 bytenr;
dev_t devt;
int ret;
lockdep_assert_held(&uuid_mutex);
- if (!is_good_dev_path(path)) {
- canonical_path = kmalloc(PATH_MAX, GFP_KERNEL);
- if (canonical_path) {
- ret = get_canonical_dev_path(path, canonical_path);
- if (ret < 0) {
- kfree(canonical_path);
- canonical_path = NULL;
- }
- }
- }
/*
* Avoid an exclusive open here, as the systemd-udev may initiate the
* device scan which may race with the user's mount or mkfs command,
@@ -1556,8 +1469,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
goto free_disk_super;
}
- device = device_list_add(canonical_path ? : path, disk_super,
- &new_device_added);
+ device = device_list_add(path, disk_super, &new_device_added);
if (!IS_ERR(device) && new_device_added)
btrfs_free_stale_devices(device->devt, device);
@@ -1566,7 +1478,6 @@ free_disk_super:
error_bdev_put:
fput(bdev_file);
- kfree(canonical_path);
return device;
}
@@ -3357,6 +3268,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
device->bytes_used - dev_extent_len);
atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
+
+ if (list_empty(&device->post_commit_list)) {
+ list_add_tail(&device->post_commit_list,
+ &trans->transaction->dev_update_list);
+ }
+
mutex_unlock(&fs_info->chunk_mutex);
}
}
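A note on the last volumes.c hunk, with a kernel-context sketch (the demo_ names are illustrative): after btrfs_remove_chunk() shrinks device->bytes_used, the device has to sit on the transaction's dev_update_list so the new sizes are written back at commit time; because an unqueued list node points at itself, list_empty() on the node is the usual guard against inserting it twice.

#include <linux/list.h>

struct demo_device {
	struct list_head post_commit_list;	/* INIT_LIST_HEAD()ed while idle */
};

/* Queue the device for the per-transaction size update exactly once. */
static void demo_queue_device_update(struct demo_device *device,
				     struct list_head *dev_update_list)
{
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list, dev_update_list);
}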
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 2603c9d60fd2..53d8c49ec058 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2456,8 +2456,8 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ u64 total = btrfs_super_total_bytes(fs_info->super_copy);
u64 used = 0;
- u64 total = 0;
u64 factor;
ASSERT(btrfs_is_zoned(fs_info));
@@ -2470,7 +2470,6 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
if (!device->bdev)
continue;
- total += device->disk_total_bytes;
used += device->bytes_used;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -2524,7 +2523,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->alloc_offset == 0 ||
- (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
+ !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) ||
test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
continue;
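An illustration of the first zoned.c hunk, as a standalone sketch with made-up numbers (the threshold and variable names are examples, not btrfs code): capacity now comes from the superblock's total_bytes instead of summing disk_total_bytes across devices, and reclaim is considered once the share of that capacity consumed by allocated chunks reaches the configured percentage (the exact threshold handling stays in btrfs_zoned_should_reclaim()).

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Illustrative numbers only. */
	uint64_t total = 100ULL << 30;		/* capacity from the super block */
	uint64_t used = 80ULL << 30;		/* bytes allocated to chunks */
	unsigned int reclaim_threshold = 75;	/* percent */

	unsigned int used_pct = (unsigned int)(used * 100 / total);

	printf("used %u%% of capacity, reclaim: %s\n",
	       used_pct, used_pct >= reclaim_threshold ? "yes" : "no");
	return 0;
}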
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 866607fd3e58..c9ea37fabf65 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -24,7 +24,7 @@
#include "super.h"
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
-#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
+#define ZSTD_BTRFS_MAX_INPUT (1U << ZSTD_BTRFS_MAX_WINDOWLOG)
#define ZSTD_BTRFS_DEFAULT_LEVEL 3
#define ZSTD_BTRFS_MAX_LEVEL 15
/* 307s to avoid pathologically clashing with transaction commit */