summaryrefslogtreecommitdiff
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c458
1 files changed, 300 insertions, 158 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a3c861b2a6d2..f84e3f9fad84 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1187,6 +1187,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct extent_state *cached = NULL;
struct extent_map *em;
int ret = 0;
+ bool free_pages = false;
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
@@ -1207,7 +1208,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
}
if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
+ ASSERT(!async_extent->folios);
+ ASSERT(async_extent->nr_folios == 0);
submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
@@ -1223,6 +1227,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
* fall back to uncompressed.
*/
submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
@@ -1244,7 +1249,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- 1 << BTRFS_ORDERED_COMPRESSED);
+ 1U << BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -1264,6 +1269,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
done:
if (async_chunk->blkcg_css)
kthread_associate_blkcg(NULL);
+ if (free_pages)
+ free_async_extent_pages(async_extent);
kfree(async_extent);
return;
@@ -1402,6 +1409,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
/*
+ * We're not doing compressed IO, don't unlock the first page (which
+ * the caller expects to stay locked), don't clear any dirty bits and
+ * don't set any writeback bits.
+ *
+ * Do set the Ordered (Private2) bit so we know this page was properly
+ * setup for writepage.
+ */
+ page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
+ page_ops |= PAGE_SET_ORDERED;
+
+ /*
* Relocation relies on the relocated extents to have exactly the same
* size as the original extents. Normally writeback for relocation data
* extents follows a NOCOW path because relocation preallocates the
@@ -1445,8 +1463,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
continue;
}
if (done_offset) {
- *done_offset = start - 1;
- return 0;
+ /*
+ * Move @end to the end of the processed range,
+ * and exit the loop to unlock the processed extents.
+ */
+ end = start - 1;
+ ret = 0;
+ break;
}
ret = -ENOSPC;
}
@@ -1463,6 +1486,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
file_extent.offset = 0;
file_extent.compression = BTRFS_COMPRESS_NONE;
+ /*
+ * Locked range will be released either during error clean up or
+ * after the whole range is finished.
+ */
lock_extent(&inode->io_tree, start, start + ram_size - 1,
&cached);
@@ -1477,7 +1504,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- 1 << BTRFS_ORDERED_REGULAR);
+ 1U << BTRFS_ORDERED_REGULAR);
if (IS_ERR(ordered)) {
unlock_extent(&inode->io_tree, start,
start + ram_size - 1, &cached);
@@ -1508,27 +1535,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- /*
- * We're not doing compressed IO, don't unlock the first page
- * (which the caller expects to stay locked), don't clear any
- * dirty bits and don't set any writeback bits
- *
- * Do set the Ordered (Private2) bit so we know this page was
- * properly setup for writepage.
- */
- page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
- page_ops |= PAGE_SET_ORDERED;
-
- extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
- locked_folio, &cached,
- EXTENT_LOCKED | EXTENT_DELALLOC,
- page_ops);
- if (num_bytes < cur_alloc_size)
+ if (num_bytes < ram_size)
num_bytes = 0;
else
- num_bytes -= cur_alloc_size;
+ num_bytes -= ram_size;
alloc_hint = ins.objectid + ins.offset;
- start += cur_alloc_size;
+ start += ram_size;
extent_reserved = false;
/*
@@ -1539,6 +1551,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret)
goto out_unlock;
}
+ extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
done:
if (done_offset)
*done_offset = end;
@@ -1554,40 +1568,35 @@ out_unlock:
* Now, we have three regions to clean up:
*
* |-------(1)----|---(2)---|-------------(3)----------|
- * `- orig_start `- start `- start + cur_alloc_size `- end
+ * `- orig_start `- start `- start + ram_size `- end
*
* We process each region below.
*/
- clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
- page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
-
/*
* For the range (1). We have already instantiated the ordered extents
* for this region. They are cleaned up by
* btrfs_cleanup_ordered_extents() in e.g,
- * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
- * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
- * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
- * function.
+ * btrfs_run_delalloc_range().
+ * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
+ * are also handled by the cleanup function.
*
- * However, in case of @keep_locked, we still need to unlock the pages
- * (except @locked_folio) to ensure all the pages are unlocked.
+ * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and
+ * finish the writeback of the involved folios, which will be never submitted.
*/
- if (keep_locked && orig_start < start) {
+ if (orig_start < start) {
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
- locked_folio, NULL, 0, page_ops);
+ locked_folio, NULL, clear_bits, page_ops);
}
- /*
- * At this point we're unlocked, we want to make sure we're only
- * clearing these flags under the extent lock, so lock the rest of the
- * range and clear everything up.
- */
- lock_extent(&inode->io_tree, start, end, NULL);
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
/*
* For the range (2). If we reserved an extent for our delalloc range
@@ -1601,11 +1610,11 @@ out_unlock:
*/
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
- start + cur_alloc_size - 1,
+ start + ram_size - 1,
locked_folio, &cached, clear_bits,
page_ops);
- btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
- start += cur_alloc_size;
+ btrfs_qgroup_free_data(inode, NULL, start, ram_size, NULL);
+ start += ram_size;
}
/*
@@ -2002,6 +2011,110 @@ static int can_nocow_file_extent(struct btrfs_path *path,
}
/*
+ * Cleanup the dirty folios which will never be submitted due to error.
+ *
+ * When running a delalloc range, we may need to split the ranges (due to
+ * fragmentation or NOCOW). If we hit an error in the later part, we will error
+ * out and previously successfully executed range will never be submitted, thus
+ * we have to cleanup those folios by clearing their dirty flag, starting and
+ * finishing the writeback.
+ */
+static void cleanup_dirty_folios(struct btrfs_inode *inode,
+ struct folio *locked_folio,
+ u64 start, u64 end, int error)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
+ u32 len;
+
+ ASSERT(end + 1 - start < U32_MAX);
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(end + 1, fs_info->sectorsize));
+ len = end + 1 - start;
+
+ /*
+ * Handle the locked folio first.
+ * The btrfs_folio_clamp_*() helpers can handle range out of the folio case.
+ */
+ btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
+
+ for (pgoff_t index = start_index; index <= end_index; index++) {
+ struct folio *folio;
+
+ /* Already handled at the beginning. */
+ if (index == locked_folio->index)
+ continue;
+ folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
+ /* Cache already dropped, no need to do any cleanup. */
+ if (IS_ERR(folio))
+ continue;
+ btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ mapping_set_error(mapping, error);
+}
+
+static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
+ struct extent_state **cached,
+ struct can_nocow_file_extent_args *nocow_args,
+ u64 file_pos, bool is_prealloc)
+{
+ struct btrfs_ordered_extent *ordered;
+ u64 len = nocow_args->file_extent.num_bytes;
+ u64 end = file_pos + len - 1;
+ int ret = 0;
+
+ lock_extent(&inode->io_tree, file_pos, end, cached);
+
+ if (is_prealloc) {
+ struct extent_map *em;
+
+ em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
+ BTRFS_ORDERED_PREALLOC);
+ if (IS_ERR(em)) {
+ unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(em);
+ }
+ free_extent_map(em);
+ }
+
+ ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
+ is_prealloc
+ ? (1U << BTRFS_ORDERED_PREALLOC)
+ : (1U << BTRFS_ORDERED_NOCOW));
+ if (IS_ERR(ordered)) {
+ if (is_prealloc)
+ btrfs_drop_extent_map_range(inode, file_pos, end, false);
+ unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(ordered);
+ }
+
+ if (btrfs_is_data_reloc_root(inode->root))
+ /*
+ * Errors are handled later, as we must prevent
+ * extent_clear_unlock_delalloc() in error handler from freeing
+ * metadata of the created ordered extent.
+ */
+ ret = btrfs_reloc_clone_csums(ordered);
+ btrfs_put_ordered_extent(ordered);
+
+ extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_SET_ORDERED);
+
+ /*
+ * btrfs_reloc_clone_csums() error, now we're OK to call error handler,
+ * as metadata for created ordered extent will only be freed by
+ * btrfs_finish_ordered_io().
+ */
+ return ret;
+}
+
+/*
* when nowcow writeback call back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
*
@@ -2016,6 +2129,11 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct btrfs_root *root = inode->root;
struct btrfs_path *path;
u64 cow_start = (u64)-1;
+ /*
+ * If not 0, represents the inclusive end of the last fallback_to_cow()
+ * range. Only for error handling.
+ */
+ u64 cow_end = 0;
u64 cur_offset = start;
int ret;
bool check_prev = true;
@@ -2040,15 +2158,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
while (cur_offset <= end) {
struct btrfs_block_group *nocow_bg = NULL;
- struct btrfs_ordered_extent *ordered;
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
struct extent_state *cached_state = NULL;
u64 extent_end;
- u64 nocow_end;
int extent_type;
- bool is_prealloc;
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0);
@@ -2103,12 +2218,13 @@ next_slot:
/*
* If the found extent starts after requested offset, then
- * adjust extent_end to be right before this extent begins
+ * adjust cur_offset to be right before this extent begins.
*/
if (found_key.offset > cur_offset) {
- extent_end = found_key.offset;
- extent_type = 0;
- goto must_cow;
+ if (cow_start == (u64)-1)
+ cow_start = cur_offset;
+ cur_offset = found_key.offset;
+ goto next_slot;
}
/*
@@ -2176,72 +2292,19 @@ must_cow:
found_key.offset - 1);
cow_start = (u64)-1;
if (ret) {
+ cow_end = found_key.offset - 1;
btrfs_dec_nocow_writers(nocow_bg);
goto error;
}
}
- nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
- lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);
-
- is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
- if (is_prealloc) {
- struct extent_map *em;
-
- em = btrfs_create_io_em(inode, cur_offset,
- &nocow_args.file_extent,
- BTRFS_ORDERED_PREALLOC);
- if (IS_ERR(em)) {
- unlock_extent(&inode->io_tree, cur_offset,
- nocow_end, &cached_state);
- btrfs_dec_nocow_writers(nocow_bg);
- ret = PTR_ERR(em);
- goto error;
- }
- free_extent_map(em);
- }
-
- ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
- &nocow_args.file_extent,
- is_prealloc
- ? (1 << BTRFS_ORDERED_PREALLOC)
- : (1 << BTRFS_ORDERED_NOCOW));
+ ret = nocow_one_range(inode, locked_folio, &cached_state,
+ &nocow_args, cur_offset,
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC);
btrfs_dec_nocow_writers(nocow_bg);
- if (IS_ERR(ordered)) {
- if (is_prealloc) {
- btrfs_drop_extent_map_range(inode, cur_offset,
- nocow_end, false);
- }
- unlock_extent(&inode->io_tree, cur_offset,
- nocow_end, &cached_state);
- ret = PTR_ERR(ordered);
+ if (ret < 0)
goto error;
- }
-
- if (btrfs_is_data_reloc_root(root))
- /*
- * Error handled later, as we must prevent
- * extent_clear_unlock_delalloc() in error handler
- * from freeing metadata of created ordered extent.
- */
- ret = btrfs_reloc_clone_csums(ordered);
- btrfs_put_ordered_extent(ordered);
-
- extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
- locked_folio, &cached_state,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_ORDERED);
-
cur_offset = extent_end;
-
- /*
- * btrfs_reloc_clone_csums() error, now we're OK to call error
- * handler, as metadata for created ordered extent will only
- * be freed by btrfs_finish_ordered_io().
- */
- if (ret)
- goto error;
}
btrfs_release_path(path);
@@ -2249,11 +2312,12 @@ must_cow:
cow_start = cur_offset;
if (cow_start != (u64)-1) {
- cur_offset = end;
ret = fallback_to_cow(inode, locked_folio, cow_start, end);
cow_start = (u64)-1;
- if (ret)
+ if (ret) {
+ cow_end = end;
goto error;
+ }
}
btrfs_free_path(path);
@@ -2261,12 +2325,41 @@ must_cow:
error:
/*
+ * There are several error cases:
+ *
+ * 1) Failed without falling back to COW
+ * start cur_offset end
+ * |/////////////| |
+ *
+ * For range [start, cur_offset) the folios are already unlocked (except
+ * @locked_folio), EXTENT_DELALLOC already removed.
+ * Only need to clear the dirty flag as they will never be submitted.
+ * Ordered extent and extent maps are handled by
+ * btrfs_mark_ordered_io_finished() inside run_delalloc_range().
+ *
+ * 2) Failed with error from fallback_to_cow()
+ * start cur_offset cow_end end
+ * |/////////////|-----------| |
+ *
+ * For range [start, cur_offset) it's the same as case 1).
+ * But for range [cur_offset, cow_end), the folios have dirty flag
+ * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range().
+ *
+ * Thus we should not call extent_clear_unlock_delalloc() on range
+ * [cur_offset, cow_end), as the folios are already unlocked.
+ *
+ * So clear the folio dirty flags for [start, cur_offset) first.
+ */
+ if (cur_offset > start)
+ cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
+
+ /*
* If an error happened while a COW region is outstanding, cur_offset
- * needs to be reset to cow_start to ensure the COW region is unlocked
- * as well.
+ * needs to be reset to @cow_end + 1 to skip the COW range, as
+ * cow_file_range() will do the proper cleanup at error.
*/
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
+ if (cow_end)
+ cur_offset = cow_end + 1;
/*
* We need to lock the extent here because we're clearing DELALLOC and
@@ -2336,8 +2429,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
out:
if (ret < 0)
- btrfs_cleanup_ordered_extents(inode, locked_folio, start,
- end - start + 1);
+ btrfs_cleanup_ordered_extents(inode, NULL, start, end - start + 1);
return ret;
}
@@ -4642,7 +4734,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
int ret = 0;
struct btrfs_trans_handle *trans;
- u64 last_unlink_trans;
struct fscrypt_name fname;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
@@ -4668,6 +4759,23 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
goto out_notrans;
}
+ /*
+ * Propagate the last_unlink_trans value of the deleted dir to its
+ * parent directory. This is to prevent an unrecoverable log tree in the
+ * case we do something like this:
+ * 1) create dir foo
+ * 2) create snapshot under dir foo
+ * 3) delete the snapshot
+ * 4) rmdir foo
+ * 5) mkdir foo
+ * 6) fsync foo or some file inside foo
+ *
+ * This is because we can't unlink other roots when replaying the dir
+ * deletes for directory foo.
+ */
+ if (BTRFS_I(inode)->last_unlink_trans >= trans->transid)
+ btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
+
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
goto out;
@@ -4677,27 +4785,11 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (ret)
goto out;
- last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
-
/* now the directory is empty */
ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
&fname.disk_name);
- if (!ret) {
+ if (!ret)
btrfs_i_size_write(BTRFS_I(inode), 0);
- /*
- * Propagate the last_unlink_trans value of the deleted dir to
- * its parent directory. This is to prevent an unrecoverable
- * log tree in the case we do something like this:
- * 1) create dir foo
- * 2) create snapshot under dir foo
- * 3) delete the snapshot
- * 4) rmdir foo
- * 5) mkdir foo
- * 6) fsync foo or some file inside foo
- */
- if (last_unlink_trans >= trans->transid)
- BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
- }
out:
btrfs_end_transaction(trans);
out_notrans:
@@ -4767,8 +4859,11 @@ again:
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
if (IS_ERR(folio)) {
- btrfs_delalloc_release_space(inode, data_reserved, block_start,
- blocksize, true);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, blocksize, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ block_start, blocksize, true);
btrfs_delalloc_release_extents(inode, blocksize);
ret = -ENOMEM;
goto out;
@@ -7117,8 +7212,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
ret = -EAGAIN;
goto out;
}
-
- cond_resched();
}
if (file_extent)
@@ -7906,6 +7999,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret;
int ret2;
bool need_abort = false;
+ bool logs_pinned = false;
struct fscrypt_name old_fname, new_fname;
struct fscrypt_str *old_name, *new_name;
@@ -8029,6 +8123,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
inode_inc_iversion(new_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
+ new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not for
+ * root entries) pin the log early to prevent any concurrent
+ * task from logging the directory after we removed the old
+ * entries and before we add the new entries, otherwise that
+ * task can sync a log without any entry for the inodes we are
+ * renaming and therefore replaying that log, if a power failure
+ * happens after syncing the log, would result in deleting the
+ * inodes.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure the that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent) {
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
@@ -8086,30 +8205,23 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
/*
- * Now pin the logs of the roots. We do it to ensure that no other task
- * can sync the logs while we are in progress with the rename, because
- * that could result in an inconsistency in case any of the inodes that
- * are part of this rename operation were logged before.
+ * Do the log updates for all inodes.
+ *
+ * If either entry is for a root we don't need to update the logs since
+ * we've called btrfs_set_log_full_commit() before.
*/
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(dest);
-
- /* Do the log updates for all inodes. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned) {
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
old_rename_ctx.index, new_dentry->d_parent);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
new_rename_ctx.index, old_dentry->d_parent);
+ }
- /* Now unpin the logs. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+out_fail:
+ if (logs_pinned) {
btrfs_end_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_end_log_trans(dest);
-out_fail:
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -8159,6 +8271,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
struct fscrypt_name old_fname, new_fname;
+ bool logs_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -8293,6 +8406,29 @@ static int btrfs_rename(struct mnt_idmap *idmap,
inode_inc_iversion(old_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not a
+ * root entry) pin the log to prevent any concurrent task from
+ * logging the directory after we removed the old entry and
+ * before we add the new entry, otherwise that task can sync
+ * a log without any entry for the inode we are renaming and
+ * therefore replaying that log, if a power failure happens
+ * after syncing the log, would result in deleting the inode.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure the that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
@@ -8341,7 +8477,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned)
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
rename_ctx.index, new_dentry->d_parent);
@@ -8357,6 +8493,10 @@ static int btrfs_rename(struct mnt_idmap *idmap,
}
}
out_fail:
+ if (logs_pinned) {
+ btrfs_end_log_trans(root);
+ btrfs_end_log_trans(dest);
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -9592,8 +9732,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- (1 << BTRFS_ORDERED_ENCODED) |
- (1 << BTRFS_ORDERED_COMPRESSED));
+ (1U << BTRFS_ORDERED_ENCODED) |
+ (1U << BTRFS_ORDERED_COMPRESSED));
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -10061,6 +10201,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
ret = -EINTR;
goto out;
}
+
+ cond_resched();
}
if (bsi.block_len)