diff options
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r-- | fs/btrfs/tree-log.c | 288 |
1 files changed, 141 insertions, 147 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 254c2ee43aae..d90695c1ab6c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -19,6 +19,7 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -104,6 +105,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dirid, int del_all); +static void wait_log_commit(struct btrfs_root *root, int transid); /* * tree logging is a special write ahead log used to make sure that @@ -139,7 +141,9 @@ static int start_log_trans(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *tree_root = fs_info->tree_root; + const bool zoned = btrfs_is_zoned(fs_info); int ret = 0; + bool created = false; /* * First check if the log root tree was already created. If not, create @@ -149,8 +153,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&tree_root->log_mutex); if (!fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, fs_info); - if (!ret) + if (!ret) { set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); + created = true; + } } mutex_unlock(&tree_root->log_mutex); if (ret) @@ -159,12 +165,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); +again: if (root->log_root) { + int index = (root->log_transid + 1) % 2; + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; goto out; } + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } + if (!root->log_start_pid) { clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); root->log_start_pid = current->pid; @@ -172,6 +186,17 @@ static int start_log_trans(struct btrfs_trans_handle *trans, set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } } else { + /* + * This means fs_info->log_root_tree was already created + * for some other FS trees. Do the full commit not to mix + * nodes from multiple log transactions to do sequential + * writing. + */ + if (zoned && !created) { + ret = -EAGAIN; + goto out; + } + ret = btrfs_add_log_tree(trans, root); if (ret) goto out; @@ -200,14 +225,22 @@ out: */ static int join_running_log_trans(struct btrfs_root *root) { + const bool zoned = btrfs_is_zoned(root->fs_info); int ret = -ENOENT; if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) return ret; mutex_lock(&root->log_mutex); +again: if (root->log_root) { + int index = (root->log_transid + 1) % 2; + ret = 0; + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } atomic_inc(&root->log_writers); } mutex_unlock(&root->log_mutex); @@ -2752,6 +2785,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); return ret; } + btrfs_redirty_list_add( + trans->transaction, next); } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); @@ -3085,6 +3120,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ blk_start_plug(&plug); ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); + /* + * -EAGAIN happens when someone, e.g., a concurrent transaction + * commit, writes a dirty extent in this tree-log commit. This + * concurrent write will create a hole writing out the extents, + * and we cannot proceed on a zoned filesystem, requiring + * sequential writing. While we can bail out to a full commit + * here, but we can continue hoping the concurrent writing fills + * the hole. + */ + if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) + ret = 0; if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret); @@ -3127,6 +3173,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); root_log_ctx.log_transid = log_root_tree->log_transid; + if (btrfs_is_zoned(fs_info)) { + mutex_lock(&fs_info->tree_root->log_mutex); + if (!log_root_tree->node) { + ret = btrfs_alloc_log_tree_node(trans, log_root_tree); + if (ret) { + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } + } + mutex_unlock(&fs_info->tree_root->log_mutex); + } + /* * Now we are safe to update the log_root_tree because we're under the * log_mutex, and we're a current writer so we're holding the commit @@ -3194,7 +3253,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, &log_root_tree->dirty_log_pages, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); - if (ret) { + /* + * As described above, -EAGAIN indicates a hole in the extents. We + * cannot wait for these write outs since the waiting cause a + * deadlock. Bail out to the full commit instead. + */ + if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { + btrfs_set_log_full_commit(trans); + btrfs_wait_tree_log_extents(log, mark); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } else if (ret) { btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); @@ -3285,17 +3354,22 @@ static void free_log_tree(struct btrfs_trans_handle *trans, .process_func = process_one_buffer }; - ret = walk_log_tree(trans, log, &wc); - if (ret) { - if (trans) - btrfs_abort_transaction(trans, ret); - else - btrfs_handle_fs_error(log->fs_info, ret, NULL); + if (log->node) { + ret = walk_log_tree(trans, log, &wc); + if (ret) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(log->fs_info, ret, NULL); + } } clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); extent_io_tree_release(&log->log_csum_range); + + if (trans && log->node) + btrfs_redirty_list_add(trans->transaction, log->node); btrfs_put_root(log); } @@ -3379,7 +3453,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_path *path; int ret; int err = 0; - int bytes_del = 0; u64 dir_ino = btrfs_ino(dir); if (!inode_logged(trans, dir)) @@ -3406,7 +3479,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); - bytes_del += name_len; if (ret) { err = ret; goto fail; @@ -3421,46 +3493,17 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); - bytes_del += name_len; if (ret) { err = ret; goto fail; } } - /* update the directory size in the log to reflect the names - * we have removed + /* + * We do not need to update the size field of the directory's inode item + * because on log replay we update the field to reflect all existing + * entries in the directory (see overwrite_item()). */ - if (bytes_del) { - struct btrfs_key key; - - key.objectid = dir_ino; - key.offset = 0; - key.type = BTRFS_INODE_ITEM_KEY; - btrfs_release_path(path); - - ret = btrfs_search_slot(trans, log, &key, path, 0, 1); - if (ret < 0) { - err = ret; - goto fail; - } - if (ret == 0) { - struct btrfs_inode_item *item; - u64 i_size; - - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_item); - i_size = btrfs_inode_size(path->nodes[0], item); - if (i_size > bytes_del) - i_size -= bytes_del; - else - i_size = 0; - btrfs_set_inode_size(path->nodes[0], item, i_size); - btrfs_mark_buffer_dirty(path->nodes[0]); - } else - ret = 0; - btrfs_release_path(path); - } fail: btrfs_free_path(path); out_unlock: @@ -3889,7 +3932,14 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_timespec_nsec(&token, &item->ctime, inode->i_ctime.tv_nsec); - btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); + /* + * We do not need to set the nbytes field, in fact during a fast fsync + * its value may not even be correct, since a fast fsync does not wait + * for ordered extent completion, which is where we update nbytes, it + * only waits for writeback to complete. During log replay as we find + * file extent items and replay them, we adjust the nbytes field of the + * inode item in subvolume tree as needed (see overwrite_item()). + */ btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); @@ -5290,12 +5340,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } /* + * This is for cases where logging a directory could result in losing a + * a file after replaying the log. For example, if we move a file from a + * directory A to a directory B, then fsync directory A, we have no way + * to known the file was moved from A to B, so logging just A would + * result in losing the file after a log replay. + */ + if (S_ISDIR(inode->vfs_inode.i_mode) && + inode_only == LOG_INODE_ALL && + inode->last_unlink_trans >= trans->transid) { + btrfs_set_log_full_commit(trans); + err = 1; + goto out_unlock; + } + + /* * a brute force approach to making sure we get the most uptodate * copies of everything. */ if (S_ISDIR(inode->vfs_inode.i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); if (inode_only == LOG_INODE_EXISTS) max_key_type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key_type); @@ -5452,96 +5518,31 @@ out_unlock: } /* - * Check if we must fallback to a transaction commit when logging an inode. - * This must be called after logging the inode and is used only in the context - * when fsyncing an inode requires the need to log some other inode - in which - * case we can't lock the i_mutex of each other inode we need to log as that - * can lead to deadlocks with concurrent fsync against other inodes (as we can - * log inodes up or down in the hierarchy) or rename operations for example. So - * we take the log_mutex of the inode after we have logged it and then check for - * its last_unlink_trans value - this is safe because any task setting - * last_unlink_trans must take the log_mutex and it must do this before it does - * the actual unlink operation, so if we do this check before a concurrent task - * sets last_unlink_trans it means we've logged a consistent version/state of - * all the inode items, otherwise we are not sure and must do a transaction - * commit (the concurrent task might have only updated last_unlink_trans before - * we logged the inode or it might have also done the unlink). + * Check if we need to log an inode. This is used in contexts where while + * logging an inode we need to log another inode (either that it exists or in + * full mode). This is used instead of btrfs_inode_in_log() because the later + * requires the inode to be in the log and have the log transaction committed, + * while here we do not care if the log transaction was already committed - our + * caller will commit the log later - and we want to avoid logging an inode + * multiple times when multiple tasks have joined the same log transaction. */ -static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) +static bool need_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) { - bool ret = false; - - mutex_lock(&inode->log_mutex); - if (inode->last_unlink_trans >= trans->transid) { - /* - * Make sure any commits to the log are forced to be full - * commits. - */ - btrfs_set_log_full_commit(trans); - ret = true; - } - mutex_unlock(&inode->log_mutex); - - return ret; -} - -/* - * follow the dentry parent pointers up the chain and see if any - * of the directories in it require a full commit before they can - * be logged. Returns zero if nothing special needs to be done or 1 if - * a full commit is required. - */ -static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, - struct dentry *parent, - struct super_block *sb) -{ - int ret = 0; - struct dentry *old_parent = NULL; - /* - * for regular files, if its inode is already on disk, we don't - * have to worry about the parents at all. This is because - * we can use the last_unlink_trans field to record renames - * and other fun in this file. + * If this inode does not have new/updated/deleted xattrs since the last + * time it was logged and is flagged as logged in the current transaction, + * we can skip logging it. As for new/deleted names, those are updated in + * the log by link/unlink/rename operations. + * In case the inode was logged and then evicted and reloaded, its + * logged_trans will be 0, in which case we have to fully log it since + * logged_trans is a transient field, not persisted. */ - if (S_ISREG(inode->vfs_inode.i_mode) && - inode->generation < trans->transid && - inode->last_unlink_trans < trans->transid) - goto out; - - if (!S_ISDIR(inode->vfs_inode.i_mode)) { - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - goto out; - inode = BTRFS_I(d_inode(parent)); - } - - while (1) { - if (btrfs_must_commit_transaction(trans, inode)) { - ret = 1; - break; - } - - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - break; - - if (IS_ROOT(parent)) { - inode = BTRFS_I(d_inode(parent)); - if (btrfs_must_commit_transaction(trans, inode)) - ret = 1; - break; - } - - parent = dget_parent(parent); - dput(old_parent); - old_parent = parent; - inode = BTRFS_I(d_inode(parent)); + if (inode->logged_trans == trans->transid && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return false; - } - dput(old_parent); -out: - return ret; + return true; } struct btrfs_dir_list { @@ -5671,7 +5672,7 @@ process_leaf: goto next_dir_inode; } - if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { + if (!need_log_inode(trans, BTRFS_I(di_inode))) { btrfs_add_delayed_iput(di_inode); break; } @@ -5681,9 +5682,6 @@ process_leaf: log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), log_mode, ctx); - if (!ret && - btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) - ret = 1; btrfs_add_delayed_iput(di_inode); if (ret) goto next_dir_inode; @@ -5821,13 +5819,15 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, goto out; } + if (!need_log_inode(trans, BTRFS_I(dir_inode))) { + btrfs_add_delayed_iput(dir_inode); + continue; + } + if (ctx) ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), LOG_INODE_ALL, ctx); - if (!ret && - btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) - ret = 1; if (!ret && ctx && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, BTRFS_I(dir_inode), ctx); @@ -5872,7 +5872,8 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation >= trans->transid) + if (BTRFS_I(inode)->generation >= trans->transid && + need_log_inode(trans, BTRFS_I(inode))) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); @@ -5926,7 +5927,8 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (root != inode->root) break; - if (inode->generation >= trans->transid) { + if (inode->generation >= trans->transid && + need_log_inode(trans, inode)) { ret = btrfs_log_inode(trans, root, inode, LOG_INODE_EXISTS, ctx); if (ret) @@ -6041,12 +6043,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct super_block *sb; int ret = 0; bool log_dentries = false; - sb = inode->vfs_inode.i_sb; - if (btrfs_test_opt(fs_info, NOTREELOG)) { ret = 1; goto end_no_trans; @@ -6057,10 +6056,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - ret = check_parent_dirs_for_sync(trans, inode, parent, sb); - if (ret) - goto end_no_trans; - /* * Skip already logged inodes or inodes corresponding to tmpfiles * (since logging them is pointless, a link count of 0 means they @@ -6307,8 +6302,7 @@ again: * root->objectid_mutex is not acquired as log replay * could only happen during mount. */ - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); + ret = btrfs_init_root_free_objectid(root); } wc.replay_dest->log_root = NULL; |