From ab7bb61092308e83130b8d15725aee1672991d65 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 29 Jul 2015 11:51:01 +1000 Subject: xfs: xfs_bunmapi() does not need XFS_BMAPI_METADATA flag xfs_bunmapi() doesn't care what type of extent is being freed and does not look at the XFS_BMAPI_METADATA flag at all. As such we can remove the XFS_BMAPI_METADATA from all callers that use it. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_remote.c | 5 ++--- fs/xfs/libxfs/xfs_da_btree.c | 4 ++-- fs/xfs/libxfs/xfs_dir2.c | 33 +++++++++++++++------------------ fs/xfs/xfs_symlink.c | 2 +- 4 files changed, 20 insertions(+), 24 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 20de88d1bf86..707362eab705 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -594,9 +594,8 @@ xfs_attr_rmtval_remove( xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - 1, args->firstblock, args->flist, - &done); + XFS_BMAPI_ATTRFORK, 1, args->firstblock, + args->flist, &done); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 2385f8cd08ab..2ae91e89f6b4 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2351,8 +2351,8 @@ xfs_da_shrink_inode( * the last block to the place we want to kill. */ error = xfs_bunmapi(tp, dp, dead_blkno, count, - xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, - 0, args->firstblock, args->flist, &done); + xfs_bmapi_aflag(w), 0, args->firstblock, + args->flist, &done); if (error == -ENOSPC) { if (w != XFS_DATA_FORK) break; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index a69fb3a1e161..e0ba97610f01 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -674,25 +674,22 @@ xfs_dir2_shrink_inode( mp = dp->i_mount; tp = args->trans; da = xfs_dir2_db_to_da(args->geo, db); - /* - * Unmap the fsblock(s). - */ - if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, - XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, - &done))) { + + /* Unmap the fsblock(s). */ + error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0, + args->firstblock, args->flist, &done); + if (error) { /* - * ENOSPC actually can happen if we're in a removename with - * no space reservation, and the resulting block removal - * would cause a bmap btree split or conversion from extents - * to btree. This can only happen for un-fragmented - * directory blocks, since you need to be punching out - * the middle of an extent. - * In this case we need to leave the block in the file, - * and not binval it. - * So the block has to be in a consistent empty state - * and appropriately logged. - * We don't free up the buffer, the caller can tell it - * hasn't happened since it got an error back. + * ENOSPC actually can happen if we're in a removename with no + * space reservation, and the resulting block removal would + * cause a bmap btree split or conversion from extents to btree. + * This can only happen for un-fragmented directory blocks, + * since you need to be punching out the middle of an extent. + * In this case we need to leave the block in the file, and not + * binval it. So the block has to be in a consistent empty + * state and appropriately logged. We don't free up the buffer, + * the caller can tell it hasn't happened since it got an error + * back. */ return error; } diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 4be27b0210af..05c44bf51b5f 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -501,7 +501,7 @@ xfs_inactive_symlink_rmt( /* * Unmap the dead block(s) to the free_list. */ - error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, + error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps, &first_block, &free_list, &done); if (error) goto error_bmap_cancel; -- cgit v1.2.3 From 4703da7b78776140477a023c99683d3be84b7fca Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 29 Jul 2015 11:51:01 +1000 Subject: xfs: close xc_cil list_empty() races with cil commit sequence We have seen somewhat rare reports of the following assert from xlog_cil_push_background() failing during ltp tests or somewhat innocuous desktop root fs workloads (e.g., virt operations, initramfs construction): ASSERT(!list_empty(&cil->xc_cil)); The reasoning behind the assert is that the transaction has inserted items to the CIL and hit background push codepath all with cil->xc_ctx_lock held for reading. This locks out background commit from emptying the CIL, which acquires the lock for writing. Therefore, the reasoning is that the items previously inserted in the CIL should still be present. The cil->xc_ctx_lock read lock is not sufficient to protect the xc_cil list, however, due to how CIL insertion is handled. xlog_cil_insert_items() inserts and reorders the dirty transaction items to the tail of the CIL under xc_cil_lock. It uses list_move_tail() to achieve insertion and reordering in the same block of code. This function removes and reinserts an item to the tail of the list. If a transaction commits an item that was already logged and thus already resides in the CIL, and said item is the sole item on the list, the removal and reinsertion creates a temporary state where the list is actually empty. This state is not valid and thus should never be observed by concurrent transaction commit-side checks in the circumstances outlined above. We do not want to acquire the xc_cil_lock in all of these instances as it was previously removed and replaced with a separate push lock for performance reasons. Therefore, close any races with list_empty() on the insertion side by ensuring that the list is never in a transient empty state. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_cil.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index abc2ccbff739..4e7649351f5a 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -307,7 +307,13 @@ xlog_cil_insert_items( if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - list_move_tail(&lip->li_cil, &cil->xc_cil); + /* + * Only move the item if it isn't already at the tail. This is + * to prevent a transient list_empty() state when reinserting + * an item that is already the only item in the CIL. + */ + if (!list_is_last(&lip->li_cil, &cil->xc_cil)) + list_move_tail(&lip->li_cil, &cil->xc_cil); } /* account for space used by new iovec headers */ -- cgit v1.2.3 From 89cebc8477290b152618ffa110bbeae340d50900 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 29 Jul 2015 11:51:10 +1000 Subject: xfs: validate transaction header length on log recovery When log recovery hits a new transaction, it copies the transaction header from the expected location in the log to the in-core structure using the length from the op record header. This length is validated to ensure it doesn't exceed the length of the record, but not against the expected size of a transaction header (and thus the size of the in-core structure). If the on-disk length is corrupted, the associated memcpy() can overflow, write to unrelated memory and lead to crashes. This has been reproduced via filesystem fuzzing. The code currently handles the possibility that the transaction header is split across two op records. Neither instance accounts for corruption where the op record length might be larger than the in-core transaction header. Update both sites to detect such corruption, warn and return an error from log recovery. Also add some comments and assert that if the record is split, the copy of the second portion is less than a full header. Otherwise, this suggests the copy of the second portion could have overwritten bits from the first and thus that something could be wrong. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 01dd228ca05e..493a8ef146fc 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3380,14 +3380,24 @@ xlog_recover_add_to_cont_trans( char *ptr, *old_ptr; int old_len; + /* + * If the transaction is empty, the header was split across this and the + * previous record. Copy the rest of the header. + */ if (list_empty(&trans->r_itemq)) { - /* finish copying rest of trans header */ + ASSERT(len < sizeof(struct xfs_trans_header)); + if (len > sizeof(struct xfs_trans_header)) { + xfs_warn(log->l_mp, "%s: bad header length", __func__); + return -EIO; + } + xlog_recover_add_item(&trans->r_itemq); ptr = (char *)&trans->r_theader + - sizeof(xfs_trans_header_t) - len; + sizeof(struct xfs_trans_header) - len; memcpy(ptr, dp, len); return 0; } + /* take the tail entry */ item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); @@ -3436,7 +3446,19 @@ xlog_recover_add_to_trans( ASSERT(0); return -EIO; } - if (len == sizeof(xfs_trans_header_t)) + + if (len > sizeof(struct xfs_trans_header)) { + xfs_warn(log->l_mp, "%s: bad header length", __func__); + ASSERT(0); + return -EIO; + } + + /* + * The transaction header can be arbitrarily split across op + * records. If we don't have the whole thing here, copy what we + * do have and handle the rest in the next record. + */ + if (len == sizeof(struct xfs_trans_header)) xlog_recover_add_item(&trans->r_itemq); memcpy(&trans->r_theader, dp, len); return 0; -- cgit v1.2.3 From f41febd2eb5bdaa1c5685fe8a9b09276645013bc Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 29 Jul 2015 11:52:04 +1000 Subject: xfs: Use consistent logging message prefixes The second and subsequent lines of multi-line logging messages are not prefixed with the same information as the first line. Separate messages with newlines into multiple calls to ensure consistent prefixing and allow easier grep use. Signed-off-by: Joe Perches Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2_node.c | 3 +-- fs/xfs/libxfs/xfs_sb.c | 14 +++++++----- fs/xfs/xfs_buf.c | 5 +++-- fs/xfs/xfs_log.c | 50 +++++++++++++++++++++---------------------- fs/xfs/xfs_log_recover.c | 8 ++++--- 5 files changed, 43 insertions(+), 37 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 41b80d3d3877..d68326a0d51c 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -1845,8 +1845,7 @@ xfs_dir2_node_addname_int( if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { xfs_alert(mp, - "%s: dir ino %llu needed freesp block %lld for\n" - " data block %lld, got %lld ifbno %llu lastfbno %d", +"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d", __func__, (unsigned long long)dp->i_ino, (long long)dp->d_ops->db_to_fdb( args->geo, dbno), diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index df9851c46b5c..a3f4504b1f1e 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -131,10 +131,11 @@ xfs_mount_validate_sb( if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, -"Superblock has unknown compatible features (0x%x) enabled.\n" -"Using a more recent kernel is recommended.", +"Superblock has unknown compatible features (0x%x) enabled.", (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN)); + xfs_warn(mp, +"Using a more recent kernel is recommended."); } if (xfs_sb_has_ro_compat_feature(sbp, @@ -145,18 +146,21 @@ xfs_mount_validate_sb( XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { xfs_warn(mp, -"Attempted to mount read-only compatible filesystem read-write.\n" +"Attempted to mount read-only compatible filesystem read-write."); + xfs_warn(mp, "Filesystem can only be safely mounted read only."); + return -EINVAL; } } if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { xfs_warn(mp, -"Superblock has unknown incompatible features (0x%x) enabled.\n" -"Filesystem can not be safely mounted by this kernel.", +"Superblock has unknown incompatible features (0x%x) enabled.", (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_UNKNOWN)); + xfs_warn(mp, +"Filesystem can not be safely mounted by this kernel."); return -EINVAL; } } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index a4b7d92e946c..d7dbd8120aaa 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1533,9 +1533,10 @@ xfs_wait_buftarg( list_del_init(&bp->b_lru); if (bp->b_flags & XBF_WRITE_FAIL) { xfs_alert(btp->bt_mount, -"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n" -"Please run xfs_repair to determine the extent of the problem.", +"Corruption Alert: Buffer at block 0x%llx had permanent write failures!", (long long)bp->b_bn); + xfs_alert(btp->bt_mount, +"Please run xfs_repair to determine the extent of the problem."); } xfs_buf_rele(bp); } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 08d4fe46f0fa..6b5a84a05340 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -668,9 +668,9 @@ xfs_log_mount( ASSERT(0); goto out_free_log; } + xfs_crit(mp, "Log size out of supported range."); xfs_crit(mp, -"Log size out of supported range. Continuing onwards, but if log hangs are\n" -"experienced then please report this message in the bug report."); +"Continuing onwards, but if log hangs are experienced then please report this message in the bug report."); } /* @@ -1142,11 +1142,13 @@ xlog_space_left( * In this case we just want to return the size of the * log as the amount of space left. */ + xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); xfs_alert(log->l_mp, - "xlog_space_left: head behind tail\n" - " tail_cycle = %d, tail_bytes = %d\n" - " GH cycle = %d, GH bytes = %d", - tail_cycle, tail_bytes, head_cycle, head_bytes); + " tail_cycle = %d, tail_bytes = %d", + tail_cycle, tail_bytes); + xfs_alert(log->l_mp, + " GH cycle = %d, GH bytes = %d", + head_cycle, head_bytes); ASSERT(0); free_bytes = log->l_logsize; } @@ -2028,26 +2030,24 @@ xlog_print_tic_res( "SWAPEXT" }; - xfs_warn(mp, - "xlog_write: reservation summary:\n" - " trans type = %s (%u)\n" - " unit res = %d bytes\n" - " current res = %d bytes\n" - " total reg = %u bytes (o/flow = %u bytes)\n" - " ophdrs = %u (ophdr space = %u bytes)\n" - " ophdr + reg = %u bytes\n" - " num regions = %u", - ((ticket->t_trans_type <= 0 || - ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? + xfs_warn(mp, "xlog_write: reservation summary:"); + xfs_warn(mp, " trans type = %s (%u)", + ((ticket->t_trans_type <= 0 || + ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), - ticket->t_trans_type, - ticket->t_unit_res, - ticket->t_curr_res, - ticket->t_res_arr_sum, ticket->t_res_o_flow, - ticket->t_res_num_ophdrs, ophdr_spc, - ticket->t_res_arr_sum + - ticket->t_res_o_flow + ophdr_spc, - ticket->t_res_num); + ticket->t_trans_type); + xfs_warn(mp, " unit res = %d bytes", + ticket->t_unit_res); + xfs_warn(mp, " current res = %d bytes", + ticket->t_curr_res); + xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)", + ticket->t_res_arr_sum, ticket->t_res_o_flow); + xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)", + ticket->t_res_num_ophdrs, ophdr_spc); + xfs_warn(mp, " ophdr + reg = %u bytes", + ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc); + xfs_warn(mp, " num regions = %u", + ticket->t_res_num); for (i = 0; i < ticket->t_res_num; i++) { uint r_type = ticket->t_res_arr[i].r_type; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 493a8ef146fc..8b00730344ba 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4549,11 +4549,13 @@ xlog_recover( xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { xfs_warn(log->l_mp, -"Superblock has unknown incompatible log features (0x%x) enabled.\n" -"The log can not be fully and/or safely recovered by this kernel.\n" -"Please recover the log on a kernel that supports the unknown features.", +"Superblock has unknown incompatible log features (0x%x) enabled.", (log->l_mp->m_sb.sb_features_log_incompat & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); + xfs_warn(log->l_mp, +"The log can not be fully and/or safely recovered by this kernel."); + xfs_warn(log->l_mp, +"Please recover the log on a kernel that supports the unknown features."); return -EINVAL; } -- cgit v1.2.3 From d6077aa339d6580d12bd1089231eea2940383e32 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Jul 2015 11:52:08 +1000 Subject: xfs: Remove duplicate jumps to the same label xfs_create() and xfs_create_tmpfile() have useless jumps to identical labels. Simplify them. Signed-off-by: Jan Kara Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3da9f4da4f3d..d22a984d8470 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1175,11 +1175,8 @@ xfs_create( */ error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, resblks > 0, &ip, &committed); - if (error) { - if (error == -ENOSPC) - goto out_trans_cancel; + if (error) goto out_trans_cancel; - } /* * Now we join the directory inode to the transaction. We do not do it @@ -1318,11 +1315,8 @@ xfs_create_tmpfile( error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, resblks > 0, &ip, NULL); - if (error) { - if (error == -ENOSPC) - goto out_trans_cancel; + if (error) goto out_trans_cancel; - } if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); -- cgit v1.2.3 From 1cfc4a9cf89d23727c6678170aa5949a676fc566 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 29 Jul 2015 11:52:08 +1000 Subject: libxfs: add xfs_bit.c The header side of xfs_bit.c is already in libxfs, and the sparse inode code requires the xfs_next_bit() function so pull in the xfs_bit.c file so that a sparse inode enabled libxfs compiles cleanly in userspace. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/Makefile | 2 +- fs/xfs/libxfs/xfs_bit.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_bit.c | 118 ------------------------------------------------ 3 files changed, 119 insertions(+), 119 deletions(-) create mode 100644 fs/xfs/libxfs/xfs_bit.c delete mode 100644 fs/xfs/xfs_bit.c (limited to 'fs/xfs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index df6828570e87..a096841bd06c 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -33,6 +33,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_attr.o \ xfs_attr_leaf.o \ xfs_attr_remote.o \ + xfs_bit.o \ xfs_bmap.o \ xfs_bmap_btree.o \ xfs_btree.o \ @@ -63,7 +64,6 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ xfs-y += xfs_aops.o \ xfs_attr_inactive.o \ xfs_attr_list.o \ - xfs_bit.o \ xfs_bmap_util.o \ xfs_buf.o \ xfs_dir2_readdir.o \ diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c new file mode 100644 index 000000000000..0e8885a59646 --- /dev/null +++ b/fs/xfs/libxfs/xfs_bit.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_log_format.h" +#include "xfs_bit.h" + +/* + * XFS bit manipulation routines, used in non-realtime code. + */ + +/* + * Return whether bitmap is empty. + * Size is number of words in the bitmap, which is padded to word boundary + * Returns 1 for empty, 0 for non-empty. + */ +int +xfs_bitmap_empty(uint *map, uint size) +{ + uint i; + uint ret = 0; + + for (i = 0; i < size; i++) { + ret |= map[i]; + } + + return (ret == 0); +} + +/* + * Count the number of contiguous bits set in the bitmap starting with bit + * start_bit. Size is the size of the bitmap in words. + */ +int +xfs_contig_bits(uint *map, uint size, uint start_bit) +{ + uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT); + uint result = 0; + uint tmp; + + size <<= BIT_TO_WORD_SHIFT; + + ASSERT(start_bit < size); + size -= start_bit & ~(NBWORD - 1); + start_bit &= (NBWORD - 1); + if (start_bit) { + tmp = *p++; + /* set to one first offset bits prior to start */ + tmp |= (~0U >> (NBWORD-start_bit)); + if (tmp != ~0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + while (size) { + if ((tmp = *p++) != ~0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + return result - start_bit; +found: + return result + ffz(tmp) - start_bit; +} + +/* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + * + * Size is the number of words, not bytes, in the bitmap. + */ +int xfs_next_bit(uint *map, uint size, uint start_bit) +{ + uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT); + uint result = start_bit & ~(NBWORD - 1); + uint tmp; + + size <<= BIT_TO_WORD_SHIFT; + + if (start_bit >= size) + return -1; + size -= result; + start_bit &= (NBWORD - 1); + if (start_bit) { + tmp = *p++; + /* set to zero first offset bits prior to start */ + tmp &= (~0U << start_bit); + if (tmp != 0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + while (size) { + if ((tmp = *p++) != 0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + return -1; +found: + return result + ffs(tmp) - 1; +} diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c deleted file mode 100644 index 0e8885a59646..000000000000 --- a/fs/xfs/xfs_bit.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_log_format.h" -#include "xfs_bit.h" - -/* - * XFS bit manipulation routines, used in non-realtime code. - */ - -/* - * Return whether bitmap is empty. - * Size is number of words in the bitmap, which is padded to word boundary - * Returns 1 for empty, 0 for non-empty. - */ -int -xfs_bitmap_empty(uint *map, uint size) -{ - uint i; - uint ret = 0; - - for (i = 0; i < size; i++) { - ret |= map[i]; - } - - return (ret == 0); -} - -/* - * Count the number of contiguous bits set in the bitmap starting with bit - * start_bit. Size is the size of the bitmap in words. - */ -int -xfs_contig_bits(uint *map, uint size, uint start_bit) -{ - uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT); - uint result = 0; - uint tmp; - - size <<= BIT_TO_WORD_SHIFT; - - ASSERT(start_bit < size); - size -= start_bit & ~(NBWORD - 1); - start_bit &= (NBWORD - 1); - if (start_bit) { - tmp = *p++; - /* set to one first offset bits prior to start */ - tmp |= (~0U >> (NBWORD-start_bit)); - if (tmp != ~0U) - goto found; - result += NBWORD; - size -= NBWORD; - } - while (size) { - if ((tmp = *p++) != ~0U) - goto found; - result += NBWORD; - size -= NBWORD; - } - return result - start_bit; -found: - return result + ffz(tmp) - start_bit; -} - -/* - * This takes the bit number to start looking from and - * returns the next set bit from there. It returns -1 - * if there are no more bits set or the start bit is - * beyond the end of the bitmap. - * - * Size is the number of words, not bytes, in the bitmap. - */ -int xfs_next_bit(uint *map, uint size, uint start_bit) -{ - uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT); - uint result = start_bit & ~(NBWORD - 1); - uint tmp; - - size <<= BIT_TO_WORD_SHIFT; - - if (start_bit >= size) - return -1; - size -= result; - start_bit &= (NBWORD - 1); - if (start_bit) { - tmp = *p++; - /* set to zero first offset bits prior to start */ - tmp &= (~0U << start_bit); - if (tmp != 0U) - goto found; - result += NBWORD; - size -= NBWORD; - } - while (size) { - if ((tmp = *p++) != 0U) - goto found; - result += NBWORD; - size -= NBWORD; - } - return -1; -found: - return result + ffs(tmp) - 1; -} -- cgit v1.2.3 From ce748eaa65f2e9392ba82726503c8d994ffd6393 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 29 Jul 2015 11:53:31 +1000 Subject: xfs: create new metadata UUID field and incompat flag This adds a new superblock field, sb_meta_uuid. If set, along with a new incompat flag, the code will use that field on a V5 filesystem to compare to metadata UUIDs, which allows us to change the user- visible UUID at will. Userspace handles the setting and clearing of the incompat flag as appropriate, as the UUID gets changed; i.e. setting the user-visible UUID back to the original UUID (as stored in the new field) will remove the incompatible feature flag. If the incompat flag is not set, this copies the user-visible UUID into into the meta_uuid slot in memory when the superblock is read from disk; the meta_uuid field is not written back to disk in this case. The remainder of this patch simply switches verifiers, initializers, etc to use the new sb_meta_uuid field. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc.c | 4 ++-- fs/xfs/libxfs/xfs_alloc_btree.c | 4 ++-- fs/xfs/libxfs/xfs_attr_leaf.c | 4 ++-- fs/xfs/libxfs/xfs_attr_remote.c | 4 ++-- fs/xfs/libxfs/xfs_bmap_btree.c | 5 +++-- fs/xfs/libxfs/xfs_btree.c | 10 ++++++---- fs/xfs/libxfs/xfs_da_btree.c | 4 ++-- fs/xfs/libxfs/xfs_dir2_block.c | 4 ++-- fs/xfs/libxfs/xfs_dir2_data.c | 4 ++-- fs/xfs/libxfs/xfs_dir2_leaf.c | 4 ++-- fs/xfs/libxfs/xfs_dir2_node.c | 4 ++-- fs/xfs/libxfs/xfs_dquot_buf.c | 4 ++-- fs/xfs/libxfs/xfs_format.h | 22 +++++++++++++++++++--- fs/xfs/libxfs/xfs_ialloc.c | 5 +++-- fs/xfs/libxfs/xfs_ialloc_btree.c | 2 +- fs/xfs/libxfs/xfs_inode_buf.c | 4 ++-- fs/xfs/libxfs/xfs_sb.c | 10 ++++++++++ fs/xfs/libxfs/xfs_symlink_remote.c | 4 ++-- 18 files changed, 66 insertions(+), 36 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index f9e9ffe6fb46..b7fc17ce8233 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -464,7 +464,7 @@ xfs_agfl_verify( struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); int i; - if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) return false; @@ -2260,7 +2260,7 @@ xfs_agf_verify( struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); if (xfs_sb_version_hascrc(&mp->m_sb) && - !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid)) + !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 59d521c09a17..90de071dd4c2 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -295,7 +295,7 @@ xfs_allocbt_verify( case cpu_to_be32(XFS_ABTB_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) return false; @@ -313,7 +313,7 @@ xfs_allocbt_verify( case cpu_to_be32(XFS_ABTC_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) return false; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index e9d401ce93bb..33df52d97ec7 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -262,7 +262,7 @@ xfs_attr3_leaf_verify( if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) return false; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) return false; @@ -1056,7 +1056,7 @@ xfs_attr3_leaf_create( hdr3->blkno = cpu_to_be64(bp->b_bn); hdr3->owner = cpu_to_be64(dp->i_ino); - uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); } else { diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 20de88d1bf86..eba0d1e91a93 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -100,7 +100,7 @@ xfs_attr3_rmt_verify( return false; if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) return false; - if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(rmt->rm_blkno) != bno) return false; @@ -217,7 +217,7 @@ xfs_attr3_rmt_hdr_set( rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC); rmt->rm_offset = cpu_to_be32(offset); rmt->rm_bytes = cpu_to_be32(size); - uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid); rmt->rm_owner = cpu_to_be64(ino); rmt->rm_blkno = cpu_to_be64(bno); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 2c44c8e50782..6b0cf6546a82 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -349,7 +349,8 @@ xfs_bmbt_to_bmdr( if (xfs_sb_version_hascrc(&mp->m_sb)) { ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); - ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, + &mp->m_sb.sb_meta_uuid)); ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); } else @@ -647,7 +648,7 @@ xfs_bmbt_verify( case cpu_to_be32(XFS_BMAP_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; - if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn) return false; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c72283dd8d44..f7d7ee7a2607 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -65,7 +65,8 @@ xfs_btree_check_lblock( if (xfs_sb_version_hascrc(&mp->m_sb)) { lblock_ok = lblock_ok && - uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) && + uuid_equal(&block->bb_u.l.bb_uuid, + &mp->m_sb.sb_meta_uuid) && block->bb_u.l.bb_blkno == cpu_to_be64( bp ? bp->b_bn : XFS_BUF_DADDR_NULL); } @@ -115,7 +116,8 @@ xfs_btree_check_sblock( if (xfs_sb_version_hascrc(&mp->m_sb)) { sblock_ok = sblock_ok && - uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) && + uuid_equal(&block->bb_u.s.bb_uuid, + &mp->m_sb.sb_meta_uuid) && block->bb_u.s.bb_blkno == cpu_to_be64( bp ? bp->b_bn : XFS_BUF_DADDR_NULL); } @@ -1000,7 +1002,7 @@ xfs_btree_init_block_int( if (flags & XFS_BTREE_CRC_BLOCKS) { buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); buf->bb_u.l.bb_owner = cpu_to_be64(owner); - uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid); buf->bb_u.l.bb_pad = 0; buf->bb_u.l.bb_lsn = 0; } @@ -1013,7 +1015,7 @@ xfs_btree_init_block_int( if (flags & XFS_BTREE_CRC_BLOCKS) { buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); buf->bb_u.s.bb_owner = cpu_to_be32(__owner); - uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); buf->bb_u.s.bb_lsn = 0; } } diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 2385f8cd08ab..e9f6709a3846 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -146,7 +146,7 @@ xfs_da3_node_verify( if (ichdr.magic != XFS_DA3_NODE_MAGIC) return false; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) return false; @@ -324,7 +324,7 @@ xfs_da3_node_create( ichdr.magic = XFS_DA3_NODE_MAGIC; hdr3->info.blkno = cpu_to_be64(bp->b_bn); hdr3->info.owner = cpu_to_be64(args->dp->i_ino); - uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { ichdr.magic = XFS_DA_NODE_MAGIC; } diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 9354e190b82e..4778d1dd511a 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -67,7 +67,7 @@ xfs_dir3_block_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) return false; - if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; @@ -157,7 +157,7 @@ xfs_dir3_block_init( hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); hdr3->blkno = cpu_to_be64(bp->b_bn); hdr3->owner = cpu_to_be64(dp->i_ino); - uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); return; } diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index de1ea16f5748..6a57fdbc63ef 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -220,7 +220,7 @@ xfs_dir3_data_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) return false; - if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; @@ -604,7 +604,7 @@ xfs_dir3_data_init( hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); hdr3->blkno = cpu_to_be64(bp->b_bn); hdr3->owner = cpu_to_be64(dp->i_ino); - uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); } else hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 106119955400..f300240ebb8d 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -160,7 +160,7 @@ xfs_dir3_leaf_verify( if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) return false; - if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) return false; @@ -310,7 +310,7 @@ xfs_dir3_leaf_init( : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); leaf3->info.blkno = cpu_to_be64(bp->b_bn); leaf3->info.owner = cpu_to_be64(owner); - uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid); + uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { memset(leaf, 0, sizeof(*leaf)); leaf->hdr.info.magic = cpu_to_be16(type); diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 41b80d3d3877..527b7337a43b 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -93,7 +93,7 @@ xfs_dir3_free_verify( if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) return false; - if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; @@ -226,7 +226,7 @@ xfs_dir3_free_get_buf( hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); hdr3->hdr.owner = cpu_to_be64(dp->i_ino); - uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid); } else hdr.magic = XFS_DIR2_FREE_MAGIC; dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr); diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 6fbf2d853a54..5331b7f0460c 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -163,7 +163,7 @@ xfs_dqcheck( d->dd_diskdq.d_id = cpu_to_be32(id); if (xfs_sb_version_hascrc(&mp->m_sb)) { - uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } @@ -198,7 +198,7 @@ xfs_dquot_buf_verify_crc( if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF)) return false; - if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_meta_uuid)) return false; } return true; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index a0ae572051de..9590a069e556 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -100,7 +100,7 @@ typedef struct xfs_sb { xfs_rfsblock_t sb_dblocks; /* number of data blocks */ xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ xfs_rtblock_t sb_rextents; /* number of realtime extents */ - uuid_t sb_uuid; /* file system unique id */ + uuid_t sb_uuid; /* user-visible file system unique id */ xfs_fsblock_t sb_logstart; /* starting block of log if internal */ xfs_ino_t sb_rootino; /* root inode number */ xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ @@ -174,6 +174,7 @@ typedef struct xfs_sb { xfs_ino_t sb_pquotino; /* project quota inode */ xfs_lsn_t sb_lsn; /* last write sequence */ + uuid_t sb_meta_uuid; /* metadata file system unique id */ /* must be padded to 64 bit alignment */ } xfs_sb_t; @@ -190,7 +191,7 @@ typedef struct xfs_dsb { __be64 sb_dblocks; /* number of data blocks */ __be64 sb_rblocks; /* number of realtime blocks */ __be64 sb_rextents; /* number of realtime extents */ - uuid_t sb_uuid; /* file system unique id */ + uuid_t sb_uuid; /* user-visible file system unique id */ __be64 sb_logstart; /* starting block of log if internal */ __be64 sb_rootino; /* root inode number */ __be64 sb_rbmino; /* bitmap inode for realtime extents */ @@ -260,6 +261,7 @@ typedef struct xfs_dsb { __be64 sb_pquotino; /* project quota inode */ __be64 sb_lsn; /* last write sequence */ + uuid_t sb_meta_uuid; /* metadata file system unique id */ /* must be padded to 64 bit alignment */ } xfs_dsb_t; @@ -458,9 +460,11 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ #define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ +#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE| \ - XFS_SB_FEAT_INCOMPAT_SPINODES) + XFS_SB_FEAT_INCOMPAT_SPINODES| \ + XFS_SB_FEAT_INCOMPAT_META_UUID) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -514,6 +518,18 @@ static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp) xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES); } +/* + * XFS_SB_FEAT_INCOMPAT_META_UUID indicates that the metadata UUID + * is stored separately from the user-visible UUID; this allows the + * user-visible UUID to be changed on V5 filesystems which have a + * filesystem UUID stamped into every piece of metadata. + */ +static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID); +} + /* * end of superblock version macros */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 66efc702452a..ce63e0431f3e 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -338,7 +338,8 @@ xfs_ialloc_inode_init( if (version == 3) { free->di_ino = cpu_to_be64(ino); ino++; - uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&free->di_uuid, + &mp->m_sb.sb_meta_uuid); xfs_dinode_calc_crc(mp, free); } else if (tp) { /* just log the inode core */ @@ -2500,7 +2501,7 @@ xfs_agi_verify( struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); if (xfs_sb_version_hascrc(&mp->m_sb) && - !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid)) + !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) return false; /* * Validate the magic number of the agi block. diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 674ad8f760be..f39b285beb19 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -239,7 +239,7 @@ xfs_inobt_verify( case cpu_to_be32(XFS_FIBT_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) return false; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 6526e7696184..268c00f4f83a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -304,7 +304,7 @@ xfs_dinode_verify( return false; if (be64_to_cpu(dip->di_ino) != ip->i_ino) return false; - if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return false; return true; } @@ -366,7 +366,7 @@ xfs_iread( if (xfs_sb_version_hascrc(&mp->m_sb)) { ip->i_d.di_version = 3; ip->i_d.di_ino = ip->i_ino; - uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid); } else ip->i_d.di_version = 2; return 0; diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index df9851c46b5c..0f5e08fe64a2 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -398,6 +398,14 @@ __xfs_sb_from_disk( to->sb_spino_align = be32_to_cpu(from->sb_spino_align); to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); + /* + * sb_meta_uuid is only on disk if it differs from sb_uuid and the + * feature flag is set; if not set we keep it only in memory. + */ + if (xfs_sb_version_hasmetauuid(to)) + uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); + else + uuid_copy(&to->sb_meta_uuid, &from->sb_uuid); /* Convert on-disk flags to in-memory flags? */ if (convert_xquota) xfs_sb_quota_from_disk(to); @@ -539,6 +547,8 @@ xfs_sb_to_disk( cpu_to_be32(from->sb_features_log_incompat); to->sb_spino_align = cpu_to_be32(from->sb_spino_align); to->sb_lsn = cpu_to_be64(from->sb_lsn); + if (xfs_sb_version_hasmetauuid(from)) + uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); } } diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index e7e26bd6468f..8f8af05b3f13 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -63,7 +63,7 @@ xfs_symlink_hdr_set( dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); dsl->sl_offset = cpu_to_be32(offset); dsl->sl_bytes = cpu_to_be32(size); - uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid); dsl->sl_owner = cpu_to_be64(ino); dsl->sl_blkno = cpu_to_be64(bp->b_bn); bp->b_ops = &xfs_symlink_buf_ops; @@ -107,7 +107,7 @@ xfs_symlink_verify( return false; if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) return false; - if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) return false; -- cgit v1.2.3 From 5e4b5386a2c29429add601c8cfb45bb10d80c490 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:50:12 +1000 Subject: xfs: disentagle EFI release from the extent count Release of the EFI either occurs based on the reference count or the extent count. The extent count used is either the count tracked in the EFI or EFD, depending on the particular situation. In either case, the count is initialized to the final value and thus always matches the current efi_next_extent value once the EFI is completely constructed. For example, the EFI extent count is increased as the extents are logged in xfs_bmap_finish() and the full free list is always completely processed. Therefore, the count is guaranteed to be complete once the EFI transaction is committed. The EFD uses the efd_nextents counter to release the EFI. This counter is initialized to the count of the EFI when the EFD is created. Thus the EFD, as currently used, has no concept of partial EFI release based on extent count. Given that the EFI extent count is always released in whole, use of the extent count for reference counting is unnecessary. Remove this level of the API and release the EFI based on the core reference count. The efi_next_extent counter remains because it is still used to track the slot to log the next extent to free. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_extfree_item.c | 21 +++++++++------------ fs/xfs/xfs_extfree_item.h | 1 + fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_trans.h | 1 - 4 files changed, 11 insertions(+), 14 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index adc8f8fdd145..6ff738fe331a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -149,7 +149,7 @@ xfs_efi_item_unpin( xfs_efi_item_free(efip); return; } - __xfs_efi_release(efip); + xfs_efi_release(efip); } /* @@ -307,18 +307,15 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) * by this efi item we can free the efi item. */ void -xfs_efi_release(xfs_efi_log_item_t *efip, - uint nextents) +xfs_efi_release( + struct xfs_efi_log_item *efip) { - ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); - if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { - /* recovery needs us to drop the EFI reference, too */ - if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) - __xfs_efi_release(efip); - + /* recovery needs us to drop the EFI reference, too */ + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) __xfs_efi_release(efip); - /* efip may now have been freed, do not reference it again. */ - } + + __xfs_efi_release(efip); + /* efip may now have been freed, do not reference it again. */ } static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) @@ -442,7 +439,7 @@ xfs_efd_item_committed( * EFI got unpinned and freed before the EFD got aborted. */ if (!(lip->li_flags & XFS_LI_ABORTED)) - xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents); + xfs_efi_release(efdp->efd_efip); xfs_efd_item_free(efdp); return (xfs_lsn_t)-1; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 0ffbce32d569..399562eaf4f5 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -77,5 +77,6 @@ xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *, int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt); void xfs_efi_item_free(xfs_efi_log_item_t *); +void xfs_efi_release(struct xfs_efi_log_item *); #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 01dd228ca05e..578bc2d7bac8 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3739,7 +3739,7 @@ xlog_recover_process_efi( * free the memory associated with it. */ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - xfs_efi_release(efip, efip->efi_format.efi_nextents); + xfs_efi_release(efip); return -EIO; } } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 3b21b4e5e467..f48e839334af 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -213,7 +213,6 @@ void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); -void xfs_efi_release(struct xfs_efi_log_item *, uint); void xfs_trans_log_efi_extent(xfs_trans_t *, struct xfs_efi_log_item *, xfs_fsblock_t, -- cgit v1.2.3 From d43ac29be7a174f93a3d26cc1e68668fe86b782f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:50:13 +1000 Subject: xfs: return committed status from xfs_trans_roll() Some callers need to make error handling decisions based on whether the current transaction successfully committed or not. Rename xfs_trans_roll(), add a new parameter and provide a wrapper to preserve existing callers. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_trans.c | 15 +++++++++++++-- fs/xfs/xfs_trans.h | 1 + 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 0582a27107d4..a0ab1dae9c31 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1019,9 +1019,10 @@ xfs_trans_cancel( * chunk we've been working on and get a new transaction to continue. */ int -xfs_trans_roll( +__xfs_trans_roll( struct xfs_trans **tpp, - struct xfs_inode *dp) + struct xfs_inode *dp, + int *committed) { struct xfs_trans *trans; struct xfs_trans_res tres; @@ -1052,6 +1053,7 @@ xfs_trans_roll( if (error) return error; + *committed = 1; trans = *tpp; /* @@ -1074,3 +1076,12 @@ xfs_trans_roll( xfs_trans_ijoin(trans, dp, 0); return 0; } + +int +xfs_trans_roll( + struct xfs_trans **tpp, + struct xfs_inode *dp) +{ + int committed = 0; + return __xfs_trans_roll(tpp, dp, &committed); +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f48e839334af..ba1660b502a5 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -225,6 +225,7 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, xfs_fsblock_t, xfs_extlen_t); int xfs_trans_commit(struct xfs_trans *); +int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); -- cgit v1.2.3 From 8d99fe92fed019e203f458370129fb28b3fb5740 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:51:16 +1000 Subject: xfs: fix efi/efd error handling to avoid fs shutdown hangs Freeing an extent in XFS involves logging an EFI (extent free intention), freeing the actual extent, and logging an EFD (extent free done). The EFI object is created with a reference count of 2: one for the current transaction and one for the subsequently created EFD. Under normal circumstances, the first reference is dropped when the EFI is unpinned and the second reference is dropped when the EFD is committed to the on-disk log. In event of errors or filesystem shutdown, there are various potential cleanup scenarios depending on the state of the EFI/EFD. The cleanup scenarios are confusing and racy, as demonstrated by the following test sequence: # mount $dev $mnt # fsstress -d $mnt -n 99999 -p 16 -z -f fallocate=1 \ -f punch=1 -f creat=1 -f unlink=1 & # sleep 5 # killall -9 fsstress; wait # godown -f $mnt # umount ... in which the final umount can hang due to the AIL being pinned indefinitely by one or more EFI items. This can occur due to several conditions. For example, if the shutdown occurs after the EFI is committed to the on-disk log and the EFD committed to the CIL, but before the EFD committed to the log, the EFD iop_committed() abort handler does not drop its reference to the EFI. Alternatively, manual error injection in the xfs_bmap_finish() codepath shows that if an error occurs after the EFI transaction is committed but before the EFD is constructed and logged, the EFI is never released from the AIL. Update the EFI/EFD item handling code to use a more straightforward and reliable approach to error handling. If an error occurs after the EFI transaction is committed and before the EFD is constructed, release the EFI explicitly from xfs_bmap_finish(). If the EFI transaction is cancelled, release the EFI in the unlock handler. Once the EFD is constructed, it is responsible for releasing the EFI under any circumstances (including whether the EFI item aborts due to log I/O error). Update the EFD item handlers to release the EFI if the transaction is cancelled or aborts due to log I/O error. Finally, update xfs_bmap_finish() to log at least one EFD extent to the transaction before xfs_free_extent() errors are handled to ensure the transaction is dirty and EFD item error handling is triggered. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 84 +++++++++++++++++++++++++++-------------------- fs/xfs/xfs_extfree_item.c | 69 ++++++++++++++++++++++---------------- fs/xfs/xfs_extfree_item.h | 25 ++++++++++++-- 3 files changed, 111 insertions(+), 67 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0f34886cf726..fa65f67737c5 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -67,16 +67,15 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) */ int /* error */ xfs_bmap_finish( - xfs_trans_t **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed) /* xact committed or not */ + struct xfs_trans **tp, /* transaction pointer addr */ + struct xfs_bmap_free *flist, /* i/o: list extents to free */ + int *committed)/* xact committed or not */ { - xfs_efd_log_item_t *efd; /* extent free data */ - xfs_efi_log_item_t *efi; /* extent free intention */ - int error; /* error return value */ - xfs_bmap_free_item_t *free; /* free extent item */ - xfs_mount_t *mp; /* filesystem mount structure */ - xfs_bmap_free_item_t *next; /* next item on free list */ + struct xfs_efd_log_item *efd; /* extent free data */ + struct xfs_efi_log_item *efi; /* extent free intention */ + int error; /* error return value */ + struct xfs_bmap_free_item *free; /* free extent item */ + struct xfs_bmap_free_item *next; /* next item on free list */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); if (flist->xbf_count == 0) { @@ -88,40 +87,55 @@ xfs_bmap_finish( xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); - error = xfs_trans_roll(tp, NULL); - *committed = 1; - /* - * We have a new transaction, so we should return committed=1, - * even though we're returning an error. - */ - if (error) + error = __xfs_trans_roll(tp, NULL, committed); + if (error) { + /* + * If the transaction was committed, drop the EFD reference + * since we're bailing out of here. The other reference is + * dropped when the EFI hits the AIL. + * + * If the transaction was not committed, the EFI is freed by the + * EFI item unlock handler on abort. Also, we have a new + * transaction so we should return committed=1 even though we're + * returning an error. + */ + if (*committed) { + xfs_efi_release(efi); + xfs_force_shutdown((*tp)->t_mountp, + (error == -EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : + SHUTDOWN_META_IO_ERROR); + } else { + *committed = 1; + } + return error; + } efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); for (free = flist->xbf_first; free != NULL; free = next) { next = free->xbfi_next; - if ((error = xfs_free_extent(*tp, free->xbfi_startblock, - free->xbfi_blockcount))) { - /* - * The bmap free list will be cleaned up at a - * higher level. The EFI will be canceled when - * this transaction is aborted. - * Need to force shutdown here to make sure it - * happens, since this transaction may not be - * dirty yet. - */ - mp = (*tp)->t_mountp; - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, - (error == -EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); - return error; - } + + /* + * Free the extent and log the EFD to dirty the transaction + * before handling errors. This ensures that the transaction is + * aborted, which: + * + * 1.) releases the EFI and frees the EFD + * 2.) shuts down the filesystem + * + * The bmap free list is cleaned up at a higher level. + */ + error = xfs_free_extent(*tp, free->xbfi_startblock, + free->xbfi_blockcount); xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock, - free->xbfi_blockcount); + free->xbfi_blockcount); + if (error) + return error; + xfs_bmap_del_free(flist, NULL, free); } + return 0; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6ff738fe331a..475b9f00ae23 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -61,9 +61,15 @@ __xfs_efi_release( if (atomic_dec_and_test(&efip->efi_refcount)) { spin_lock(&ailp->xa_lock); - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, &efip->efi_item, - SHUTDOWN_LOG_IO_ERROR); + /* + * We don't know whether the EFI made it to the AIL. Remove it + * if so. Note that xfs_trans_ail_delete() drops the AIL lock. + */ + if (efip->efi_item.li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, &efip->efi_item, + SHUTDOWN_LOG_IO_ERROR); + else + spin_unlock(&ailp->xa_lock); xfs_efi_item_free(efip); } } @@ -128,12 +134,12 @@ xfs_efi_item_pin( } /* - * While EFIs cannot really be pinned, the unpin operation is the last place at - * which the EFI is manipulated during a transaction. If we are being asked to - * remove the EFI it's because the transaction has been cancelled and by - * definition that means the EFI cannot be in the AIL so remove it from the - * transaction and free it. Otherwise coordinate with xfs_efi_release() - * to determine who gets to free the EFI. + * The unpin operation is the last place an EFI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the EFI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the EFI to either construct + * and commit the EFD or drop the EFD's reference in the event of error. Simply + * drop the log's EFI reference now that the log is done with it. */ STATIC void xfs_efi_item_unpin( @@ -141,14 +147,6 @@ xfs_efi_item_unpin( int remove) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - - if (remove) { - ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); - if (lip->li_desc) - xfs_trans_del_item(lip); - xfs_efi_item_free(efip); - return; - } xfs_efi_release(efip); } @@ -167,6 +165,11 @@ xfs_efi_item_push( return XFS_ITEM_PINNED; } +/* + * The EFI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an EFD isn't going to be + * constructed and thus we free the EFI here directly. + */ STATIC void xfs_efi_item_unlock( struct xfs_log_item *lip) @@ -412,20 +415,27 @@ xfs_efd_item_push( return XFS_ITEM_PINNED; } +/* + * The EFD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the EFI and free the EFD. + */ STATIC void xfs_efd_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) - xfs_efd_item_free(EFD_ITEM(lip)); + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_efi_release(efdp->efd_efip); + xfs_efd_item_free(efdp); + } } /* - * When the efd item is committed to disk, all we need to do - * is delete our reference to our partner efi item and then - * free ourselves. Since we're freeing ourselves we must - * return -1 to keep the transaction code from further referencing - * this item. + * When the efd item is committed to disk, all we need to do is delete our + * reference to our partner efi item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from further + * referencing this item. */ STATIC xfs_lsn_t xfs_efd_item_committed( @@ -435,13 +445,14 @@ xfs_efd_item_committed( struct xfs_efd_log_item *efdp = EFD_ITEM(lip); /* - * If we got a log I/O error, it's always the case that the LR with the - * EFI got unpinned and freed before the EFD got aborted. + * Drop the EFI reference regardless of whether the EFD has been + * aborted. Once the EFD transaction is constructed, it is the sole + * responsibility of the EFD to release the EFI (even if the EFI is + * aborted due to log I/O error). */ - if (!(lip->li_flags & XFS_LI_ABORTED)) - xfs_efi_release(efdp->efd_efip); - + xfs_efi_release(efdp->efd_efip); xfs_efd_item_free(efdp); + return (xfs_lsn_t)-1; } diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 399562eaf4f5..8fa8651705e1 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -39,9 +39,28 @@ struct kmem_zone; * "extent free done" log item described below. * * The EFI is reference counted so that it is not freed prior to both the EFI - * and EFD being committed and unpinned. This ensures that when the last - * reference goes away the EFI will always be in the AIL as it has been - * unpinned, regardless of whether the EFD is processed before or after the EFI. + * and EFD being committed and unpinned. This ensures the EFI is inserted into + * the AIL even in the event of out of order EFI/EFD processing. In other words, + * an EFI is born with two references: + * + * 1.) an EFI held reference to track EFI AIL insertion + * 2.) an EFD held reference to track EFD commit + * + * On allocation, both references are the responsibility of the caller. Once the + * EFI is added to and dirtied in a transaction, ownership of reference one + * transfers to the transaction. The reference is dropped once the EFI is + * inserted to the AIL or in the event of failure along the way (e.g., commit + * failure, log I/O error, etc.). Note that the caller remains responsible for + * the EFD reference under all circumstances to this point. The caller has no + * means to detect failure once the transaction is committed, however. + * Therefore, an EFD is required after this point, even in the event of + * unrelated failure. + * + * Once an EFD is allocated and dirtied in a transaction, reference two + * transfers to the transaction. The EFD reference is dropped once it reaches + * the unpin handler. Similar to the EFI, the reference also drops in the event + * of commit failure or log I/O errors. Note that the EFD is not inserted in the + * AIL, so at this point both the EFI and EFD are freed. */ typedef struct xfs_efi_log_item { xfs_log_item_t efi_item; -- cgit v1.2.3 From 6bc43af3d5f507254b8de2058ea51f6ec998ae52 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:51:43 +1000 Subject: xfs: ensure EFD trans aborts on log recovery extent free failure Log recovery attempts to free extents with leftover EFIs in the AIL after initial processing. If the extent free fails (e.g., due to unrelated fs corruption), the transaction is cancelled, though it might not be dirtied at the time. If this is the case, the EFD does not abort and thus does not release the EFI. This can lead to hangs as the EFI pins the AIL. Update xlog_recover_process_efi() to log the EFD in the transaction before xfs_free_extent() errors are handled to ensure the transaction is dirty, aborts the EFD and releases the EFI on error. Since this is a requirement for EFD processing (and consistent with xfs_bmap_finish()), update the EFD logging helper to do the extent free and unconditionally log the EFD. This encodes the required EFD logging behavior into the helper and reduces the likelihood of errors down the road. [dchinner: re-add xfs_alloc.h to xfs_log_recover.c to fix build failure.] Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 21 +++++++-------------- fs/xfs/xfs_log_recover.c | 6 +++--- fs/xfs/xfs_trans.h | 7 +++---- fs/xfs/xfs_trans_extfree.c | 32 +++++++++++++++++++++++--------- 4 files changed, 36 insertions(+), 30 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fa65f67737c5..73aab0d8d25c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -112,24 +112,17 @@ xfs_bmap_finish( return error; } + /* + * Get an EFD and free each extent in the list, logging to the EFD in + * the process. The remaining bmap free list is cleaned up by the caller + * on error. + */ efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); for (free = flist->xbf_first; free != NULL; free = next) { next = free->xbfi_next; - /* - * Free the extent and log the EFD to dirty the transaction - * before handling errors. This ensures that the transaction is - * aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - * - * The bmap free list is cleaned up at a higher level. - */ - error = xfs_free_extent(*tp, free->xbfi_startblock, - free->xbfi_blockcount); - xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock, - free->xbfi_blockcount); + error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock, + free->xbfi_blockcount); if (error) return error; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 578bc2d7bac8..434ba2cdf6e8 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3752,11 +3752,11 @@ xlog_recover_process_efi( for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &(efip->efi_format.efi_extents[i]); - error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); + error = xfs_trans_free_extent(tp, efdp, extp->ext_start, + extp->ext_len); if (error) goto abort_error; - xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, - extp->ext_len); + } set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index ba1660b502a5..4643070d7cae 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -220,10 +220,9 @@ void xfs_trans_log_efi_extent(xfs_trans_t *, struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *, struct xfs_efi_log_item *, uint); -void xfs_trans_log_efd_extent(xfs_trans_t *, - struct xfs_efd_log_item *, - xfs_fsblock_t, - xfs_extlen_t); +int xfs_trans_free_extent(struct xfs_trans *, + struct xfs_efd_log_item *, xfs_fsblock_t, + xfs_extlen_t); int xfs_trans_commit(struct xfs_trans *); int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 284397dd7990..a96ae540eb62 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -25,6 +25,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_extfree_item.h" +#include "xfs_alloc.h" /* * This routine is called to allocate an "extent free intention" @@ -108,19 +109,30 @@ xfs_trans_get_efd(xfs_trans_t *tp, } /* - * This routine is called to indicate that the described - * extent is to be logged as having been freed. It should - * be called once for each extent freed. + * Free an extent and log it to the EFD. Note that the transaction is marked + * dirty regardless of whether the extent free succeeds or fails to support the + * EFI/EFD lifecycle rules. */ -void -xfs_trans_log_efd_extent(xfs_trans_t *tp, - xfs_efd_log_item_t *efdp, - xfs_fsblock_t start_block, - xfs_extlen_t ext_len) +int +xfs_trans_free_extent( + struct xfs_trans *tp, + struct xfs_efd_log_item *efdp, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len) { uint next_extent; - xfs_extent_t *extp; + struct xfs_extent *extp; + int error; + error = xfs_free_extent(tp, start_block, ext_len); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the EFI and frees the EFD + * 2.) shuts down the filesystem + */ tp->t_flags |= XFS_TRANS_DIRTY; efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY; @@ -130,4 +142,6 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp, extp->ext_start = start_block; extp->ext_len = ext_len; efdp->efd_next_extent++; + + return error; } -- cgit v1.2.3 From e32a1d1fbf6eb2bdc24aa0502e827ff4d2234604 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:52:21 +1000 Subject: xfs: use EFI refcount consistently in log recovery The EFI is initialized with a reference count of 2. One for the EFI to ensure the item makes it to the AIL and one for the subsequently created EFD to release the EFI once the EFD is committed. Log recovery uses the EFI in a similar manner, but implements a hack to remove both references in one call once the EFD is handled. Update log recovery to use EFI reference counting in a manner consistent with the log. When an EFI is encountered during recovery, an EFI item is allocated and inserted to the AIL directly. Since the EFI reference is typically dropped when the EFI is unpinned and this is analogous with AIL insertion, drop the EFI reference at this point. When a corresponding EFD is encountered in the log, this indicates that the extents were freed, no processing is required and the EFI can be dropped. Update xlog_recover_efd_pass2() to simply drop the EFD reference at this point rather than open code the AIL removal and EFI free. Remaining EFIs (i.e., with no corresponding EFD) are processed in xlog_recover_finish(). An EFD transaction is allocated and the extents are freed, which transfers ownership of the EFI reference to the EFD item in the log. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_extfree_item.c | 57 +++++++++++++++++------------------------------ fs/xfs/xfs_log_recover.c | 43 ++++++++++++++++++----------------- 2 files changed, 43 insertions(+), 57 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 475b9f00ae23..ce1d4fb39c4d 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -46,34 +46,6 @@ xfs_efi_item_free( kmem_zone_free(xfs_efi_zone, efip); } -/* - * Freeing the efi requires that we remove it from the AIL if it has already - * been placed there. However, the EFI may not yet have been placed in the AIL - * when called by xfs_efi_release() from EFD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the EFI. - */ -STATIC void -__xfs_efi_release( - struct xfs_efi_log_item *efip) -{ - struct xfs_ail *ailp = efip->efi_item.li_ailp; - - if (atomic_dec_and_test(&efip->efi_refcount)) { - spin_lock(&ailp->xa_lock); - /* - * We don't know whether the EFI made it to the AIL. Remove it - * if so. Note that xfs_trans_ail_delete() drops the AIL lock. - */ - if (efip->efi_item.li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(ailp, &efip->efi_item, - SHUTDOWN_LOG_IO_ERROR); - else - spin_unlock(&ailp->xa_lock); - xfs_efi_item_free(efip); - } -} - /* * This returns the number of iovecs needed to log the given efi item. * We only need 1 iovec for an efi item. It just logs the efi_log_format @@ -304,21 +276,32 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } /* - * This is called by the efd item code below to release references to the given - * efi item. Each efd calls this with the number of extents that it has - * logged, and when the sum of these reaches the total number of extents logged - * by this efi item we can free the efi item. + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. */ void xfs_efi_release( struct xfs_efi_log_item *efip) { - /* recovery needs us to drop the EFI reference, too */ - if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) - __xfs_efi_release(efip); + struct xfs_ail *ailp = efip->efi_item.li_ailp; - __xfs_efi_release(efip); - /* efip may now have been freed, do not reference it again. */ + if (atomic_dec_and_test(&efip->efi_refcount)) { + spin_lock(&ailp->xa_lock); + /* + * We don't know whether the EFI made it to the AIL. Remove it + * if so. Note that xfs_trans_ail_delete() drops the AIL lock. + */ + if (efip->efi_item.li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, &efip->efi_item, + SHUTDOWN_LOG_IO_ERROR); + else + spin_unlock(&ailp->xa_lock); + + xfs_efi_item_free(efip); + } } static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 434ba2cdf6e8..05c0cc83f9a4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2928,16 +2928,16 @@ xlog_recover_efi_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; - xfs_mount_t *mp = log->l_mp; - xfs_efi_log_item_t *efip; - xfs_efi_log_format_t *efi_formatp; + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_efi_log_item *efip; + struct xfs_efi_log_format *efi_formatp; efi_formatp = item->ri_buf[0].i_addr; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); - if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), - &(efip->efi_format)))) { + error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); + if (error) { xfs_efi_item_free(efip); return error; } @@ -2945,20 +2945,23 @@ xlog_recover_efi_pass2( spin_lock(&log->l_ailp->xa_lock); /* - * xfs_trans_ail_update() drops the AIL lock. + * The EFI has two references. One for the EFD and one for EFI to ensure + * it makes it into the AIL. Insert the EFI into the AIL directly and + * drop the EFI reference. Note that xfs_trans_ail_update() drops the + * AIL lock. */ xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); + xfs_efi_release(efip); return 0; } /* - * This routine is called when an efd format structure is found in - * a committed transaction in the log. It's purpose is to cancel - * the corresponding efi if it was still in the log. To do this - * it searches the AIL for the efi with an id equal to that in the - * efd format structure. If we find it, we remove the efi from the - * AIL and free it. + * This routine is called when an EFD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding EFI if it + * was still in the log. To do this it searches the AIL for the EFI with an id + * equal to that in the EFD format structure. If we find it we drop the EFD + * reference, which removes the EFI from the AIL and frees it. */ STATIC int xlog_recover_efd_pass2( @@ -2980,8 +2983,8 @@ xlog_recover_efd_pass2( efi_id = efd_formatp->efd_efi_id; /* - * Search for the efi with the id in the efd format structure - * in the AIL. + * Search for the EFI with the id in the EFD format structure in the + * AIL. */ spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); @@ -2990,18 +2993,18 @@ xlog_recover_efd_pass2( efip = (xfs_efi_log_item_t *)lip; if (efip->efi_format.efi_id == efi_id) { /* - * xfs_trans_ail_delete() drops the - * AIL lock. + * Drop the EFD reference to the EFI. This + * removes the EFI from the AIL and frees it. */ - xfs_trans_ail_delete(ailp, lip, - SHUTDOWN_CORRUPT_INCORE); - xfs_efi_item_free(efip); + spin_unlock(&ailp->xa_lock); + xfs_efi_release(efip); spin_lock(&ailp->xa_lock); break; } } lip = xfs_trans_ail_cursor_next(ailp, &cur); } + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); -- cgit v1.2.3 From f0b2efad16e78623b5a156f6e4e9166907b83155 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:58:36 +1000 Subject: xfs: don't leave EFIs on AIL on mount failure Log recovery occurs in two phases at mount time. In the first phase, EFIs and EFDs are processed and potentially cancelled out. EFIs without EFD objects are inserted into the AIL for processing and recovery in the second phase. xfs_mountfs() runs various other operations between the phases and is thus subject to failure. If failure occurs after the first phase but before the second, pending EFIs sit on the AIL, pin it and cause the mount to hang. Update the mount sequence to ensure that pending EFIs are cancelled in the event of failure. Add a recovery cancellation mechanism to iterate the AIL and cancel all EFI items when requested. Plumb cancellation support through the log mount finish helper and update xfs_mountfs() to invoke cancellation in the event of failure after recovery has started. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 30 ++++++++++++++++++----- fs/xfs/xfs_log.h | 1 + fs/xfs/xfs_log_priv.h | 2 ++ fs/xfs/xfs_log_recover.c | 63 +++++++++++++++++++++++++++++++++++++++++++++--- fs/xfs/xfs_mount.c | 26 +++++++++++--------- 5 files changed, 100 insertions(+), 22 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 08d4fe46f0fa..7ce278d66577 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -700,6 +700,7 @@ xfs_log_mount( if (error) { xfs_warn(mp, "log mount/recovery failed: error %d", error); + xlog_recover_cancel(mp->m_log); goto out_destroy_ail; } } @@ -740,18 +741,35 @@ out: * it. */ int -xfs_log_mount_finish(xfs_mount_t *mp) +xfs_log_mount_finish( + struct xfs_mount *mp) { int error = 0; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { - error = xlog_recover_finish(mp->m_log); - if (!error) - xfs_log_work_queue(mp); - } else { + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + return 0; } + error = xlog_recover_finish(mp->m_log); + if (!error) + xfs_log_work_queue(mp); + + return error; +} + +/* + * The mount has failed. Cancel the recovery if it hasn't completed and destroy + * the log. + */ +int +xfs_log_mount_cancel( + struct xfs_mount *mp) +{ + int error; + + error = xlog_recover_cancel(mp->m_log); + xfs_log_unmount(mp); return error; } diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index fa27aaec72cb..09d91d3166cd 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -147,6 +147,7 @@ int xfs_log_mount(struct xfs_mount *mp, xfs_daddr_t start_block, int num_bblocks); int xfs_log_mount_finish(struct xfs_mount *mp); +int xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 1c87c8abfbed..950f3f94720c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -426,6 +426,8 @@ xlog_recover( extern int xlog_recover_finish( struct xlog *log); +extern int +xlog_recover_cancel(struct xlog *); extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 05c0cc83f9a4..fd1ae47de511 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3791,10 +3791,10 @@ abort_error: */ STATIC int xlog_recover_process_efis( - struct xlog *log) + struct xlog *log) { - xfs_log_item_t *lip; - xfs_efi_log_item_t *efip; + struct xfs_log_item *lip; + struct xfs_efi_log_item *efip; int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; @@ -3818,7 +3818,7 @@ xlog_recover_process_efis( /* * Skip EFIs that we've already processed. */ - efip = (xfs_efi_log_item_t *)lip; + efip = container_of(lip, struct xfs_efi_log_item, efi_item); if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { lip = xfs_trans_ail_cursor_next(ailp, &cur); continue; @@ -3837,6 +3837,50 @@ out: return error; } +/* + * A cancel occurs when the mount has failed and we're bailing out. Release all + * pending EFIs so they don't pin the AIL. + */ +STATIC int +xlog_recover_cancel_efis( + struct xlog *log) +{ + struct xfs_log_item *lip; + struct xfs_efi_log_item *efip; + int error = 0; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp; + + ailp = log->l_ailp; + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + /* + * We're done when we see something other than an EFI. + * There should be no EFIs left in the AIL now. + */ + if (lip->li_type != XFS_LI_EFI) { +#ifdef DEBUG + for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) + ASSERT(lip->li_type != XFS_LI_EFI); +#endif + break; + } + + efip = container_of(lip, struct xfs_efi_log_item, efi_item); + + spin_unlock(&ailp->xa_lock); + xfs_efi_release(efip); + spin_lock(&ailp->xa_lock); + + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + return error; +} + /* * This routine performs a transaction to null out a bad inode pointer * in an agi unlinked inode hash bucket. @@ -4610,6 +4654,17 @@ xlog_recover_finish( return 0; } +int +xlog_recover_cancel( + struct xlog *log) +{ + int error = 0; + + if (log->l_flags & XLOG_RECOVERY_NEEDED) + error = xlog_recover_cancel_efis(log); + + return error; +} #if defined(DEBUG) /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 461e791efad7..4825a8a0a506 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -615,14 +615,14 @@ xfs_default_resblks(xfs_mount_t *mp) */ int xfs_mountfs( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_sb_t *sbp = &(mp->m_sb); - xfs_inode_t *rip; - __uint64_t resblks; - uint quotamount = 0; - uint quotaflags = 0; - int error = 0; + struct xfs_sb *sbp = &(mp->m_sb); + struct xfs_inode *rip; + __uint64_t resblks; + uint quotamount = 0; + uint quotaflags = 0; + int error = 0; xfs_sb_mount_common(mp, sbp); @@ -799,7 +799,9 @@ xfs_mountfs( } /* - * log's mount-time initialization. Perform 1st part recovery if needed + * Log's mount-time initialization. The first part of recovery can place + * some items on the AIL, to be handled when recovery is finished or + * cancelled. */ error = xfs_log_mount(mp, mp->m_logdev_targp, XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), @@ -910,9 +912,9 @@ xfs_mountfs( } /* - * Finish recovering the file system. This part needed to be - * delayed until after the root and real-time bitmap inodes - * were consistently read in. + * Finish recovering the file system. This part needed to be delayed + * until after the root and real-time bitmap inodes were consistently + * read in. */ error = xfs_log_mount_finish(mp); if (error) { @@ -956,7 +958,7 @@ xfs_mountfs( out_rele_rip: IRELE(rip); out_log_dealloc: - xfs_log_unmount(mp); + xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_wait_buftarg(mp->m_logdev_targp); -- cgit v1.2.3 From 78d57e4593bf700e1a4447e3a7769da8dd0e0844 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:58:48 +1000 Subject: xfs: icreate log item recovery and cancellation tracepoints Various log items have recovery tracepoints to identify whether a particular log item is recovered or cancelled. Add the equivalent tracepoints for the icreate transaction. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 5 ++++- fs/xfs/xfs_trace.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index fd1ae47de511..0c6641b39e2a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3100,9 +3100,12 @@ xlog_recover_do_icreate_pass2( * done easily. */ if (xlog_check_buffer_cancelled(log, - XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) + XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) { + trace_xfs_log_recover_icreate_cancel(log, icl); return 0; + } + trace_xfs_log_recover_icreate_recover(log, icl); xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length, be32_to_cpu(icl->icl_gen)); return 0; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8d916d33d93d..9aeeb21bc3d0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2089,6 +2089,40 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); +DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class, + TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), + TP_ARGS(log, in_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(unsigned int, count) + __field(unsigned int, isize) + __field(xfs_agblock_t, length) + __field(unsigned int, gen) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->agno = be32_to_cpu(in_f->icl_ag); + __entry->agbno = be32_to_cpu(in_f->icl_agbno); + __entry->count = be32_to_cpu(in_f->icl_count); + __entry->isize = be32_to_cpu(in_f->icl_isize); + __entry->length = be32_to_cpu(in_f->icl_length); + __entry->gen = be32_to_cpu(in_f->icl_gen); + ), + TP_printk("dev %d:%d agno %u agbno %u count %u isize %u length %u " + "gen %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, __entry->agbno, __entry->count, __entry->isize, + __entry->length, __entry->gen) +) +#define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \ + TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), \ + TP_ARGS(log, in_f)) + +DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_cancel); +DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_recover); + DECLARE_EVENT_CLASS(xfs_discard_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len), -- cgit v1.2.3 From fc0d1656964fc53fca84549df5a6bd4a16a29cdf Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:59:38 +1000 Subject: xfs: fix broken icreate log item cancellation Inode cluster buffers are invalidated and cancelled when inode chunks are freed to notify log recovery that previous logged updates to the metadata buffer should be skipped. This ensures that log recovery does not overwrite buffers that might have already been reused. On v4 filesystems, inode chunk allocation and inode updates are logged via the cluster buffers and thus cancellation is easily detected via buffer cancellation items. v5 filesystems use the new icreate transaction, which uses logical logging and ordered buffers to log a full inode chunk allocation at once. The resulting icreate item often spans multiple inode cluster buffers. Log recovery checks for cancelled buffers when processing icreate log items, but it has a couple problems. First, it uses the full length of the inode chunk rather than the cluster size. Second, it uses the length in FSB units rather than BB units. Either of these problems prevent icreate recovery from identifying cancelled buffers and thus inode initialization proceeds unconditionally. Update xlog_recover_do_icreate_pass2() to iterate the icreate range in cluster sized increments and check each increment for cancellation. Since icreate is currently only used for the minimum atomic inode chunk allocation, we expect that either all or none of the buffers will be cancelled. Cancel the icreate if at least one buffer is cancelled to avoid making a bad situation worse by initializing a partial inode chunk, but detect such anomalies and warn the user. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 49 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 12 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 0c6641b39e2a..2fa55e1c2b73 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3032,6 +3032,11 @@ xlog_recover_do_icreate_pass2( unsigned int count; unsigned int isize; xfs_agblock_t length; + int blks_per_cluster; + int bb_per_cluster; + int cancel_count; + int nbufs; + int i; icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; if (icl->icl_type != XFS_LI_ICREATE) { @@ -3090,25 +3095,45 @@ xlog_recover_do_icreate_pass2( } /* - * Inode buffers can be freed. Do not replay the inode initialisation as - * we could be overwriting something written after this inode buffer was - * cancelled. + * The icreate transaction can cover multiple cluster buffers and these + * buffers could have been freed and reused. Check the individual + * buffers for cancellation so we don't overwrite anything written after + * a cancellation. + */ + blks_per_cluster = xfs_icluster_size_fsb(mp); + bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster); + nbufs = length / blks_per_cluster; + for (i = 0, cancel_count = 0; i < nbufs; i++) { + xfs_daddr_t daddr; + + daddr = XFS_AGB_TO_DADDR(mp, agno, + agbno + i * blks_per_cluster); + if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0)) + cancel_count++; + } + + /* + * We currently only use icreate for a single allocation at a time. This + * means we should expect either all or none of the buffers to be + * cancelled. Be conservative and skip replay if at least one buffer is + * cancelled, but warn the user that something is awry if the buffers + * are not consistent. * - * XXX: we need to iterate all buffers and only init those that are not - * cancelled. I think that a more fine grained factoring of - * xfs_ialloc_inode_init may be appropriate here to enable this to be - * done easily. + * XXX: This must be refined to only skip cancelled clusters once we use + * icreate for multiple chunk allocations. */ - if (xlog_check_buffer_cancelled(log, - XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) { + ASSERT(!cancel_count || cancel_count == nbufs); + if (cancel_count) { + if (cancel_count != nbufs) + xfs_warn(mp, + "WARNING: partial inode chunk cancellation, skipped icreate."); trace_xfs_log_recover_icreate_cancel(log, icl); return 0; } trace_xfs_log_recover_icreate_recover(log, icl); - xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length, - be32_to_cpu(icl->icl_gen)); - return 0; + return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, + length, be32_to_cpu(icl->icl_gen)); } STATIC void -- cgit v1.2.3 From a3f20014659a1566a4e516e2bf95287960fe2c44 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:59:50 +1000 Subject: xfs: checksum log record ext headers based on record size The first 4 bytes of every basic block in the physical log is stamped with the current lsn. To support this mechanism, the log record header (first block of each new log record) contains space for the original first byte of each log record block before it is replaced with the lsn. The log record header has space for 32k worth of blocks. The version 2 log adds new extended record headers for each additional 32k worth of blocks beyond what is supported by the record header. The log record checksum incorporates the log record header, the extended headers and the record payload. xlog_cksum() checksums the extended headers based on log->l_iclog_heads, which specifies the number of extended headers in a log record based on the log buffer size mount option. The log buffer size is variable, however, and thus means the checksum can be calculated differently based on how a filesystem is mounted. This is problematic if a filesystem crashes and recovery occurs on a subsequent mount using a different log buffer size. For example, crash an active filesystem that is mounted with the default (32k) logbsize, attempt remount/recovery using '-o logbsize=64k' and the mount fails on or warns about log checksum failures. To avoid this problem, update xlog_cksum() to calculate the checksum based on the size of the log buffer according to the log record. The size is already included in the h_size field of the log record header and thus is available at log recovery time. Extended log record headers are also only written when the log record is large enough to require them. This makes checksum calculation of log records consistent with the extended record header mechanism as well as how on-disk records are checksummed with various log buffer size mount options. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7ce278d66577..a98daa68045d 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1670,8 +1670,13 @@ xlog_cksum( if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; int i; + int xheads; - for (i = 1; i < log->l_iclog_heads; i++) { + xheads = size / XLOG_HEADER_CYCLE_SIZE; + if (size % XLOG_HEADER_CYCLE_SIZE) + xheads++; + + for (i = 1; i < xheads; i++) { crc = crc32c(crc, &xhdr[i].hic_xheader, sizeof(struct xlog_rec_ext_header)); } -- cgit v1.2.3 From 0ae120f8a81a8dc4f974d0819c97b58c4fa935ac Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 10:00:28 +1000 Subject: xfs: clean up root inode properly on mount failure The root inode is read as part of the xfs_mountfs() sequence and the reference is dropped in the event of failure after we grab the inode. The reference drop doesn't necessarily free the inode, however. It marks it for reclaim and potentially kicks off the reclaim workqueue. The workqueue is destroyed further up the error path, which means we are subject to crash if the workqueue job runs after this point or a memory leak which is identified if the xfs_inode_zone is destroyed (e.g., on module removal). Both of these outcomes are reproducible via manual instrumentation of a mount error after the root inode xfs_iget() call in xfs_mountfs(). Update the xfs_mountfs() error path to cancel any potential reclaim work items and to run a synchronous inode reclaim if the root inode is marked for reclaim. This ensures that no jobs remain on the queue before it is destroyed and that the root inode is freed before the reclaim mechanism is torn down. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_mount.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 4825a8a0a506..bf92e0c037c7 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -957,6 +957,8 @@ xfs_mountfs( xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp, SYNC_WAIT); out_log_dealloc: xfs_log_mount_cancel(mp); out_fail_wait: -- cgit v1.2.3 From f307080a626569f89bc8fbad9f936b307aded877 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 10:00:53 +1000 Subject: xfs: fix btree cursor error cleanups The btree cursor cleanup function takes an error parameter that affects how buffers are released from the cursor. All buffers are released in the event of error. Several callers do not specify the XFS_BTREE_ERROR flag in the event of error, however. This can cause buffers to hang around locked or with an elevated hold count and thus lead to umount hangs in the event of errors. Fix up the xfs_btree_del_cursor() callers to pass XFS_BTREE_ERROR if the cursor is being torn down due to error. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_itable.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 66efc702452a..0b29918291ff 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2232,7 +2232,7 @@ xfs_imap_lookup( } xfs_trans_brelse(tp, agbp); - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); if (error) return error; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f41b0c3fddab..930ebd86beba 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -473,7 +473,8 @@ xfs_bulkstat( * pending error, then we are done. */ del_cursor: - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, error ? + XFS_BTREE_ERROR : XFS_BTREE_NOERROR); xfs_buf_relse(agbp); if (error) break; -- cgit v1.2.3 From 146e54b71ea4b998d65c25964807ff6792bbf436 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 10:01:08 +1000 Subject: xfs: add helper to conditionally remove items from the AIL Several areas of code duplicate a pattern where we take the AIL lock, check whether an item is in the AIL and remove it if so. Create a new helper for this pattern and use it where appropriate. Signed-off-by: Brian Foster --- fs/xfs/xfs_buf_item.c | 6 +----- fs/xfs/xfs_dquot.c | 8 ++------ fs/xfs/xfs_extfree_item.c | 14 +------------- fs/xfs/xfs_inode_item.c | 11 ++--------- fs/xfs/xfs_trans_priv.h | 15 +++++++++++++++ 5 files changed, 21 insertions(+), 33 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 092d652bc03d..919057e0a45b 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -647,11 +647,7 @@ xfs_buf_item_unlock( xfs_buf_item_relse(bp); else if (aborted) { ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); - if (lip->li_flags & XFS_LI_IN_AIL) { - spin_lock(&lip->li_ailp->xa_lock); - xfs_trans_ail_delete(lip->li_ailp, lip, - SHUTDOWN_LOG_IO_ERROR); - } + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); } } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 4143dc75dca4..6964d7ceba96 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -954,12 +954,8 @@ xfs_qm_dqflush( struct xfs_log_item *lip = &dqp->q_logitem.qli_item; dqp->dq_flags &= ~XFS_DQ_DIRTY; - spin_lock(&mp->m_ail->xa_lock); - if (lip->li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(mp->m_ail, lip, - SHUTDOWN_CORRUPT_INCORE); - else - spin_unlock(&mp->m_ail->xa_lock); + xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE); + error = -EIO; goto out_unlock; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index ce1d4fb39c4d..4aa0153214f9 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -286,20 +286,8 @@ void xfs_efi_release( struct xfs_efi_log_item *efip) { - struct xfs_ail *ailp = efip->efi_item.li_ailp; - if (atomic_dec_and_test(&efip->efi_refcount)) { - spin_lock(&ailp->xa_lock); - /* - * We don't know whether the EFI made it to the AIL. Remove it - * if so. Note that xfs_trans_ail_delete() drops the AIL lock. - */ - if (efip->efi_item.li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(ailp, &efip->efi_item, - SHUTDOWN_LOG_IO_ERROR); - else - spin_unlock(&ailp->xa_lock); - + xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); xfs_efi_item_free(efip); } } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index bf13a5a7e2f4..62bd80f4edd9 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -703,17 +703,10 @@ xfs_iflush_abort( xfs_inode_log_item_t *iip = ip->i_itemp; if (iip) { - struct xfs_ail *ailp = iip->ili_item.li_ailp; if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { - spin_lock(&ailp->xa_lock); - if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, &iip->ili_item, - stale ? - SHUTDOWN_LOG_IO_ERROR : + xfs_trans_ail_remove(&iip->ili_item, + stale ? SHUTDOWN_LOG_IO_ERROR : SHUTDOWN_CORRUPT_INCORE); - } else - spin_unlock(&ailp->xa_lock); } iip->ili_logged = 0; /* diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 1b736294558a..49931b72da8a 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -119,6 +119,21 @@ xfs_trans_ail_delete( xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); } +static inline void +xfs_trans_ail_remove( + struct xfs_log_item *lip, + int shutdown_type) +{ + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_delete() drops the AIL lock */ + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, lip, shutdown_type); + else + spin_unlock(&ailp->xa_lock); +} + void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_ail_push_all(struct xfs_ail *); void xfs_ail_push_all_sync(struct xfs_ail *); -- cgit v1.2.3 From d4a97a04227d5ba91b91888a016e2300861cfbc7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 10:01:40 +1000 Subject: xfs: add missing bmap cancel calls in error paths If a failure occurs after the bmap free list is populated and before xfs_bmap_finish() completes successfully (which returns a partial list on failure), the bmap free list must be cancelled. Otherwise, the extent items on the list are never freed and a memory leak occurs. Several random error paths throughout the code suffer this problem. Fix these up such that xfs_bmap_cancel() is always called on error. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 1 + fs/xfs/xfs_bmap_util.c | 10 +++++---- fs/xfs/xfs_inode.c | 9 ++++---- fs/xfs/xfs_rtalloc.c | 57 ++++++++++++++++++++++++------------------------ 4 files changed, 41 insertions(+), 36 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 63e05b663380..8e2010d53b07 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5945,6 +5945,7 @@ xfs_bmap_split_extent( return xfs_trans_commit(tp); out: + xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 73aab0d8d25c..3bf4ad0d19e4 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1474,7 +1474,7 @@ xfs_shift_file_space( XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, XFS_QMOPT_RES_REGBLKS); if (error) - goto out; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -1488,18 +1488,20 @@ xfs_shift_file_space( &done, stop_fsb, &first_block, &free_list, direction, XFS_BMAP_MAX_SHIFT_EXTENTS); if (error) - goto out; + goto out_bmap_cancel; error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) - goto out; + goto out_bmap_cancel; error = xfs_trans_commit(tp); } return error; -out: +out_bmap_cancel: + xfs_bmap_cancel(&free_list); +out_trans_cancel: xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3da9f4da4f3d..cee2f69d2469 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1791,14 +1791,15 @@ xfs_inactive_ifree( xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); /* - * Just ignore errors at this point. There is nothing we can - * do except to try to keep going. Make sure it's not a silent - * error. + * Just ignore errors at this point. There is nothing we can do except + * to try to keep going. Make sure it's not a silent error. */ error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) + if (error) { xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", __func__, error); + xfs_bmap_cancel(&free_list); + } error = xfs_trans_commit(tp); if (error) xfs_notice(mp, "%s: xfs_trans_commit returned error %d", diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index f4e8c06eee26..ab1bac6a3a1c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -757,31 +757,30 @@ xfs_rtallocate_extent_size( /* * Allocate space to the bitmap or summary file, and zero it, for growfs. */ -STATIC int /* error */ +STATIC int xfs_growfs_rt_alloc( - xfs_mount_t *mp, /* file system mount point */ - xfs_extlen_t oblocks, /* old count of blocks */ - xfs_extlen_t nblocks, /* new count of blocks */ - xfs_inode_t *ip) /* inode (bitmap/summary) */ + struct xfs_mount *mp, /* file system mount point */ + xfs_extlen_t oblocks, /* old count of blocks */ + xfs_extlen_t nblocks, /* new count of blocks */ + struct xfs_inode *ip) /* inode (bitmap/summary) */ { - xfs_fileoff_t bno; /* block number in file */ - xfs_buf_t *bp; /* temporary buffer for zeroing */ - int committed; /* transaction committed flag */ - xfs_daddr_t d; /* disk block address */ - int error; /* error return value */ - xfs_fsblock_t firstblock; /* first block allocated in xaction */ - xfs_bmap_free_t flist; /* list of freed blocks */ - xfs_fsblock_t fsbno; /* filesystem block for bno */ - xfs_bmbt_irec_t map; /* block map output */ - int nmap; /* number of block maps */ - int resblks; /* space reservation */ + xfs_fileoff_t bno; /* block number in file */ + struct xfs_buf *bp; /* temporary buffer for zeroing */ + int committed; /* transaction committed flag */ + xfs_daddr_t d; /* disk block address */ + int error; /* error return value */ + xfs_fsblock_t firstblock;/* first block allocated in xaction */ + struct xfs_bmap_free flist; /* list of freed blocks */ + xfs_fsblock_t fsbno; /* filesystem block for bno */ + struct xfs_bmbt_irec map; /* block map output */ + int nmap; /* number of block maps */ + int resblks; /* space reservation */ + struct xfs_trans *tp; /* * Allocate space to the file, as necessary. */ while (oblocks < nblocks) { - xfs_trans_t *tp; - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); /* @@ -790,7 +789,7 @@ xfs_growfs_rt_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, resblks, 0); if (error) - goto error_cancel; + goto out_trans_cancel; /* * Lock the inode. */ @@ -808,16 +807,16 @@ xfs_growfs_rt_alloc( if (!error && nmap < 1) error = -ENOSPC; if (error) - goto error_cancel; + goto out_bmap_cancel; /* * Free any blocks freed up in the transaction, then commit. */ error = xfs_bmap_finish(&tp, &flist, &committed); if (error) - goto error_cancel; + goto out_bmap_cancel; error = xfs_trans_commit(tp); if (error) - goto error; + return error; /* * Now we need to clear the allocated blocks. * Do this one block per transaction, to keep it simple. @@ -832,7 +831,7 @@ xfs_growfs_rt_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, 0, 0); if (error) - goto error_cancel; + goto out_trans_cancel; /* * Lock the bitmap inode. */ @@ -846,9 +845,7 @@ xfs_growfs_rt_alloc( mp->m_bsize, 0); if (bp == NULL) { error = -EIO; -error_cancel: - xfs_trans_cancel(tp); - goto error; + goto out_trans_cancel; } memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); @@ -857,16 +854,20 @@ error_cancel: */ error = xfs_trans_commit(tp); if (error) - goto error; + return error; } /* * Go on to the next extent, if any. */ oblocks = map.br_startoff + map.br_blockcount; } + return 0; -error: +out_bmap_cancel: + xfs_bmap_cancel(&flist); +out_trans_cancel: + xfs_trans_cancel(tp); return error; } -- cgit v1.2.3 From c400ee3ed1b13d45adde68e12254dc6ab6977b59 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 19 Aug 2015 10:30:48 +1000 Subject: xfs: set XFS_DA_OP_OKNOENT in xfs_attr_get It's entirely possible for userspace to ask for an xattr which does not exist. Normally, there is no problem whatsoever when we ask for such a thing, but when we look at an obfuscated metadump image on a debug kernel with selinux, we trip over this ASSERT in xfs_da3_path_shift(): *result = -ENOENT; /* we're out of our tree */ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); It (more or less) only shows up in the above scenario, because xfs_metadump obfuscates attr names, but chooses names which keep the same hash value - and xfs_da3_node_lookup_int does: if (((retval == -ENOENT) || (retval == -ENOATTR)) && (blk->hashval == args->hashval)) { error = xfs_da3_path_shift(state, &state->path, 1, 1, &retval); IOWS, we only get down to the xfs_da3_path_shift() ASSERT if we are looking for an xattr which doesn't exist, but we find xattrs on disk which have the same hash, and so might be a hash collision, so we try the path shift. When *that* fails to find what we're looking for, we hit the assert about XFS_DA_OP_OKNOENT. Simply setting XFS_DA_OP_OKNOENT in xfs_attr_get solves this rather corner-case problem with no ill side effects. It's fine for an attr name lookup to fail. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 3349c9a1e845..ff065578969f 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -139,6 +139,8 @@ xfs_attr_get( args.value = value; args.valuelen = *valuelenp; + /* Entirely possible to look up a name which doesn't exist */ + args.op_flags = XFS_DA_OP_OKNOENT; lock_mode = xfs_ilock_attr_map_shared(ip); if (!xfs_inode_hasattr(ip)) -- cgit v1.2.3 From bbf155add09e1f0179eca89e0999cf28d335ca0a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 19 Aug 2015 10:31:18 +1000 Subject: xfs: fix sb_meta_uuid usage After changing the UUID on a v5 filesystem, xfstests fails immediately on a debug kernel with: XFS: Assertion failed: uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid), file: fs/xfs/xfs_inode.c, line: 799 This needs to check against the sb_meta_uuid, not the user visible UUID that was changed. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3da9f4da4f3d..3e343c7f347a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -787,7 +787,7 @@ xfs_ialloc( if (ip->i_d.di_version == 3) { ASSERT(ip->i_d.di_ino == ino); - ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid)); + ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid)); ip->i_d.di_crc = 0; ip->i_d.di_changecount = 1; ip->i_d.di_lsn = 0; -- cgit v1.2.3 From ac383de20d468a75e7023caf06dcf6606cd85220 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 19 Aug 2015 10:31:41 +1000 Subject: xfs: growfs not aware of sb_meta_uuid Adding this simple change to xfstests:common/rc::_scratch_mkfs_xfs: + if [ $mkfs_status -eq 0 ]; then + xfs_admin -U generate $SCRATCH_DEV > /dev/null + fi triggers all sorts of errors in xfstests. xfs/104 is an example, where growfs fails with a UUID mismatch corruption detected by xfs_agf_write_verify() when trying to write the first new AG headers. Fix this problem by making sure we copy the sb_meta_uuid into new metadata written by growfs. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_fsops.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 9b3438a7680f..ee3aaa0a5317 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -250,7 +250,7 @@ xfs_growfs_data_private( agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); if (xfs_sb_version_hascrc(&mp->m_sb)) - uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -273,7 +273,7 @@ xfs_growfs_data_private( if (xfs_sb_version_hascrc(&mp->m_sb)) { agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); agfl->agfl_seqno = cpu_to_be32(agno); - uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); } agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); @@ -309,7 +309,7 @@ xfs_growfs_data_private( agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_sb_version_hascrc(&mp->m_sb)) - uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); if (xfs_sb_version_hasfinobt(&mp->m_sb)) { agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); agi->agi_free_level = cpu_to_be32(1); -- cgit v1.2.3 From fcfbe2c4ef4243cc11a1cd64ee1b4907b6afea06 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 19 Aug 2015 10:31:54 +1000 Subject: xfs: log recovery needs to validate against sb_meta_uuid Now that sb_uuid can be changed by the user, we cannot use this to validate the metadata blocks being recovered belong to this filesystem. We must check against the sb_meta_uuid as that will remain unchanged. There is a complication in this code - the superblock itself. We can not check the sb_meta_uuid unconditionally, as that may not be set on disk. Hence we must verify the superblock sb_uuid matches between the log record and the in-core superblock. Found by inspection after the previous two problems were found. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 01dd228ca05e..86c3de477a9d 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1890,15 +1890,25 @@ xlog_recover_get_buf_lsn( uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; break; case XFS_SB_MAGIC: + /* + * superblock uuids are magic. We may or may not have a + * sb_meta_uuid on disk, but it will be set in the in-core + * superblock. We set the uuid pointer for verification + * according to the superblock feature mask to ensure we check + * the relevant UUID in the superblock. + */ lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); - uuid = &((struct xfs_dsb *)blk)->sb_uuid; + if (xfs_sb_version_hasmetauuid(&mp->m_sb)) + uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; + else + uuid = &((struct xfs_dsb *)blk)->sb_uuid; break; default: break; } if (lsn != (xfs_lsn_t)-1) { - if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) goto recover_immediately; return lsn; } -- cgit v1.2.3 From 928634514bc53f66631a731bf623157c913b145e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 19 Aug 2015 10:32:01 +1000 Subject: xfs: dquots should be stamped with sb_meta_uuid Once the sb_uuid is changed, the wrong uuid is stamped into new dquots on disk. Found by inspection, verified by generic/219. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_dquot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 4143dc75dca4..b1b26b6a0735 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -251,7 +251,7 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_flags = type; if (xfs_sb_version_hascrc(&mp->m_sb)) { - uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } -- cgit v1.2.3 From 1b867d3ab562b6b03e46113fad3e87b05fbfbb85 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 10:32:14 +1000 Subject: xfs: relocate sparse inode mount warning The sparse inodes feature is currently considered experimental. We warn at mount time from xfs_mount_validate_sb(). This function is part of the superblock verifier codepath, however, which means it could be invoked repeatedly on superblock reads or writes. This is currently only noticeable from userspace, where mkfs produces multiple warnings at format time. As mkfs warnings were not the intent of this change, relocate the mount time warning to xfs_fs_fill_super(), which is only invoked once and only in kernel space. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_sb.c | 3 --- fs/xfs/xfs_super.c | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 0f5e08fe64a2..9c87f66ab929 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -182,9 +182,6 @@ xfs_mount_validate_sb( if (xfs_sb_version_hassparseinodes(sbp)) { uint32_t align; - xfs_alert(mp, - "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!"); - align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize >> sbp->sb_blocklog; if (sbp->sb_inoalignmt != align) { diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1fb16562c159..f98ce83b7bc4 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1528,6 +1528,10 @@ xfs_fs_fill_super( } } + if (xfs_sb_version_hassparseinodes(&mp->m_sb)) + xfs_alert(mp, + "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; -- cgit v1.2.3 From 7df1c170b9a45ab3a7401c79bbefa9939bf8eafb Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 10:32:33 +1000 Subject: xfs: swap leaf buffer into path struct atomically during path shift The node directory lookup code uses a state structure that tracks the path of buffers used to search for the hash of a filename through the leaf blocks. When the lookup encounters a block that ends with the requested hash, but the entry has not yet been found, it must shift over to the next block and continue looking for the entry (i.e., duplicate hashes could continue over into the next block). This shift mechanism involves walking back up and down the state structure, replacing buffers at the appropriate btree levels as necessary. When a buffer is replaced, the old buffer is released and the new buffer read into the active slot in the path structure. Because the buffer is read directly into the path slot, a buffer read failure can result in setting a NULL buffer pointer in an active slot. This throws off the state cleanup code in xfs_dir2_node_lookup(), which expects to release a buffer from each active slot. Instead, a BUG occurs due to a NULL pointer dereference: BUG: unable to handle kernel NULL pointer dereference at 00000000000001e8 IP: [] xfs_trans_brelse+0x2a3/0x3c0 [xfs] ... RIP: 0010:[] [] xfs_trans_brelse+0x2a3/0x3c0 [xfs] ... Call Trace: [] xfs_dir2_node_lookup+0xa6/0x2c0 [xfs] [] xfs_dir_lookup+0x1ac/0x1c0 [xfs] [] xfs_lookup+0x91/0x290 [xfs] [] xfs_vn_lookup+0x73/0xb0 [xfs] [] lookup_real+0x1d/0x50 [] path_openat+0x91e/0x1490 [] do_filp_open+0x89/0x100 ... This has been reproduced via a parallel fsstress and filesystem shutdown workload in a loop. The shutdown triggers the read error in the aforementioned codepath and causes the BUG in xfs_dir2_node_lookup(). Update xfs_da3_path_shift() to update the active path slot atomically with respect to the caller when a buffer is replaced. This ensures that the caller always sees the old or new buffer in the slot and prevents the NULL pointer dereference. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_da_btree.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e9f6709a3846..f6a8631dc771 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -1822,6 +1822,7 @@ xfs_da3_path_shift( struct xfs_da_args *args; struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; + struct xfs_buf *bp; xfs_dablk_t blkno = 0; int level; int error; @@ -1866,20 +1867,24 @@ xfs_da3_path_shift( */ for (blk++, level++; level < path->active; blk++, level++) { /* - * Release the old block. - * (if it's dirty, trans won't actually let go) + * Read the next child block into a local buffer. */ - if (release) - xfs_trans_brelse(args->trans, blk->bp); + error = xfs_da3_node_read(args->trans, dp, blkno, -1, &bp, + args->whichfork); + if (error) + return error; /* - * Read the next child block. + * Release the old block (if it's dirty, the trans doesn't + * actually let go) and swap the local buffer into the path + * structure. This ensures failure of the above read doesn't set + * a NULL buffer in an active slot in the path. */ + if (release) + xfs_trans_brelse(args->trans, blk->bp); blk->blkno = blkno; - error = xfs_da3_node_read(args->trans, dp, blkno, -1, - &blk->bp, args->whichfork); - if (error) - return error; + blk->bp = bp; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || -- cgit v1.2.3 From 0952c8183c1575a78dc416b5e168987ff98728bb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 19 Aug 2015 10:32:49 +1000 Subject: xfs: clean up inode lockdep annotations Lockdep annotations are a maintenance nightmare. Locking has to be modified to suit the limitations of the annotations, and we're always having to fix the annotations because they are unable to express the complexity of locking heirarchies correctly. So, next up, we've got more issues with lockdep annotations for inode locking w.r.t. XFS_LOCK_PARENT: - lockdep classes are exclusive and can't be ORed together to form new classes. - IOLOCK needs multiple PARENT subclasses to express the changes needed for the readdir locking rework needed to stop the endless flow of lockdep false positives involving readdir calling filldir under the ILOCK. - there are only 8 unique lockdep subclasses available, so we can't create a generic solution. IOWs we need to treat the 3-bit space available to each lock type differently: - IOLOCK uses xfs_lock_two_inodes(), so needs: - at least 2 IOLOCK subclasses - at least 2 IOLOCK_PARENT subclasses - MMAPLOCK uses xfs_lock_two_inodes(), so needs: - at least 2 MMAPLOCK subclasses - ILOCK uses xfs_lock_inodes with up to 5 inodes, so needs: - at least 5 ILOCK subclasses - one ILOCK_PARENT subclass - one RTBITMAP subclass - one RTSUM subclass For the IOLOCK, split the space into two sets of subclasses. For the MMAPLOCK, just use half the space for the one subclass to match the non-parent lock classes of the IOLOCK. For the ILOCK, use 0-4 as the ILOCK subclasses, 5-7 for the remaining individual subclasses. Because they are now all different, modify xfs_lock_inumorder() to handle the nested subclasses, and to assert fail if passed an invalid subclass. Further, annotate xfs_lock_inodes() to assert fail if an invalid combination of lock primitives and inode counts are passed that would result in a lockdep subclass annotation overflow. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 68 ++++++++++++++++++++++++++++++++----------- fs/xfs/xfs_inode.h | 85 +++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 110 insertions(+), 43 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3e343c7f347a..657f6123b445 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -164,7 +164,7 @@ xfs_ilock( (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); @@ -212,7 +212,7 @@ xfs_ilock_nowait( (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) { if (!mrtryupdate(&ip->i_iolock)) @@ -281,7 +281,7 @@ xfs_iunlock( (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); ASSERT(lock_flags != 0); if (lock_flags & XFS_IOLOCK_EXCL) @@ -364,30 +364,38 @@ int xfs_lock_delays; /* * Bump the subclass so xfs_lock_inodes() acquires each lock with a different - * value. This shouldn't be called for page fault locking, but we also need to - * ensure we don't overrun the number of lockdep subclasses for the iolock or - * mmaplock as that is limited to 12 by the mmap lock lockdep annotations. + * value. This can be called for any type of inode lock combination, including + * parent locking. Care must be taken to ensure we don't overrun the subclass + * storage fields in the class mask we build. */ static inline int xfs_lock_inumorder(int lock_mode, int subclass) { + int class = 0; + + ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | + XFS_ILOCK_RTSUM))); + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { - ASSERT(subclass + XFS_LOCK_INUMORDER < - (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT))); - lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); + ASSERT(subclass + XFS_IOLOCK_PARENT_VAL < + MAX_LOCKDEP_SUBCLASSES); + class += subclass << XFS_IOLOCK_SHIFT; + if (lock_mode & XFS_IOLOCK_PARENT) + class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT; } if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { - ASSERT(subclass + XFS_LOCK_INUMORDER < - (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT))); - lock_mode |= (subclass + XFS_LOCK_INUMORDER) << - XFS_MMAPLOCK_SHIFT; + ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS); + class += subclass << XFS_MMAPLOCK_SHIFT; } - if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) - lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) { + ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS); + class += subclass << XFS_ILOCK_SHIFT; + } - return lock_mode; + return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class; } /* @@ -399,6 +407,11 @@ xfs_lock_inumorder(int lock_mode, int subclass) * transaction (such as truncate). This can result in deadlock since the long * running trans might need to wait for the inode we just locked in order to * push the tail and free space in the log. + * + * xfs_lock_inodes() can only be used to lock one type of lock at a time - + * the iolock, the mmaplock or the ilock, but not more than one at a time. If we + * lock more than one at a time, lockdep will report false positives saying we + * have violated locking orders. */ void xfs_lock_inodes( @@ -409,8 +422,29 @@ xfs_lock_inodes( int attempts = 0, i, j, try_lock; xfs_log_item_t *lp; - /* currently supports between 2 and 5 inodes */ + /* + * Currently supports between 2 and 5 inodes with exclusive locking. We + * support an arbitrary depth of locking here, but absolute limits on + * inodes depend on the the type of locking and the limits placed by + * lockdep annotations in xfs_lock_inumorder. These are all checked by + * the asserts. + */ ASSERT(ips && inodes >= 2 && inodes <= 5); + ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | + XFS_ILOCK_EXCL)); + ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | + XFS_ILOCK_SHARED))); + ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) || + inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1); + ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || + inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1); + ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || + inodes <= XFS_ILOCK_MAX_SUBCLASS + 1); + + if (lock_mode & XFS_IOLOCK_EXCL) { + ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL))); + } else if (lock_mode & XFS_MMAPLOCK_EXCL) + ASSERT(!(lock_mode & XFS_ILOCK_EXCL)); try_lock = 0; i = 0; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 8f22d20368d8..ca9e11989cbd 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -284,9 +284,9 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * Flags for lockdep annotations. * * XFS_LOCK_PARENT - for directory operations that require locking a - * parent directory inode and a child entry inode. The parent gets locked - * with this flag so it gets a lockdep subclass of 1 and the child entry - * lock will have a lockdep subclass of 0. + * parent directory inode and a child entry inode. IOLOCK requires nesting, + * MMAPLOCK does not support this class, ILOCK requires a single subclass + * to differentiate parent from child. * * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary * inodes do not participate in the normal lock order, and thus have their @@ -295,30 +295,63 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * XFS_LOCK_INUMORDER - for locking several inodes at the some time * with xfs_lock_inodes(). This flag is used as the starting subclass * and each subsequent lock acquired will increment the subclass by one. - * So the first lock acquired will have a lockdep subclass of 4, the - * second lock will have a lockdep subclass of 5, and so on. It is - * the responsibility of the class builder to shift this to the correct - * portion of the lock_mode lockdep mask. + * However, MAX_LOCKDEP_SUBCLASSES == 8, which means we are greatly + * limited to the subclasses we can represent via nesting. We need at least + * 5 inodes nest depth for the ILOCK through rename, and we also have to support + * XFS_ILOCK_PARENT, which gives 6 subclasses. Then we have XFS_ILOCK_RTBITMAP + * and XFS_ILOCK_RTSUM, which are another 2 unique subclasses, so that's all + * 8 subclasses supported by lockdep. + * + * This also means we have to number the sub-classes in the lowest bits of + * the mask we keep, and we have to ensure we never exceed 3 bits of lockdep + * mask and we can't use bit-masking to build the subclasses. What a mess. + * + * Bit layout: + * + * Bit Lock Region + * 16-19 XFS_IOLOCK_SHIFT dependencies + * 20-23 XFS_MMAPLOCK_SHIFT dependencies + * 24-31 XFS_ILOCK_SHIFT dependencies + * + * IOLOCK values + * + * 0-3 subclass value + * 4-7 PARENT subclass values + * + * MMAPLOCK values + * + * 0-3 subclass value + * 4-7 unused + * + * ILOCK values + * 0-4 subclass values + * 5 PARENT subclass (not nestable) + * 6 RTBITMAP subclass (not nestable) + * 7 RTSUM subclass (not nestable) + * */ -#define XFS_LOCK_PARENT 1 -#define XFS_LOCK_RTBITMAP 2 -#define XFS_LOCK_RTSUM 3 -#define XFS_LOCK_INUMORDER 4 - -#define XFS_IOLOCK_SHIFT 16 -#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) - -#define XFS_MMAPLOCK_SHIFT 20 - -#define XFS_ILOCK_SHIFT 24 -#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) -#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) -#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) - -#define XFS_IOLOCK_DEP_MASK 0x000f0000 -#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 -#define XFS_ILOCK_DEP_MASK 0xff000000 -#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \ +#define XFS_IOLOCK_SHIFT 16 +#define XFS_IOLOCK_PARENT_VAL 4 +#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1) +#define XFS_IOLOCK_DEP_MASK 0x000f0000 +#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT) + +#define XFS_MMAPLOCK_SHIFT 20 +#define XFS_MMAPLOCK_NUMORDER 0 +#define XFS_MMAPLOCK_MAX_SUBCLASS 3 +#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 + +#define XFS_ILOCK_SHIFT 24 +#define XFS_ILOCK_PARENT_VAL 5 +#define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1) +#define XFS_ILOCK_RTBITMAP_VAL 6 +#define XFS_ILOCK_RTSUM_VAL 7 +#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT) + +#define XFS_LOCK_SUBCLASS_MASK (XFS_IOLOCK_DEP_MASK | \ XFS_MMAPLOCK_DEP_MASK | \ XFS_ILOCK_DEP_MASK) -- cgit v1.2.3 From dbad7c993053d8f482a5f76270a93307537efd8e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 19 Aug 2015 10:33:00 +1000 Subject: xfs: stop holding ILOCK over filldir callbacks The recent change to the readdir locking made in 40194ec ("xfs: reinstate the ilock in xfs_readdir") for CXFS directory sanity was probably the wrong thing to do. Deep in the readdir code we can take page faults in the filldir callback, and so taking a page fault while holding an inode ilock creates a new set of locking issues that lockdep warns all over the place about. The locking order for regular inodes w.r.t. page faults is io_lock -> pagefault -> mmap_sem -> ilock. The directory readdir code now triggers ilock -> page fault -> mmap_sem. While we cannot deadlock at this point, it inverts all the locking patterns that lockdep normally sees on XFS inodes, and so triggers lockdep. We worked around this with commit 93a8614 ("xfs: fix directory inode iolock lockdep false positive"), but that then just moved the lockdep warning to deeper in the page fault path and triggered on security inode locks. Fixing the shmem issue there just moved the lockdep reports somewhere else, and now we are getting false positives from filesystem freezing annotations getting confused. Further, if we enter memory reclaim in a readdir path, we now get lockdep warning about potential deadlocks because the ilock is held when we enter reclaim. This, again, is different to a regular file in that we never allow memory reclaim to run while holding the ilock for regular files. Hence lockdep now throws ilock->kmalloc->reclaim->ilock warnings. Basically, the problem is that the ilock is being used to protect the directory data and the inode metadata, whereas for a regular file the iolock protects the data and the ilock protects the metadata. From the VFS perspective, the i_mutex serialises all accesses to the directory data, and so not holding the ilock for readdir doesn't matter. The issue is that CXFS doesn't access directory data via the VFS, so it has no "data serialisaton" mechanism. Hence we need to hold the IOLOCK in the correct places to provide this low level directory data access serialisation. The ilock can then be used just when the extent list needs to be read, just like we do for regular files. The directory modification code can take the iolock exclusive when the ilock is also taken, and this then ensures that readdir is correct excluded while modifications are in progress. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2.c | 3 +++ fs/xfs/xfs_dir2_readdir.c | 11 ++++++++--- fs/xfs/xfs_inode.c | 34 +++++++++++++++++++++------------- fs/xfs/xfs_symlink.c | 7 ++++--- 4 files changed, 36 insertions(+), 19 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index a69fb3a1e161..42799d88ecc0 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -362,6 +362,7 @@ xfs_dir_lookup( struct xfs_da_args *args; int rval; int v; /* type-checking value */ + int lock_mode; ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_lookup); @@ -387,6 +388,7 @@ xfs_dir_lookup( if (ci_name) args->op_flags |= XFS_DA_OP_CILOOKUP; + lock_mode = xfs_ilock_data_map_shared(dp); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_lookup(args); goto out_check_rval; @@ -419,6 +421,7 @@ out_check_rval: } } out_free: + xfs_iunlock(dp, lock_mode); kmem_free(args); return rval; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 098cd78fe708..a989a9c7edb7 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -171,6 +171,7 @@ xfs_dir2_block_getdents( int wantoff; /* starting block offset */ xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; + int lock_mode; /* * If the block number in the offset is out of range, we're done. @@ -178,7 +179,9 @@ xfs_dir2_block_getdents( if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; + lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir3_block_read(NULL, dp, &bp); + xfs_iunlock(dp, lock_mode); if (error) return error; @@ -529,9 +532,12 @@ xfs_dir2_leaf_getdents( * current buffer, need to get another one. */ if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { + int lock_mode; + lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir2_leaf_readbuf(args, bufsize, map_info, &curoff, &bp); + xfs_iunlock(dp, lock_mode); if (error || !map_info->map_valid) break; @@ -653,7 +659,6 @@ xfs_readdir( struct xfs_da_args args = { NULL }; int rval; int v; - uint lock_mode; trace_xfs_readdir(dp); @@ -666,7 +671,7 @@ xfs_readdir( args.dp = dp; args.geo = dp->i_mount->m_dir_geo; - lock_mode = xfs_ilock_data_map_shared(dp); + xfs_ilock(dp, XFS_IOLOCK_SHARED); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); else if ((rval = xfs_dir2_isblock(&args, &v))) @@ -675,7 +680,7 @@ xfs_readdir( rval = xfs_dir2_block_getdents(&args, ctx); else rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); - xfs_iunlock(dp, lock_mode); + xfs_iunlock(dp, XFS_IOLOCK_SHARED); return rval; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 657f6123b445..65792660b043 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -663,30 +663,29 @@ xfs_lookup( { xfs_ino_t inum; int error; - uint lock_mode; trace_xfs_lookup(dp, name); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - lock_mode = xfs_ilock_data_map_shared(dp); + xfs_ilock(dp, XFS_IOLOCK_SHARED); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); - xfs_iunlock(dp, lock_mode); - if (error) - goto out; + goto out_unlock; error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); if (error) goto out_free_name; + xfs_iunlock(dp, XFS_IOLOCK_SHARED); return 0; out_free_name: if (ci_name) kmem_free(ci_name->name); -out: +out_unlock: + xfs_iunlock(dp, XFS_IOLOCK_SHARED); *ipp = NULL; return error; } @@ -1183,7 +1182,8 @@ xfs_create( goto out_trans_cancel; - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | + XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); unlock_dp_on_error = true; xfs_bmap_init(&free_list, &first_block); @@ -1222,7 +1222,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1295,7 +1295,7 @@ xfs_create( xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return error; } @@ -1443,10 +1443,11 @@ xfs_link( if (error) goto error_return; + xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow hard link @@ -2549,9 +2550,10 @@ xfs_remove( goto out_trans_cancel; } + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* @@ -2932,6 +2934,12 @@ xfs_rename( * whether the target directory is the same as the source * directory, we can lock from 2 to 4 inodes. */ + if (!new_parent) + xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); + else + xfs_lock_two_inodes(src_dp, target_dp, + XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); /* @@ -2939,9 +2947,9 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. */ - xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 4be27b0210af..7a01486eff06 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -240,7 +240,8 @@ xfs_symlink( if (error) goto out_trans_cancel; - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | + XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); unlock_dp_on_error = true; /* @@ -288,7 +289,7 @@ xfs_symlink( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); unlock_dp_on_error = false; /* @@ -421,7 +422,7 @@ out_release_inode: xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return error; } -- cgit v1.2.3 From 2f123bce18943fff819bc10f8868ffb9149fc622 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 19 Aug 2015 10:33:58 +1000 Subject: libxfs: readahead of dir3 data blocks should use the read verifier In the dir3 data block readahead function, use the regular read verifier to check the block's CRC and spot-check the block contents instead of directly calling only the spot-checking routine. This prevents corrupted directory data blocks from being read into the kernel, which can lead to garbage ls output and directory loops (if say one of the entries contains slashes and other junk). cc: # 3.12 - 4.2 Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2_data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 6a57fdbc63ef..824131e71bc5 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -252,7 +252,8 @@ xfs_dir3_data_reada_verify( return; case cpu_to_be32(XFS_DIR2_DATA_MAGIC): case cpu_to_be32(XFS_DIR3_DATA_MAGIC): - xfs_dir3_data_verify(bp); + bp->b_ops = &xfs_dir3_data_buf_ops; + bp->b_ops->verify_read(bp); return; default: xfs_buf_ioerror(bp, -EFSCORRUPTED); -- cgit v1.2.3 From ffeecc5213024ae663377b442eedcfbacf6d0c5d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 19 Aug 2015 10:34:32 +1000 Subject: xfs: Fix xfs_attr_leafblock definition struct xfs_attr_leafblock contains 'entries' array which is declared with size 1 altough it can in fact contain much more entries. Since this array is followed by further struct members, gcc (at least in version 4.8.3) thinks that the array has the fixed size of 1 element and thus may optimize away all accesses beyond the end of array resulting in non-working code. This problem was only observed with userspace code in xfsprogs, however it's better to be safe in kernel as well and have matching kernel and xfsprogs definitions. cc: Signed-off-by: Jan Kara Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_da_format.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 74bcbabfa523..b14bbd6bb05f 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -680,8 +680,15 @@ typedef struct xfs_attr_leaf_name_remote { typedef struct xfs_attr_leafblock { xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ - xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ - xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ + /* + * The rest of the block contains the following structures after the + * leaf entries, growing from the bottom up. The variables are never + * referenced and definining them can actually make gcc optimize away + * accesses to the 'entries' array above index 0 so don't do that. + * + * xfs_attr_leaf_name_local_t namelist; + * xfs_attr_leaf_name_remote_t valuelist; + */ } xfs_attr_leafblock_t; /* -- cgit v1.2.3 From 3d751af2cbe9a73a869986a18e865f8a34265052 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 10:35:04 +1000 Subject: xfs: flush entire file on dio read/write to cached file Filesystems are responsible to manage file coherency between the page cache and direct I/O. The generic dio code flushes dirty pages over the range of a dio to ensure that the dio read or a future buffered read returns the correct data. XFS has generally followed this pattern, though traditionally has flushed and invalidated the range from the start of the I/O all the way to the end of the file. This changed after the following commit: 7d4ea3ce xfs: use ranged writeback and invalidation for direct IO ... as the full file flush was no longer necessary to deal with the strange post-eof delalloc issues that were since fixed. Unfortunately, we have since received complaints about performance degradation due to the increased exclusive iolock cycles (which locks out parallel dio submission) that occur when a file has cached pages. This does not occur on filesystems that use the generic code as it also does not incorporate locking. The exclusive iolock is acquired any time the inode mapping has cached pages, regardless of whether they reside in the range of the I/O or not. If not, the flush/inval calls do no work and the lock was cycled for no reason. Under consideration of the cost of the exclusive iolock, update the dio read and write handlers to flush and invalidate the entire mapping when cached pages exist. In most cases, this increases the cost of the initial flush sequence but eliminates the need for further lock cycles and flushes so long as the workload does not actively mix direct and buffered I/O. This also more closely matches historical behavior and performance characteristics that users have come to expect. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 51 +++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f0e8249722d4..2d91ab066370 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -317,24 +317,33 @@ xfs_file_read_iter( return -EIO; /* - * Locking is a bit tricky here. If we take an exclusive lock - * for direct IO, we effectively serialise all new concurrent - * read IO to this file and block it behind IO that is currently in - * progress because IO in progress holds the IO lock shared. We only - * need to hold the lock exclusive to blow away the page cache, so - * only take lock exclusively if the page cache needs invalidation. - * This allows the normal direct IO case of no page cache pages to - * proceeed concurrently without serialisation. + * Locking is a bit tricky here. If we take an exclusive lock for direct + * IO, we effectively serialise all new concurrent read IO to this file + * and block it behind IO that is currently in progress because IO in + * progress holds the IO lock shared. We only need to hold the lock + * exclusive to blow away the page cache, so only take lock exclusively + * if the page cache needs invalidation. This allows the normal direct + * IO case of no page cache pages to proceeed concurrently without + * serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + /* + * The generic dio code only flushes the range of the particular + * I/O. Because we take an exclusive lock here, this whole + * sequence is considerably more expensive for us. This has a + * noticeable performance impact for any file with cached pages, + * even when outside of the range of the particular I/O. + * + * Hence, amortize the cost of the lock against a full file + * flush and reduce the chances of repeated iolock cycles going + * forward. + */ if (inode->i_mapping->nrpages) { - ret = filemap_write_and_wait_range( - VFS_I(ip)->i_mapping, - pos, pos + size - 1); + ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; @@ -345,9 +354,7 @@ xfs_file_read_iter( * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ - ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - pos >> PAGE_CACHE_SHIFT, - (pos + size - 1) >> PAGE_CACHE_SHIFT); + ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); WARN_ON_ONCE(ret); ret = 0; } @@ -733,19 +740,19 @@ xfs_file_dio_aio_write( pos = iocb->ki_pos; end = pos + count - 1; + /* + * See xfs_file_read_iter() for why we do a full-file flush here. + */ if (mapping->nrpages) { - ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - pos, end); + ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (ret) goto out; /* - * Invalidate whole pages. This can return an error if - * we fail to invalidate a page, but this should never - * happen on XFS. Warn if it does fail. + * Invalidate whole pages. This can return an error if we fail + * to invalidate a page, but this should never happen on XFS. + * Warn if it does fail. */ - ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - pos >> PAGE_CACHE_SHIFT, - end >> PAGE_CACHE_SHIFT); + ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); WARN_ON_ONCE(ret); ret = 0; } -- cgit v1.2.3 From 3403ccc0c9f069c40ea751a93ac6746f5ef2116a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 20 Aug 2015 09:27:49 +1000 Subject: xfs: inode lockdep annotations broke non-lockdep build Fix CONFIG_LOCKDEP=n build, because asserts I put in to ensure we aren't overrunning lockdep subclasses in commit 0952c81 ("xfs: clean up inode lockdep annotations") use a define that doesn't exist when CONFIG_LOCKDEP=n Only check the subclass limits when lockdep is actually enabled. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 65792660b043..aa00ccc0bd78 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -362,6 +362,17 @@ int xfs_lots_retries; int xfs_lock_delays; #endif +#ifdef CONFIG_LOCKDEP +static bool +xfs_lockdep_subclass_ok( + int subclass) +{ + return subclass < MAX_LOCKDEP_SUBCLASSES; +} +#else +#define xfs_lockdep_subclass_ok(subclass) (true) +#endif + /* * Bump the subclass so xfs_lock_inodes() acquires each lock with a different * value. This can be called for any type of inode lock combination, including @@ -375,11 +386,12 @@ xfs_lock_inumorder(int lock_mode, int subclass) ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | XFS_ILOCK_RTSUM))); + ASSERT(xfs_lockdep_subclass_ok(subclass)); if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); - ASSERT(subclass + XFS_IOLOCK_PARENT_VAL < - MAX_LOCKDEP_SUBCLASSES); + ASSERT(xfs_lockdep_subclass_ok(subclass + + XFS_IOLOCK_PARENT_VAL)); class += subclass << XFS_IOLOCK_SHIFT; if (lock_mode & XFS_IOLOCK_PARENT) class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT; -- cgit v1.2.3 From c184f855c483428027d6ec937e4a9d5f15b2cbad Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 25 Aug 2015 10:05:13 +1000 Subject: xfs: Fix uninitialized return value in xfs_alloc_fix_freelist() xfs_alloc_fix_freelist() can sometimes jump to out_agbp_relse without ever setting value of 'error' variable which is then returned. This can happen e.g. when pag->pagf_init is set but AG is for metadata and we want to allocate user data. Fix the problem by initializing 'error' to 0, which is the desired return value when we decide to skip this group. CC: xfs@oss.sgi.com Coverity-id: 1309714 Signed-off-by: Jan Kara Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index b7fc17ce8233..ffad7f20342f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1937,7 +1937,7 @@ xfs_alloc_fix_freelist( struct xfs_alloc_arg targs; /* local allocation arguments */ xfs_agblock_t bno; /* freelist block */ xfs_extlen_t need; /* total blocks needed in freelist */ - int error; + int error = 0; if (!pag->pagf_init) { error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); -- cgit v1.2.3 From b6a9947efdbe0c9135d94b26b2f912f5b0b9dc45 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 25 Aug 2015 10:05:13 +1000 Subject: xfs: lockdep annotations throw warnings on non-debug builds SO, now if we enable lockdep without enabling CONFIG_XFS_DEBUG, the lockdep annotations throw a warning because the assert that uses the lockdep define is not built in: fs/xfs/xfs_inode.c:367:1: warning: 'xfs_lockdep_subclass_ok' defined but not used [-Wunused-function] xfs_lockdep_subclass_ok( So now we need to create an ifdef mess to sort this all out, because we need to handle all the combinations of CONFIG_XFS_DEBUG=[y|n], CONFIG_XFS_WARNING=[y|n] and CONFIG_LOCKDEP=[y|n] appropriately. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index aa00ccc0bd78..c59da0e88c5f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -362,7 +362,13 @@ int xfs_lots_retries; int xfs_lock_delays; #endif -#ifdef CONFIG_LOCKDEP +/* + * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when + * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined + * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build + * errors and warnings. + */ +#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP) static bool xfs_lockdep_subclass_ok( int subclass) -- cgit v1.2.3 From 037542345a82aaaa228ec280fe6ddff1568d169f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 25 Aug 2015 10:05:13 +1000 Subject: xfs: Fix file type directory corruption for btree directories Users have occasionally reported that file type for some directory entries is wrong. This mostly happened after updating libraries some libraries. After some debugging the problem was traced down to xfs_dir2_node_replace(). The function uses args->filetype as a file type to store in the replaced directory entry however it also calls xfs_da3_node_lookup_int() which will store file type of the current directory entry in args->filetype. Thus we fail to change file type of a directory entry to a proper type. Fix the problem by storing new file type in a local variable before calling xfs_da3_node_lookup_int(). cc: # 3.16 - 4.x Reported-by: Giacomo Comes Signed-off-by: Jan Kara Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2_node.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 527b7337a43b..9bd3d698ee77 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -2132,6 +2132,7 @@ xfs_dir2_node_replace( int error; /* error return value */ int i; /* btree level */ xfs_ino_t inum; /* new inode number */ + int ftype; /* new file type */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */ int rval; /* internal return value */ @@ -2145,7 +2146,14 @@ xfs_dir2_node_replace( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; + + /* + * We have to save new inode number and ftype since + * xfs_da3_node_lookup_int() is going to overwrite them + */ inum = args->inumber; + ftype = args->filetype; + /* * Lookup the entry to change in the btree. */ @@ -2183,7 +2191,7 @@ xfs_dir2_node_replace( * Fill in the new inode number and log the entry. */ dep->inumber = cpu_to_be64(inum); - args->dp->d_ops->data_put_ftype(dep, args->filetype); + args->dp->d_ops->data_put_ftype(dep, ftype); xfs_dir2_data_log_entry(args, state->extrablk.bp, dep); rval = 0; } -- cgit v1.2.3 From 2ccf4a9b18868b0900072e6d5d15a04254a07345 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 25 Aug 2015 10:05:13 +1000 Subject: xfs: collapse allocsize and biosize mount option handling The allocsize and biosize mount options are handled identically, other than allocsize accepting suffixes. suffix_kstrtoint handles bare numbers just fine too, so these can be collapsed. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_super.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f98ce83b7bc4..3bf503a3f57e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -261,16 +261,8 @@ xfs_parseargs( mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); if (!mp->m_rtname) return -ENOMEM; - } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { - if (!value || !*value) { - xfs_warn(mp, "%s option requires an argument", - this_char); - return -EINVAL; - } - if (kstrtoint(value, 10, &iosize)) - return -EINVAL; - iosizelog = ffs(iosize) - 1; - } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { + } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE) || + !strcmp(this_char, MNTOPT_BIOSIZE)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); -- cgit v1.2.3 From f79af0b9090895520c69fbe1939184c4f8ed8426 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 25 Aug 2015 10:05:13 +1000 Subject: xfs: fix non-debug build warnings There seem to be a couple of new set-but-unused build warnings that gcc 4.9.3 is now warning about. These are not regressions, just the compiler being more picky. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 4 +--- fs/xfs/xfs_buf_item.c | 20 +++++++++++++------- fs/xfs/xfs_buf_item.h | 2 +- 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index a4b7d92e946c..fbf4c269a7b7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -438,7 +438,6 @@ _xfs_buf_find( xfs_buf_flags_t flags, xfs_buf_t *new_bp) { - size_t numbytes; struct xfs_perag *pag; struct rb_node **rbp; struct rb_node *parent; @@ -450,10 +449,9 @@ _xfs_buf_find( for (i = 0; i < nmaps; i++) numblks += map[i].bm_len; - numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(numbytes < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize)); ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); /* diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 092d652bc03d..9cf3a86a2201 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -750,13 +750,13 @@ xfs_buf_item_free_format( * buffer (see xfs_buf_attach_iodone() below), then put the * buf log item at the front. */ -void +int xfs_buf_item_init( - xfs_buf_t *bp, - xfs_mount_t *mp) + struct xfs_buf *bp, + struct xfs_mount *mp) { - xfs_log_item_t *lip = bp->b_fspriv; - xfs_buf_log_item_t *bip; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_buf_log_item *bip; int chunks; int map_size; int error; @@ -770,12 +770,11 @@ xfs_buf_item_init( */ ASSERT(bp->b_target->bt_mount == mp); if (lip != NULL && lip->li_type == XFS_LI_BUF) - return; + return 0; bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; - xfs_buf_hold(bp); /* * chunks is the number of XFS_BLF_CHUNK size pieces the buffer @@ -788,6 +787,11 @@ xfs_buf_item_init( */ error = xfs_buf_item_get_format(bip, bp->b_map_count); ASSERT(error == 0); + if (error) { /* to stop gcc throwing set-but-unused warnings */ + kmem_zone_free(xfs_buf_item_zone, bip); + return error; + } + for (i = 0; i < bip->bli_format_count; i++) { chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), @@ -807,6 +811,8 @@ xfs_buf_item_init( if (bp->b_fspriv) bip->bli_item.li_bio_list = bp->b_fspriv; bp->b_fspriv = bip; + xfs_buf_hold(bp); + return 0; } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 3f3455a41510..f7eba99d19dd 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -61,7 +61,7 @@ typedef struct xfs_buf_log_item { struct xfs_buf_log_format __bli_format; /* embedded in-log header */ } xfs_buf_log_item_t; -void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); +int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); uint xfs_buf_item_dirty(xfs_buf_log_item_t *); -- cgit v1.2.3 From dfdd4ac66c2f921ecec730a2b24b0b13e10346b2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 28 Aug 2015 14:50:03 +1000 Subject: libxfs: bad magic number should set da block buffer error If xfs_da3_node_read_verify() doesn't recognize the magic number of a buffer it's just read, set the buffer error to -EFSCORRUPTED so that the error can be sent up to userspace. Without this patch we'll notice the bad magic eventually while trying to traverse or change the block, but we really ought to fail early in the verifier. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_da_btree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 2385f8cd08ab..8951e34ab76a 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -233,6 +233,7 @@ xfs_da3_node_read_verify( bp->b_ops->verify_read(bp); return; default: + xfs_buf_ioerror(bp, -EFSCORRUPTED); break; } -- cgit v1.2.3 From c9eb256eda4420c06bb10f5e8fbdbe1a34bc98e0 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Fri, 28 Aug 2015 14:50:45 +1000 Subject: xfs: return errors from partial I/O failures to files There is an issue with xfs's error reporting in some cases of I/O partially failing and partially succeeding. Calls like fsync() can report success even though not all I/O was successful in partial-failure cases such as one disk of a RAID0 array being offline. The issue can occur when there are more than one bio per xfs_ioend struct. Each call to xfs_end_bio() for a bio completing will write a value to ioend->io_error. If a successful bio completes after any failed bio, no error is reported do to it writing 0 over the error code set by any failed bio. The I/O error information is now lost and when the ioend is completed only success is reported back up the filesystem stack. xfs_end_bio() should only set ioend->io_error in the case of BIO_UPTODATE being clear. ioend->io_error is initialized to 0 at allocation so only needs to be updated by a failed bio. Also check that ioend->io_error is 0 so that the first error reported will be the error code returned. Cc: stable@vger.kernel.org Signed-off-by: David Jeffery Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3859f5e27a4d..458fced2c0f9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -356,7 +356,8 @@ xfs_end_bio( { xfs_ioend_t *ioend = bio->bi_private; - ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; + if (!ioend->io_error && !test_bit(BIO_UPTODATE, &bio->bi_flags)) + ioend->io_error = error; /* Toss bio and pass work off to an xfsdatad thread */ bio->bi_private = NULL; -- cgit v1.2.3 From 8774cf8bacd4e79b7c65cdf1208da264a9d436d2 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 28 Aug 2015 14:50:56 +1000 Subject: xfs: add mssing inode cache attempts counter increment Increasing the inode cache attempt counter was apparently dropped while refactoring the cache code and so stayed at the initial 0 value. Add the increment back to make the runtime stats more useful. Signed-off-by: Lucas Stach Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_icache.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 76a9f2783282..0a326bd64d4e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -412,6 +412,8 @@ xfs_iget( if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return -EINVAL; + XFS_STATS_INC(xs_ig_attempts); + /* get the perag structure and ensure that it's inode capable */ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); agino = XFS_INO_TO_AGINO(mp, ino); -- cgit v1.2.3 From 1a7ccad88d1bcebabc011b54a2f8615175e523fc Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 28 Aug 2015 14:51:10 +1000 Subject: xfs: fix error gotos in xfs_setattr_nonsize As the code stands today, if xfs_trans_reserve() fails, we goto out_dqrele, which does not free the allocated transaction. Fix up the goto targets to undo everything properly. Addresses-Coverity-Id: 145571 Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 766b23f86ce9..8294132e6a3c 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -609,7 +609,7 @@ xfs_setattr_nonsize( tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) - goto out_dqrele; + goto out_trans_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -640,7 +640,7 @@ xfs_setattr_nonsize( NULL, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (error) /* out of quota */ - goto out_trans_cancel; + goto out_unlock; } } @@ -729,10 +729,10 @@ xfs_setattr_nonsize( return 0; +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); out_trans_cancel: xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); -out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); return error; -- cgit v1.2.3