diff options
Diffstat (limited to 'fs/xfs')
110 files changed, 5084 insertions, 1805 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index f42fcf1b5465..46bcf0e649f5 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -48,9 +48,6 @@ config XFS_POSIX_ACL POSIX Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N. config XFS_RT diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 0da80019a917..c02781a4c091 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -167,7 +167,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. */ -static int /* error */ +int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -520,7 +520,7 @@ xfs_alloc_fixup_trees( return 0; } -static bool +static xfs_failaddr_t xfs_agfl_verify( struct xfs_buf *bp) { @@ -528,10 +528,19 @@ xfs_agfl_verify( struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); int i; + /* + * There is no verification of non-crc AGFLs because mkfs does not + * initialise the AGFL to zero or NULL. Hence the only valid part of the + * AGFL is what the AGF says is active. We can't get to the AGF, so we + * can't verify just those entries are valid. + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return NULL; + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) - return false; + return __this_address; /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't @@ -539,16 +548,17 @@ xfs_agfl_verify( * so we can detect and avoid this problem. */ if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) - return false; + return __this_address; for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) - return false; + return __this_address; } - return xfs_log_check_lsn(mp, - be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)); + if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn))) + return __this_address; + return NULL; } static void @@ -556,6 +566,7 @@ xfs_agfl_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; /* * There is no verification of non-crc AGFLs because mkfs does not @@ -567,28 +578,29 @@ xfs_agfl_read_verify( return; if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_agfl_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_agfl_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void xfs_agfl_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_log_item; + xfs_failaddr_t fa; /* no verification of non-crc AGFLs */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_agfl_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_agfl_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -602,6 +614,7 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = { .name = "xfs_agfl", .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, + .verify_struct = xfs_agfl_verify, }; /* @@ -702,7 +715,7 @@ xfs_alloc_ag_vextent( ASSERT(args->agbno % args->alignment == 0); /* if not file data, insert new block into the reverse map btree */ - if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) { + if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, args->agbno, args->len, &args->oinfo); if (error) @@ -1682,7 +1695,7 @@ xfs_free_ag_extent( bno_cur = cnt_cur = NULL; mp = tp->t_mountp; - if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) { + if (!xfs_rmap_should_skip_owner_update(oinfo)) { error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo); if (error) goto error0; @@ -2397,19 +2410,19 @@ xfs_alloc_put_freelist( return 0; } -static bool +static xfs_failaddr_t xfs_agf_verify( - struct xfs_mount *mp, - struct xfs_buf *bp) - { - struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn))) - return false; + return __this_address; } if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && @@ -2418,18 +2431,18 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) - return false; + return __this_address; if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) - return false; + return __this_address; if (xfs_sb_version_hasrmapbt(&mp->m_sb) && (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) - return false; + return __this_address; /* * during growfs operations, the perag is not fully initialised, @@ -2438,18 +2451,18 @@ xfs_agf_verify( * so we can detect and avoid this problem. */ if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) - return false; + return __this_address; if (xfs_sb_version_haslazysbcount(&mp->m_sb) && be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) - return false; + return __this_address; if (xfs_sb_version_hasreflink(&mp->m_sb) && (be32_to_cpu(agf->agf_refcount_level) < 1 || be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) - return false; + return __this_address; - return true;; + return NULL; } @@ -2458,28 +2471,29 @@ xfs_agf_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp, - XFS_ERRTAG_ALLOC_READ_AGF)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_agf_verify(bp); + if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void xfs_agf_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_log_item; + xfs_failaddr_t fa; - if (!xfs_agf_verify(mp, bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_agf_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -2496,6 +2510,7 @@ const struct xfs_buf_ops xfs_agf_buf_ops = { .name = "xfs_agf", .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, + .verify_struct = xfs_agf_verify, }; /* @@ -2981,3 +2996,22 @@ xfs_verify_fsbno( return false; return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); } + +/* Is there a record covering a given extent? */ +int +xfs_alloc_has_record( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool *exists) +{ + union xfs_btree_irec low; + union xfs_btree_irec high; + + memset(&low, 0, sizeof(low)); + low.a.ar_startblock = bno; + memset(&high, 0xFF, sizeof(high)); + high.a.ar_startblock = bno + len - 1; + + return xfs_btree_has_record(cur, &low, &high, exists); +} diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 7ba2d129d504..65a0cafe06e4 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -198,6 +198,13 @@ xfs_free_extent( enum xfs_ag_resv_type type); /* block reservation type */ int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +int /* error */ xfs_alloc_lookup_ge( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -237,4 +244,7 @@ bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno); bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); +int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, bool *exist); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index cfde0a0f9706..6840b588187e 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -307,13 +307,14 @@ xfs_cntbt_diff_two_keys( be32_to_cpu(k2->alloc.ar_startblock); } -static bool +static xfs_failaddr_t xfs_allocbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; + xfs_failaddr_t fa; unsigned int level; /* @@ -331,29 +332,31 @@ xfs_allocbt_verify( level = be16_to_cpu(block->bb_level); switch (block->bb_magic) { case cpu_to_be32(XFS_ABTB_CRC_MAGIC): - if (!xfs_btree_sblock_v5hdr_verify(bp)) - return false; + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; /* fall through */ case cpu_to_be32(XFS_ABTB_MAGIC): if (pag && pag->pagf_init) { if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) - return false; + return __this_address; } else if (level >= mp->m_ag_maxlevels) - return false; + return __this_address; break; case cpu_to_be32(XFS_ABTC_CRC_MAGIC): - if (!xfs_btree_sblock_v5hdr_verify(bp)) - return false; + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; /* fall through */ case cpu_to_be32(XFS_ABTC_MAGIC): if (pag && pag->pagf_init) { if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) - return false; + return __this_address; } else if (level >= mp->m_ag_maxlevels) - return false; + return __this_address; break; default: - return false; + return __this_address; } return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); @@ -363,25 +366,30 @@ static void xfs_allocbt_read_verify( struct xfs_buf *bp) { + xfs_failaddr_t fa; + if (!xfs_btree_sblock_verify_crc(bp)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_allocbt_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_allocbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } - if (bp->b_error) { + if (bp->b_error) trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_verifier_error(bp); - } } static void xfs_allocbt_write_verify( struct xfs_buf *bp) { - if (!xfs_allocbt_verify(bp)) { + xfs_failaddr_t fa; + + fa = xfs_allocbt_verify(bp); + if (fa) { trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } xfs_btree_sblock_calc_crc(bp); @@ -392,6 +400,7 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = { .name = "xfs_allocbt", .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, + .verify_struct = xfs_allocbt_verify, }; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 6249c92671de..ce4a34a2751d 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -212,6 +212,7 @@ xfs_attr_set( int flags) { struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *leaf_bp = NULL; struct xfs_da_args args; struct xfs_defer_ops dfops; struct xfs_trans_res tres; @@ -327,9 +328,16 @@ xfs_attr_set( * GROT: another possible req'mt for a double-split btree op. */ xfs_defer_init(args.dfops, args.firstblock); - error = xfs_attr_shortform_to_leaf(&args); + error = xfs_attr_shortform_to_leaf(&args, &leaf_bp); if (error) goto out_defer_cancel; + /* + * Prevent the leaf buffer from being unlocked so that a + * concurrent AIL push cannot grab the half-baked leaf + * buffer and run into problems with the write verifier. + */ + xfs_trans_bhold(args.trans, leaf_bp); + xfs_defer_bjoin(args.dfops, leaf_bp); xfs_defer_ijoin(args.dfops, dp); error = xfs_defer_finish(&args.trans, args.dfops); if (error) @@ -337,13 +345,14 @@ xfs_attr_set( /* * Commit the leaf transformation. We'll need another (linked) - * transaction to add the new attribute to the leaf. + * transaction to add the new attribute to the leaf, which + * means that we have to hold & join the leaf buffer here too. */ - error = xfs_trans_roll_inode(&args.trans, dp); if (error) goto out; - + xfs_trans_bjoin(args.trans, leaf_bp); + leaf_bp = NULL; } if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) @@ -374,8 +383,9 @@ xfs_attr_set( out_defer_cancel: xfs_defer_cancel(&dfops); - args.trans = NULL; out: + if (leaf_bp) + xfs_trans_brelse(args.trans, leaf_bp); if (args.trans) xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); @@ -707,7 +717,6 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) return error; out_defer_cancel: xfs_defer_cancel(args->dfops); - args->trans = NULL; return error; } @@ -760,7 +769,6 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) return 0; out_defer_cancel: xfs_defer_cancel(args->dfops); - args->trans = NULL; return error; } @@ -1035,7 +1043,6 @@ out: return retval; out_defer_cancel: xfs_defer_cancel(args->dfops); - args->trans = NULL; goto out; } @@ -1176,7 +1183,6 @@ out: return error; out_defer_cancel: xfs_defer_cancel(args->dfops); - args->trans = NULL; goto out; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 53cc8b986eac..2135b8e67dcc 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -247,14 +247,15 @@ xfs_attr3_leaf_hdr_to_disk( } } -static bool +static xfs_failaddr_t xfs_attr3_leaf_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_attr_leafblock *leaf = bp->b_addr; - struct xfs_perag *pag = bp->b_pag; - struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_perag *pag = bp->b_pag; + struct xfs_attr_leaf_entry *entries; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); @@ -262,17 +263,17 @@ xfs_attr3_leaf_verify( struct xfs_da3_node_hdr *hdr3 = bp->b_addr; if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) - return false; + return __this_address; if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return false; + return __this_address; } else { if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) - return false; + return __this_address; } /* * In recovery there is a transient state where count == 0 is valid @@ -280,12 +281,27 @@ xfs_attr3_leaf_verify( * if the attr didn't fit in shortform. */ if (pag && pag->pagf_init && ichdr.count == 0) - return false; + return __this_address; + + /* + * firstused is the block offset of the first name info structure. + * Make sure it doesn't go off the block or crash into the header. + */ + if (ichdr.firstused > mp->m_attr_geo->blksize) + return __this_address; + if (ichdr.firstused < xfs_attr3_leaf_hdr_size(leaf)) + return __this_address; + + /* Make sure the entries array doesn't crash into the name info. */ + entries = xfs_attr3_leaf_entryp(bp->b_addr); + if ((char *)&entries[ichdr.count] > + (char *)bp->b_addr + ichdr.firstused) + return __this_address; /* XXX: need to range check rest of attr header values */ /* XXX: hash order check? */ - return true; + return NULL; } static void @@ -293,12 +309,13 @@ xfs_attr3_leaf_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; + xfs_failaddr_t fa; - if (!xfs_attr3_leaf_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_attr3_leaf_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -322,21 +339,23 @@ xfs_attr3_leaf_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_attr3_leaf_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_attr3_leaf_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { .name = "xfs_attr3_leaf", .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, + .verify_struct = xfs_attr3_leaf_verify, }; int @@ -735,10 +754,13 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) } /* - * Convert from using the shortform to the leaf. + * Convert from using the shortform to the leaf. On success, return the + * buffer so that we can keep it locked until we're totally done with it. */ int -xfs_attr_shortform_to_leaf(xfs_da_args_t *args) +xfs_attr_shortform_to_leaf( + struct xfs_da_args *args, + struct xfs_buf **leaf_bp) { xfs_inode_t *dp; xfs_attr_shortform_t *sf; @@ -818,7 +840,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } error = 0; - + *leaf_bp = bp; out: kmem_free(tmpbuffer); return error; @@ -867,6 +889,80 @@ xfs_attr_shortform_allfit( return xfs_attr_shortform_bytesfit(dp, bytes); } +/* Verify the consistency of an inline attribute fork. */ +xfs_failaddr_t +xfs_attr_shortform_verify( + struct xfs_inode *ip) +{ + struct xfs_attr_shortform *sfp; + struct xfs_attr_sf_entry *sfep; + struct xfs_attr_sf_entry *next_sfep; + char *endp; + struct xfs_ifork *ifp; + int i; + int size; + + ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL); + ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); + sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; + size = ifp->if_bytes; + + /* + * Give up if the attribute is way too short. + */ + if (size < sizeof(struct xfs_attr_sf_hdr)) + return __this_address; + + endp = (char *)sfp + size; + + /* Check all reported entries */ + sfep = &sfp->list[0]; + for (i = 0; i < sfp->hdr.count; i++) { + /* + * struct xfs_attr_sf_entry has a variable length. + * Check the fixed-offset parts of the structure are + * within the data buffer. + */ + if (((char *)sfep + sizeof(*sfep)) >= endp) + return __this_address; + + /* Don't allow names with known bad length. */ + if (sfep->namelen == 0) + return __this_address; + + /* + * Check that the variable-length part of the structure is + * within the data buffer. The next entry starts after the + * name component, so nextentry is an acceptable test. + */ + next_sfep = XFS_ATTR_SF_NEXTENTRY(sfep); + if ((char *)next_sfep > endp) + return __this_address; + + /* + * Check for unknown flags. Short form doesn't support + * the incomplete or local bits, so we can use the namespace + * mask here. + */ + if (sfep->flags & ~XFS_ATTR_NSP_ONDISK_MASK) + return __this_address; + + /* + * Check for invalid namespace combinations. We only allow + * one namespace flag per xattr, so we can just count the + * bits (i.e. hweight) here. + */ + if (hweight8(sfep->flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) + return __this_address; + + sfep = next_sfep; + } + if ((void *)sfep != (void *)endp) + return __this_address; + + return NULL; +} + /* * Convert a leaf attribute list to shortform attribute list */ @@ -2170,7 +2266,8 @@ xfs_attr3_leaf_lookup_int( leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); - ASSERT(ichdr.count < args->geo->blksize / 8); + if (ichdr.count >= args->geo->blksize / 8) + return -EFSCORRUPTED; /* * Binary search. (note: small blocks will skip this loop) @@ -2186,8 +2283,10 @@ xfs_attr3_leaf_lookup_int( else break; } - ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count)); - ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval); + if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) + return -EFSCORRUPTED; + if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) + return -EFSCORRUPTED; /* * Since we may have duplicate hashval's, find the first matching diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index f7dda0c237b0..4da08af5b134 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -48,10 +48,12 @@ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); -int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, + struct xfs_buf **leaf_bp); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); +xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index d56caf037ca0..21be186067a2 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -65,7 +65,7 @@ xfs_attr3_rmt_blocks( * does CRC, location and bounds checking, the unpacking function checks the * attribute parameters and owner. */ -static bool +static xfs_failaddr_t xfs_attr3_rmt_hdr_ok( void *ptr, xfs_ino_t ino, @@ -76,19 +76,19 @@ xfs_attr3_rmt_hdr_ok( struct xfs_attr3_rmt_hdr *rmt = ptr; if (bno != be64_to_cpu(rmt->rm_blkno)) - return false; + return __this_address; if (offset != be32_to_cpu(rmt->rm_offset)) - return false; + return __this_address; if (size != be32_to_cpu(rmt->rm_bytes)) - return false; + return __this_address; if (ino != be64_to_cpu(rmt->rm_owner)) - return false; + return __this_address; /* ok */ - return true; + return NULL; } -static bool +static xfs_failaddr_t xfs_attr3_rmt_verify( struct xfs_mount *mp, void *ptr, @@ -98,27 +98,29 @@ xfs_attr3_rmt_verify( struct xfs_attr3_rmt_hdr *rmt = ptr; if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; + return __this_address; if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) - return false; + return __this_address; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(rmt->rm_blkno) != bno) - return false; + return __this_address; if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) - return false; + return __this_address; if (be32_to_cpu(rmt->rm_offset) + be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX) - return false; + return __this_address; if (rmt->rm_owner == 0) - return false; + return __this_address; - return true; + return NULL; } -static void -xfs_attr3_rmt_read_verify( - struct xfs_buf *bp) +static int +__xfs_attr3_rmt_read_verify( + struct xfs_buf *bp, + bool check_crc, + xfs_failaddr_t *failaddr) { struct xfs_mount *mp = bp->b_target->bt_mount; char *ptr; @@ -128,7 +130,7 @@ xfs_attr3_rmt_read_verify( /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; + return 0; ptr = bp->b_addr; bno = bp->b_bn; @@ -136,23 +138,48 @@ xfs_attr3_rmt_read_verify( ASSERT(len >= blksize); while (len > 0) { - if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) { - xfs_buf_ioerror(bp, -EFSBADCRC); - break; - } - if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - break; + if (check_crc && + !xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) { + *failaddr = __this_address; + return -EFSBADCRC; } + *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + if (*failaddr) + return -EFSCORRUPTED; len -= blksize; ptr += blksize; bno += BTOBB(blksize); } - if (bp->b_error) - xfs_verifier_error(bp); - else - ASSERT(len == 0); + if (len != 0) { + *failaddr = __this_address; + return -EFSCORRUPTED; + } + + return 0; +} + +static void +xfs_attr3_rmt_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + int error; + + error = __xfs_attr3_rmt_read_verify(bp, true, &fa); + if (error) + xfs_verifier_error(bp, error, fa); +} + +static xfs_failaddr_t +xfs_attr3_rmt_verify_struct( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + int error; + + error = __xfs_attr3_rmt_read_verify(bp, false, &fa); + return error ? fa : NULL; } static void @@ -160,6 +187,7 @@ xfs_attr3_rmt_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; int blksize = mp->m_attr_geo->blksize; char *ptr; int len; @@ -177,9 +205,9 @@ xfs_attr3_rmt_write_verify( while (len > 0) { struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; - if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -188,8 +216,7 @@ xfs_attr3_rmt_write_verify( * xfs_attr3_rmt_hdr_set() for the explanation. */ if (rmt->rm_lsn != cpu_to_be64(NULLCOMMITLSN)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); return; } xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF); @@ -198,13 +225,16 @@ xfs_attr3_rmt_write_verify( ptr += blksize; bno += BTOBB(blksize); } - ASSERT(len == 0); + + if (len != 0) + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); } const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { .name = "xfs_attr3_rmt", .verify_read = xfs_attr3_rmt_read_verify, .verify_write = xfs_attr3_rmt_write_verify, + .verify_struct = xfs_attr3_rmt_verify_struct, }; STATIC int @@ -269,7 +299,7 @@ xfs_attr_rmtval_copyout( byte_cnt = min(*valuelen, byte_cnt); if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset, + if (xfs_attr3_rmt_hdr_ok(src, ino, *offset, byte_cnt, bno)) { xfs_alert(mp, "remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 08df809e2315..daae00ed30c5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -400,7 +400,7 @@ xfs_bmap_check_leaf_extents( pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); XFS_WANT_CORRUPTED_GOTO(mp, - XFS_FSB_SANITY_CHECK(mp, bno), error0); + xfs_verify_fsbno(mp, bno), error0); if (bp_release) { bp_release = 0; xfs_trans_brelse(NULL, bp); @@ -1220,7 +1220,7 @@ xfs_iread_extents( pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); XFS_WANT_CORRUPTED_GOTO(mp, - XFS_FSB_SANITY_CHECK(mp, bno), out_brelse); + xfs_verify_fsbno(mp, bno), out_brelse); xfs_trans_brelse(tp, bp); } @@ -3337,6 +3337,49 @@ xfs_bmap_btalloc_filestreams( return 0; } +/* Update all inode and quota accounting for the allocation we just did. */ +static void +xfs_bmap_btalloc_accounting( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args) +{ + if (ap->flags & XFS_BMAPI_COWFORK) { + /* + * COW fork blocks are in-core only and thus are treated as + * in-core quota reservation (like delalloc blocks) even when + * converted to real blocks. The quota reservation is not + * accounted to disk until blocks are remapped to the data + * fork. So if these blocks were previously delalloc, we + * already have quota reservation and there's nothing to do + * yet. + */ + if (ap->wasdel) + return; + + /* + * Otherwise, we've allocated blocks in a hole. The transaction + * has acquired in-core quota reservation for this extent. + * Rather than account these as real blocks, however, we reduce + * the transaction quota reservation based on the allocation. + * This essentially transfers the transaction quota reservation + * to that of a delalloc extent. + */ + ap->ip->i_delayed_blks += args->len; + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS, + -(long)args->len); + return; + } + + /* data/attr fork only */ + ap->ip->i_d.di_nblocks += args->len; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= args->len; + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT, + args->len); +} + STATIC int xfs_bmap_btalloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ @@ -3347,6 +3390,8 @@ xfs_bmap_btalloc( xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ xfs_agnumber_t ag; xfs_alloc_arg_t args; + xfs_fileoff_t orig_offset; + xfs_extlen_t orig_length; xfs_extlen_t blen; xfs_extlen_t nextminlen = 0; int nullfb; /* true if ap->firstblock isn't set */ @@ -3356,6 +3401,8 @@ xfs_bmap_btalloc( int stripe_align; ASSERT(ap->length); + orig_offset = ap->offset; + orig_length = ap->length; mp = ap->ip->i_mount; @@ -3571,19 +3618,23 @@ xfs_bmap_btalloc( *ap->firstblock = args.fsbno; ASSERT(nullfb || fb_agno <= args.agno); ap->length = args.len; - if (!(ap->flags & XFS_BMAPI_COWFORK)) - ap->ip->i_d.di_nblocks += args.len; - xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - if (ap->wasdel) - ap->ip->i_delayed_blks -= args.len; /* - * Adjust the disk quota also. This was reserved - * earlier. + * If the extent size hint is active, we tried to round the + * caller's allocation request offset down to extsz and the + * length up to another extsz boundary. If we found a free + * extent we mapped it in starting at this new offset. If the + * newly mapped space isn't long enough to cover any of the + * range of offsets that was originally requested, move the + * mapping up so that we can fill as much of the caller's + * original request as possible. Free space is apparently + * very fragmented so we're unlikely to be able to satisfy the + * hints anyway. */ - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : - XFS_TRANS_DQ_BCOUNT, - (long) args.len); + if (ap->length <= orig_length) + ap->offset = orig_offset; + else if (ap->offset + ap->length < orig_offset + orig_length) + ap->offset = orig_offset + orig_length - ap->length; + xfs_bmap_btalloc_accounting(ap, &args); } else { ap->blkno = NULLFSBLOCK; ap->length = 0; @@ -3876,8 +3927,6 @@ xfs_bmapi_reserve_delalloc( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_extlen_t alen; xfs_extlen_t indlen; - char rt = XFS_IS_REALTIME_INODE(ip); - xfs_extlen_t extsz; int error; xfs_fileoff_t aoff = off; @@ -3892,31 +3941,25 @@ xfs_bmapi_reserve_delalloc( prealloc = alen - len; /* Figure out the extent size, adjust alen */ - if (whichfork == XFS_COW_FORK) - extsz = xfs_get_cowextsz_hint(ip); - else - extsz = xfs_get_extsz_hint(ip); - if (extsz) { + if (whichfork == XFS_COW_FORK) { struct xfs_bmbt_irec prev; + xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) prev.br_startoff = NULLFILEOFF; - error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof, + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, 1, 0, &aoff, &alen); ASSERT(!error); } - if (rt) - extsz = alen / mp->m_sb.sb_rextsize; - /* * Make a transaction-less quota reservation for delayed allocation * blocks. This number gets adjusted later. We return if we haven't * allocated blocks already inside this loop. */ error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, - rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + XFS_QMOPT_RES_REGBLKS); if (error) return error; @@ -3927,12 +3970,7 @@ xfs_bmapi_reserve_delalloc( indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); ASSERT(indlen > 0); - if (rt) { - error = xfs_mod_frextents(mp, -((int64_t)extsz)); - } else { - error = xfs_mod_fdblocks(mp, -((int64_t)alen), false); - } - + error = xfs_mod_fdblocks(mp, -((int64_t)alen), false); if (error) goto out_unreserve_quota; @@ -3963,14 +4001,11 @@ xfs_bmapi_reserve_delalloc( return 0; out_unreserve_blocks: - if (rt) - xfs_mod_frextents(mp, extsz); - else - xfs_mod_fdblocks(mp, alen, false); + xfs_mod_fdblocks(mp, alen, false); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) - xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? - XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, + XFS_QMOPT_RES_REGBLKS); return error; } @@ -4304,8 +4339,16 @@ xfs_bmapi_write( while (bno < end && n < *nmap) { bool need_alloc = false, wasdelay = false; - /* in hole or beyoned EOF? */ + /* in hole or beyond EOF? */ if (eof || bma.got.br_startoff > bno) { + /* + * CoW fork conversions should /never/ hit EOF or + * holes. There should always be something for us + * to work on. + */ + ASSERT(!((flags & XFS_BMAPI_CONVERT) && + (flags & XFS_BMAPI_COWFORK))); + if (flags & XFS_BMAPI_DELALLOC) { /* * For the COW fork we can reasonably get a @@ -4824,6 +4867,7 @@ xfs_bmap_del_extent_cow( xfs_iext_insert(ip, icur, &new, state); break; } + ip->i_delayed_blks -= del->br_blockcount; } /* @@ -5136,7 +5180,7 @@ __xfs_bunmapi( * blowing out the transaction with a mix of EFIs and reflink * adjustments. */ - if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) + if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); else max_len = len; @@ -5662,7 +5706,8 @@ xfs_bmap_collapse_extents( *done = true; goto del_cursor; } - XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock), + del_cursor); new_startoff = got.br_startoff - offset_shift_fsb; if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) { @@ -5767,7 +5812,8 @@ xfs_bmap_insert_extents( goto del_cursor; } } - XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock), + del_cursor); if (stop_fsb >= got.br_startoff + got.br_blockcount) { error = -EIO; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index c10aecaaae44..9faf479aba49 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -425,33 +425,29 @@ xfs_bmbt_diff_two_keys( be64_to_cpu(k2->bmbt.br_startoff); } -static bool +static xfs_failaddr_t xfs_bmbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; unsigned int level; switch (block->bb_magic) { case cpu_to_be32(XFS_BMAP_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn) - return false; /* * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. */ - if (be64_to_cpu(block->bb_u.l.bb_owner) == 0) - return false; + fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; /* fall through */ case cpu_to_be32(XFS_BMAP_MAGIC): break; default: - return false; + return __this_address; } /* @@ -463,46 +459,39 @@ xfs_bmbt_verify( */ level = be16_to_cpu(block->bb_level); if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1])) - return false; - if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) - return false; - - /* sibling pointer verification */ - if (!block->bb_u.l.bb_leftsib || - (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))) - return false; - if (!block->bb_u.l.bb_rightsib || - (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))) - return false; - - return true; + return __this_address; + + return xfs_btree_lblock_verify(bp, mp->m_bmap_dmxr[level != 0]); } static void xfs_bmbt_read_verify( struct xfs_buf *bp) { + xfs_failaddr_t fa; + if (!xfs_btree_lblock_verify_crc(bp)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_bmbt_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_bmbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } - if (bp->b_error) { + if (bp->b_error) trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_verifier_error(bp); - } } static void xfs_bmbt_write_verify( struct xfs_buf *bp) { - if (!xfs_bmbt_verify(bp)) { + xfs_failaddr_t fa; + + fa = xfs_bmbt_verify(bp); + if (fa) { trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } xfs_btree_lblock_calc_crc(bp); @@ -512,6 +501,7 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = { .name = "xfs_bmbt", .verify_read = xfs_bmbt_read_verify, .verify_write = xfs_bmbt_write_verify, + .verify_struct = xfs_bmbt_verify, }; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5f33adf8eecb..79ee4a1951d1 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -273,7 +273,7 @@ xfs_btree_lblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) return; @@ -311,7 +311,7 @@ xfs_btree_sblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) return; @@ -329,7 +329,7 @@ xfs_btree_sblock_verify_crc( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) - return false; + return __this_address; return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); } @@ -853,7 +853,7 @@ xfs_btree_read_bufl( xfs_daddr_t d; /* real disk block address */ int error; - if (!XFS_FSB_SANITY_CHECK(mp, fsbno)) + if (!xfs_verify_fsbno(mp, fsbno)) return -EFSCORRUPTED; d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, @@ -4529,6 +4529,51 @@ xfs_btree_change_owner( &bbcoi); } +/* Verify the v5 fields of a long-format btree block. */ +xfs_failaddr_t +xfs_btree_lblock_v5hdr_verify( + struct xfs_buf *bp, + uint64_t owner) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return __this_address; + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn)) + return __this_address; + if (owner != XFS_RMAP_OWN_UNKNOWN && + be64_to_cpu(block->bb_u.l.bb_owner) != owner) + return __this_address; + return NULL; +} + +/* Verify a long-format btree block. */ +xfs_failaddr_t +xfs_btree_lblock_verify( + struct xfs_buf *bp, + unsigned int max_recs) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > max_recs) + return __this_address; + + /* sibling pointer verification */ + if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && + !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))) + return __this_address; + if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && + !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))) + return __this_address; + + return NULL; +} + /** * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format * btree block @@ -4537,7 +4582,7 @@ xfs_btree_change_owner( * @max_recs: pointer to the m_*_mxr max records field in the xfs mount * @pag_max_level: pointer to the per-ag max level field */ -bool +xfs_failaddr_t xfs_btree_sblock_v5hdr_verify( struct xfs_buf *bp) { @@ -4546,14 +4591,14 @@ xfs_btree_sblock_v5hdr_verify( struct xfs_perag *pag = bp->b_pag; if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; + return __this_address; if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; + return __this_address; if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) - return false; - return true; + return __this_address; + return NULL; } /** @@ -4562,29 +4607,29 @@ xfs_btree_sblock_v5hdr_verify( * @bp: buffer containing the btree block * @max_recs: maximum records allowed in this btree node */ -bool +xfs_failaddr_t xfs_btree_sblock_verify( struct xfs_buf *bp, unsigned int max_recs) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_agblock_t agno; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) - return false; + return __this_address; /* sibling pointer verification */ - if (!block->bb_u.s.bb_leftsib || - (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) - return false; - if (!block->bb_u.s.bb_rightsib || - (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) - return false; + agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); + if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && + !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib))) + return __this_address; + if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && + !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib))) + return __this_address; - return true; + return NULL; } /* @@ -4953,3 +4998,33 @@ xfs_btree_diff_two_ptrs( return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); } + +/* If there's an extent, we're done. */ +STATIC int +xfs_btree_has_record_helper( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + void *priv) +{ + return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* Is there a record covering a given range of keys? */ +int +xfs_btree_has_record( + struct xfs_btree_cur *cur, + union xfs_btree_irec *low, + union xfs_btree_irec *high, + bool *exists) +{ + int error; + + error = xfs_btree_query_range(cur, low, high, + &xfs_btree_has_record_helper, NULL); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + *exists = true; + return 0; + } + *exists = false; + return error; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index b57501c6f71d..50440b5618e8 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -473,10 +473,6 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) -#define XFS_FSB_SANITY_CHECK(mp,fsb) \ - (fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ - XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) - /* * Trace hooks. Currently not implemented as they need to be ported * over to the generic tracing functionality, which is some effort. @@ -496,8 +492,14 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_BTREE_TRACE_ARGR(c, r) #define XFS_BTREE_TRACE_CURSOR(c, t) -bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); -bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); +xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); +xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp, + unsigned int max_recs); +xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp, + uint64_t owner); +xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, + unsigned int max_recs); + uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, unsigned long len); xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits, @@ -545,5 +547,7 @@ void xfs_btree_get_keys(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key); union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); +int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, + union xfs_btree_irec *high, bool *exists); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 651611530d2f..ea187b4a7991 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -128,7 +128,7 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_zone_free(xfs_da_state_zone, state); } -static bool +static xfs_failaddr_t xfs_da3_node_verify( struct xfs_buf *bp) { @@ -145,24 +145,24 @@ xfs_da3_node_verify( struct xfs_da3_node_hdr *hdr3 = bp->b_addr; if (ichdr.magic != XFS_DA3_NODE_MAGIC) - return false; + return __this_address; if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return false; + return __this_address; } else { if (ichdr.magic != XFS_DA_NODE_MAGIC) - return false; + return __this_address; } if (ichdr.level == 0) - return false; + return __this_address; if (ichdr.level > XFS_DA_NODE_MAXDEPTH) - return false; + return __this_address; if (ichdr.count == 0) - return false; + return __this_address; /* * we don't know if the node is for and attribute or directory tree, @@ -170,11 +170,11 @@ xfs_da3_node_verify( */ if (ichdr.count > mp->m_dir_geo->node_ents && ichdr.count > mp->m_attr_geo->node_ents) - return false; + return __this_address; /* XXX: hash order check? */ - return true; + return NULL; } static void @@ -182,12 +182,13 @@ xfs_da3_node_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + xfs_failaddr_t fa; - if (!xfs_da3_node_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_da3_node_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -211,19 +212,20 @@ xfs_da3_node_read_verify( struct xfs_buf *bp) { struct xfs_da_blkinfo *info = bp->b_addr; + xfs_failaddr_t fa; switch (be16_to_cpu(info->magic)) { case XFS_DA3_NODE_MAGIC: if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) { - xfs_buf_ioerror(bp, -EFSBADCRC); + xfs_verifier_error(bp, -EFSBADCRC, + __this_address); break; } /* fall through */ case XFS_DA_NODE_MAGIC: - if (!xfs_da3_node_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - break; - } + fa = xfs_da3_node_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; case XFS_ATTR_LEAF_MAGIC: case XFS_ATTR3_LEAF_MAGIC: @@ -236,18 +238,40 @@ xfs_da3_node_read_verify( bp->b_ops->verify_read(bp); return; default: - xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); break; } +} + +/* Verify the structure of a da3 block. */ +static xfs_failaddr_t +xfs_da3_node_verify_struct( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; - /* corrupt block */ - xfs_verifier_error(bp); + switch (be16_to_cpu(info->magic)) { + case XFS_DA3_NODE_MAGIC: + case XFS_DA_NODE_MAGIC: + return xfs_da3_node_verify(bp); + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + bp->b_ops = &xfs_attr3_leaf_buf_ops; + return bp->b_ops->verify_struct(bp); + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + bp->b_ops = &xfs_dir3_leafn_buf_ops; + return bp->b_ops->verify_struct(bp); + default: + return __this_address; + } } const struct xfs_buf_ops xfs_da3_node_buf_ops = { .name = "xfs_da3_node", .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, + .verify_struct = xfs_da3_node_verify_struct, }; int diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 3771edcb301d..7e77299b7789 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -875,4 +875,10 @@ struct xfs_attr3_rmt_hdr { ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ sizeof(struct xfs_attr3_rmt_hdr) : 0)) +/* Number of bytes in a directory block. */ +static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) +{ + return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog); +} + #endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 072ebfe1d6ae..087fea02c389 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -249,6 +249,10 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); + /* Hold the (previously bjoin'd) buffer locked across the roll. */ + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) + xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]); + trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); /* Roll the transaction. */ @@ -264,6 +268,12 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); + /* Rejoin the buffers and dirty them so the log moves forward. */ + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) { + xfs_trans_bjoin(*tp, dop->dop_bufs[i]); + xfs_trans_bhold(*tp, dop->dop_bufs[i]); + } + return error; } @@ -295,6 +305,31 @@ xfs_defer_ijoin( } } + ASSERT(0); + return -EFSCORRUPTED; +} + +/* + * Add this buffer to the deferred op. Each joined buffer is relogged + * each time we roll the transaction. + */ +int +xfs_defer_bjoin( + struct xfs_defer_ops *dop, + struct xfs_buf *bp) +{ + int i; + + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS; i++) { + if (dop->dop_bufs[i] == bp) + return 0; + else if (dop->dop_bufs[i] == NULL) { + dop->dop_bufs[i] = bp; + return 0; + } + } + + ASSERT(0); return -EFSCORRUPTED; } @@ -493,9 +528,7 @@ xfs_defer_init( struct xfs_defer_ops *dop, xfs_fsblock_t *fbp) { - dop->dop_committed = false; - dop->dop_low = false; - memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes)); + memset(dop, 0, sizeof(struct xfs_defer_ops)); *fbp = NULLFSBLOCK; INIT_LIST_HEAD(&dop->dop_intake); INIT_LIST_HEAD(&dop->dop_pending); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index d4f046dd44bd..045beacdd37d 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -59,6 +59,7 @@ enum xfs_defer_ops_type { }; #define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ +#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ struct xfs_defer_ops { bool dop_committed; /* did any trans commit? */ @@ -66,8 +67,9 @@ struct xfs_defer_ops { struct list_head dop_intake; /* unlogged pending work */ struct list_head dop_pending; /* logged pending work */ - /* relog these inodes with each roll */ + /* relog these with each roll */ struct xfs_inode *dop_inodes[XFS_DEFER_OPS_NR_INODES]; + struct xfs_buf *dop_bufs[XFS_DEFER_OPS_NR_BUFS]; }; void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type, @@ -77,6 +79,7 @@ void xfs_defer_cancel(struct xfs_defer_ops *dop); void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp); bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop); int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip); +int xfs_defer_bjoin(struct xfs_defer_ops *dop, struct xfs_buf *bp); /* Description of a deferred type. */ struct xfs_defer_op_type { diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index e10778c102ea..92f94e190f04 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -119,8 +119,7 @@ xfs_da_mount( ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); - ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= - XFS_MAX_BLOCKSIZE); + ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE); mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL); mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL); @@ -140,7 +139,7 @@ xfs_da_mount( dageo = mp->m_dir_geo; dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog; dageo->fsblog = mp->m_sb.sb_blocklog; - dageo->blksize = 1 << dageo->blklog; + dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb); dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; /* diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 1a8f2cf977ca..388d67c5c903 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -340,5 +340,7 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) #define XFS_READDIR_BUFSIZE (32768) unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype); +void *xfs_dir3_data_endp(struct xfs_da_geometry *geo, + struct xfs_dir2_data_hdr *hdr); #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 43c902f7a68d..2da86a394bcf 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -58,7 +58,7 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } -static bool +static xfs_failaddr_t xfs_dir3_block_verify( struct xfs_buf *bp) { @@ -67,20 +67,18 @@ xfs_dir3_block_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) - return false; + return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) - return false; + return __this_address; } else { if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) - return false; + return __this_address; } - if (__xfs_dir3_data_check(NULL, bp)) - return false; - return true; + return __xfs_dir3_data_check(NULL, bp); } static void @@ -88,15 +86,16 @@ xfs_dir3_block_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dir3_block_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_dir3_block_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void @@ -104,12 +103,13 @@ xfs_dir3_block_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + xfs_failaddr_t fa; - if (!xfs_dir3_block_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_dir3_block_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -126,6 +126,7 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .name = "xfs_dir3_block", .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, + .verify_struct = xfs_dir3_block_verify, }; int diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 8727a43115ef..920279485275 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -36,9 +36,9 @@ /* * Check the consistency of the data block. * The input can also be a block-format directory. - * Return 0 is the buffer is good, otherwise an error. + * Return NULL if the buffer is good, otherwise the address of the error. */ -int +xfs_failaddr_t __xfs_dir3_data_check( struct xfs_inode *dp, /* incore inode pointer */ struct xfs_buf *bp) /* data block's buffer */ @@ -73,6 +73,14 @@ __xfs_dir3_data_check( */ ops = xfs_dir_get_ops(mp, dp); + /* + * If this isn't a directory, or we don't get handed the dir ops, + * something is seriously wrong. Bail out. + */ + if ((dp && !S_ISDIR(VFS_I(dp)->i_mode)) || + ops != xfs_dir_get_ops(mp, NULL)) + return __this_address; + hdr = bp->b_addr; p = (char *)ops->data_entry_p(hdr); @@ -81,7 +89,6 @@ __xfs_dir3_data_check( case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): btp = xfs_dir2_block_tail_p(geo, hdr); lep = xfs_dir2_block_leaf_p(btp); - endp = (char *)lep; /* * The number of leaf entries is limited by the size of the @@ -90,17 +97,19 @@ __xfs_dir3_data_check( * so just ensure that the count falls somewhere inside the * block right now. */ - XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) < - ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)); + if (be32_to_cpu(btp->count) >= + ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)) + return __this_address; break; case cpu_to_be32(XFS_DIR3_DATA_MAGIC): case cpu_to_be32(XFS_DIR2_DATA_MAGIC): - endp = (char *)hdr + geo->blksize; break; default: - XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; + return __this_address; } + endp = xfs_dir3_data_endp(geo, hdr); + if (!endp) + return __this_address; /* * Account for zero bestfree entries. @@ -108,22 +117,25 @@ __xfs_dir3_data_check( bf = ops->data_bestfree_p(hdr); count = lastfree = freeseen = 0; if (!bf[0].length) { - XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset); + if (bf[0].offset) + return __this_address; freeseen |= 1 << 0; } if (!bf[1].length) { - XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset); + if (bf[1].offset) + return __this_address; freeseen |= 1 << 1; } if (!bf[2].length) { - XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset); + if (bf[2].offset) + return __this_address; freeseen |= 1 << 2; } - XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >= - be16_to_cpu(bf[1].length)); - XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >= - be16_to_cpu(bf[2].length)); + if (be16_to_cpu(bf[0].length) < be16_to_cpu(bf[1].length)) + return __this_address; + if (be16_to_cpu(bf[1].length) < be16_to_cpu(bf[2].length)) + return __this_address; /* * Loop over the data/unused entries. */ @@ -135,22 +147,23 @@ __xfs_dir3_data_check( * doesn't need to be there. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0); - XFS_WANT_CORRUPTED_RETURN(mp, endp >= - p + be16_to_cpu(dup->length)); - XFS_WANT_CORRUPTED_RETURN(mp, - be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == - (char *)dup - (char *)hdr); + if (lastfree != 0) + return __this_address; + if (endp < p + be16_to_cpu(dup->length)) + return __this_address; + if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) != + (char *)dup - (char *)hdr) + return __this_address; dfp = xfs_dir2_data_freefind(hdr, bf, dup); if (dfp) { i = (int)(dfp - bf); - XFS_WANT_CORRUPTED_RETURN(mp, - (freeseen & (1 << i)) == 0); + if ((freeseen & (1 << i)) != 0) + return __this_address; freeseen |= 1 << i; } else { - XFS_WANT_CORRUPTED_RETURN(mp, - be16_to_cpu(dup->length) <= - be16_to_cpu(bf[2].length)); + if (be16_to_cpu(dup->length) > + be16_to_cpu(bf[2].length)) + return __this_address; } p += be16_to_cpu(dup->length); lastfree = 1; @@ -163,16 +176,17 @@ __xfs_dir3_data_check( * The linear search is crude but this is DEBUG code. */ dep = (xfs_dir2_data_entry_t *)p; - XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0); - XFS_WANT_CORRUPTED_RETURN(mp, - !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); - XFS_WANT_CORRUPTED_RETURN(mp, endp >= - p + ops->data_entsize(dep->namelen)); - XFS_WANT_CORRUPTED_RETURN(mp, - be16_to_cpu(*ops->data_entry_tag_p(dep)) == - (char *)dep - (char *)hdr); - XFS_WANT_CORRUPTED_RETURN(mp, - ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX); + if (dep->namelen == 0) + return __this_address; + if (xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))) + return __this_address; + if (endp < p + ops->data_entsize(dep->namelen)) + return __this_address; + if (be16_to_cpu(*ops->data_entry_tag_p(dep)) != + (char *)dep - (char *)hdr) + return __this_address; + if (ops->data_get_ftype(dep) >= XFS_DIR3_FT_MAX) + return __this_address; count++; lastfree = 0; if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || @@ -188,34 +202,52 @@ __xfs_dir3_data_check( be32_to_cpu(lep[i].hashval) == hash) break; } - XFS_WANT_CORRUPTED_RETURN(mp, - i < be32_to_cpu(btp->count)); + if (i >= be32_to_cpu(btp->count)) + return __this_address; } p += ops->data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. */ - XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7); + if (freeseen != 7) + return __this_address; if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { if (lep[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; - if (i > 0) - XFS_WANT_CORRUPTED_RETURN(mp, - be32_to_cpu(lep[i].hashval) >= - be32_to_cpu(lep[i - 1].hashval)); + if (i > 0 && be32_to_cpu(lep[i].hashval) < + be32_to_cpu(lep[i - 1].hashval)) + return __this_address; } - XFS_WANT_CORRUPTED_RETURN(mp, count == - be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); - XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale)); + if (count != be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)) + return __this_address; + if (stale != be32_to_cpu(btp->stale)) + return __this_address; } - return 0; + return NULL; +} + +#ifdef DEBUG +void +xfs_dir3_data_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = __xfs_dir3_data_check(dp, bp); + if (!fa) + return; + xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount, + bp->b_addr, __FILE__, __LINE__, fa); + ASSERT(0); } +#endif -static bool +static xfs_failaddr_t xfs_dir3_data_verify( struct xfs_buf *bp) { @@ -224,20 +256,18 @@ xfs_dir3_data_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) - return false; + return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) - return false; + return __this_address; } else { if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) - return false; + return __this_address; } - if (__xfs_dir3_data_check(NULL, bp)) - return false; - return true; + return __xfs_dir3_data_check(NULL, bp); } /* @@ -263,8 +293,7 @@ xfs_dir3_data_reada_verify( bp->b_ops->verify_read(bp); return; default: - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); break; } } @@ -274,15 +303,16 @@ xfs_dir3_data_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dir3_data_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_dir3_data_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void @@ -290,12 +320,13 @@ xfs_dir3_data_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + xfs_failaddr_t fa; - if (!xfs_dir3_data_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_dir3_data_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -312,6 +343,7 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = { .name = "xfs_dir3_data", .verify_read = xfs_dir3_data_read_verify, .verify_write = xfs_dir3_data_write_verify, + .verify_struct = xfs_dir3_data_verify, }; static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { @@ -515,7 +547,6 @@ xfs_dir2_data_freescan_int( struct xfs_dir2_data_hdr *hdr, int *loghead) { - xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* active data entry */ xfs_dir2_data_unused_t *dup; /* unused data entry */ struct xfs_dir2_data_free *bf; @@ -537,12 +568,7 @@ xfs_dir2_data_freescan_int( * Set up pointers. */ p = (char *)ops->data_entry_p(hdr); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { - btp = xfs_dir2_block_tail_p(geo, hdr); - endp = (char *)xfs_dir2_block_leaf_p(btp); - } else - endp = (char *)hdr + geo->blksize; + endp = xfs_dir3_data_endp(geo, hdr); /* * Loop over the block's entries. */ @@ -755,17 +781,9 @@ xfs_dir2_data_make_free( /* * Figure out where the end of the data area is. */ - if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)) - endptr = (char *)hdr + args->geo->blksize; - else { - xfs_dir2_block_tail_t *btp; /* block tail */ + endptr = xfs_dir3_data_endp(args->geo, hdr); + ASSERT(endptr != NULL); - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - btp = xfs_dir2_block_tail_p(args->geo, hdr); - endptr = (char *)xfs_dir2_block_leaf_p(btp); - } /* * If this isn't the start of the block, then back up to * the previous entry and see if it's free. @@ -1067,3 +1085,21 @@ xfs_dir2_data_use_free( } *needscanp = needscan; } + +/* Find the end of the entry data in a data/block format dir block. */ +void * +xfs_dir3_data_endp( + struct xfs_da_geometry *geo, + struct xfs_dir2_data_hdr *hdr) +{ + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + return xfs_dir2_block_leaf_p(xfs_dir2_block_tail_p(geo, hdr)); + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + return (char *)hdr + geo->blksize; + default: + return NULL; + } +} diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 27297a689d9c..d7e630f41f9c 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -50,13 +50,7 @@ static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args, * Pop an assert if something is wrong. */ #ifdef DEBUG -#define xfs_dir3_leaf_check(dp, bp) \ -do { \ - if (!xfs_dir3_leaf1_check((dp), (bp))) \ - ASSERT(0); \ -} while (0); - -STATIC bool +static xfs_failaddr_t xfs_dir3_leaf1_check( struct xfs_inode *dp, struct xfs_buf *bp) @@ -69,17 +63,32 @@ xfs_dir3_leaf1_check( if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) - return false; + return __this_address; } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) - return false; + return __this_address; return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); } + +static inline void +xfs_dir3_leaf_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_dir3_leaf1_check(dp, bp); + if (!fa) + return; + xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount, + bp->b_addr, __FILE__, __LINE__, fa); + ASSERT(0); +} #else #define xfs_dir3_leaf_check(dp, bp) #endif -bool +xfs_failaddr_t xfs_dir3_leaf_check_int( struct xfs_mount *mp, struct xfs_inode *dp, @@ -114,27 +123,27 @@ xfs_dir3_leaf_check_int( * We can deduce a value for that from di_size. */ if (hdr->count > ops->leaf_max_ents(geo)) - return false; + return __this_address; /* Leaves and bests don't overlap in leaf format. */ if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || hdr->magic == XFS_DIR3_LEAF1_MAGIC) && (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) - return false; + return __this_address; /* Check hash value order, count stale entries. */ for (i = stale = 0; i < hdr->count; i++) { if (i + 1 < hdr->count) { if (be32_to_cpu(ents[i].hashval) > be32_to_cpu(ents[i + 1].hashval)) - return false; + return __this_address; } if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } if (hdr->stale != stale) - return false; - return true; + return __this_address; + return NULL; } /* @@ -142,7 +151,7 @@ xfs_dir3_leaf_check_int( * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due * to incorrect magic numbers. */ -static bool +static xfs_failaddr_t xfs_dir3_leaf_verify( struct xfs_buf *bp, uint16_t magic) @@ -160,16 +169,16 @@ xfs_dir3_leaf_verify( : XFS_DIR3_LEAFN_MAGIC; if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) - return false; + return __this_address; if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn))) - return false; + return __this_address; } else { if (leaf->hdr.info.magic != cpu_to_be16(magic)) - return false; + return __this_address; } return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); @@ -181,15 +190,16 @@ __read_verify( uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dir3_leaf_verify(bp, magic)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_dir3_leaf_verify(bp, magic); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void @@ -198,12 +208,13 @@ __write_verify( uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; + xfs_failaddr_t fa; - if (!xfs_dir3_leaf_verify(bp, magic)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_dir3_leaf_verify(bp, magic); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -216,6 +227,13 @@ __write_verify( xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); } +static xfs_failaddr_t +xfs_dir3_leaf1_verify( + struct xfs_buf *bp) +{ + return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC); +} + static void xfs_dir3_leaf1_read_verify( struct xfs_buf *bp) @@ -230,6 +248,13 @@ xfs_dir3_leaf1_write_verify( __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); } +static xfs_failaddr_t +xfs_dir3_leafn_verify( + struct xfs_buf *bp) +{ + return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC); +} + static void xfs_dir3_leafn_read_verify( struct xfs_buf *bp) @@ -248,12 +273,14 @@ const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { .name = "xfs_dir3_leaf1", .verify_read = xfs_dir3_leaf1_read_verify, .verify_write = xfs_dir3_leaf1_write_verify, + .verify_struct = xfs_dir3_leaf1_verify, }; const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { .name = "xfs_dir3_leafn", .verify_read = xfs_dir3_leafn_read_verify, .verify_write = xfs_dir3_leafn_write_verify, + .verify_struct = xfs_dir3_leafn_verify, }; int diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 682e2bf370c7..239d97a64296 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -53,13 +53,7 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args, * Check internal consistency of a leafn block. */ #ifdef DEBUG -#define xfs_dir3_leaf_check(dp, bp) \ -do { \ - if (!xfs_dir3_leafn_check((dp), (bp))) \ - ASSERT(0); \ -} while (0); - -static bool +static xfs_failaddr_t xfs_dir3_leafn_check( struct xfs_inode *dp, struct xfs_buf *bp) @@ -72,17 +66,32 @@ xfs_dir3_leafn_check( if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) - return false; + return __this_address; } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) - return false; + return __this_address; return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); } + +static inline void +xfs_dir3_leaf_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_dir3_leafn_check(dp, bp); + if (!fa) + return; + xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount, + bp->b_addr, __FILE__, __LINE__, fa); + ASSERT(0); +} #else #define xfs_dir3_leaf_check(dp, bp) #endif -static bool +static xfs_failaddr_t xfs_dir3_free_verify( struct xfs_buf *bp) { @@ -93,21 +102,21 @@ xfs_dir3_free_verify( struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) - return false; + return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) - return false; + return __this_address; } else { if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) - return false; + return __this_address; } /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ - return true; + return NULL; } static void @@ -115,15 +124,16 @@ xfs_dir3_free_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dir3_free_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_dir3_free_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void @@ -131,12 +141,13 @@ xfs_dir3_free_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + xfs_failaddr_t fa; - if (!xfs_dir3_free_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_dir3_free_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -153,10 +164,11 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = { .name = "xfs_dir3_free", .verify_read = xfs_dir3_free_read_verify, .verify_write = xfs_dir3_free_write_verify, + .verify_struct = xfs_dir3_free_verify, }; /* Everything ok in the free block header? */ -static bool +static xfs_failaddr_t xfs_dir3_free_header_check( struct xfs_inode *dp, xfs_dablk_t fbno, @@ -174,22 +186,22 @@ xfs_dir3_free_header_check( struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; if (be32_to_cpu(hdr3->firstdb) != firstdb) - return false; + return __this_address; if (be32_to_cpu(hdr3->nvalid) > maxbests) - return false; + return __this_address; if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) - return false; + return __this_address; } else { struct xfs_dir2_free_hdr *hdr = bp->b_addr; if (be32_to_cpu(hdr->firstdb) != firstdb) - return false; + return __this_address; if (be32_to_cpu(hdr->nvalid) > maxbests) - return false; + return __this_address; if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused)) - return false; + return __this_address; } - return true; + return NULL; } static int @@ -200,6 +212,7 @@ __xfs_dir3_free_read( xfs_daddr_t mappedbno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, @@ -208,9 +221,9 @@ __xfs_dir3_free_read( return err; /* Check things that we can't do in the verifier. */ - if (!xfs_dir3_free_header_check(dp, fbno, *bpp)) { - xfs_buf_ioerror(*bpp, -EFSCORRUPTED); - xfs_verifier_error(*bpp); + fa = xfs_dir3_free_header_check(dp, fbno, *bpp); + if (fa) { + xfs_verifier_error(*bpp, -EFSCORRUPTED, fa); xfs_trans_brelse(tp, *bpp); return -EFSCORRUPTED; } @@ -1906,7 +1919,7 @@ xfs_dir2_node_addname_int( (unsigned long long)ifbno, lastfbno); if (fblk) { xfs_alert(mp, - " fblk 0x%p blkno %llu index %d magic 0x%x", + " fblk "PTR_FMT" blkno %llu index %d magic 0x%x", fblk, (unsigned long long)fblk->blkno, fblk->index, diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 4badd26c47e6..753aeeeffc18 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -39,12 +39,13 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, /* xfs_dir2_data.c */ #ifdef DEBUG -#define xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp); +extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); #else #define xfs_dir3_data_check(dp,bp) #endif -extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp, + struct xfs_buf *bp); extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, @@ -89,8 +90,9 @@ xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, int lowstale, int highstale, int *lfloglow, int *lfloghigh); extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); -extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp, - struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); +extern xfs_failaddr_t xfs_dir3_leaf_check_int(struct xfs_mount *mp, + struct xfs_inode *dp, struct xfs_dir3_icleaf_hdr *hdr, + struct xfs_dir2_leaf *leaf); /* xfs_dir2_node.c */ extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, @@ -127,7 +129,7 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); -extern int xfs_dir2_sf_verify(struct xfs_inode *ip); +extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index be8b9755f66a..0c75a7f00883 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -156,7 +156,6 @@ xfs_dir2_block_to_sf( xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ { xfs_dir2_data_hdr_t *hdr; /* block header */ - xfs_dir2_block_tail_t *btp; /* block tail pointer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused data pointer */ @@ -192,9 +191,8 @@ xfs_dir2_block_to_sf( /* * Set up to loop over the block's entries. */ - btp = xfs_dir2_block_tail_p(args->geo, hdr); ptr = (char *)dp->d_ops->data_entry_p(hdr); - endptr = (char *)xfs_dir2_block_leaf_p(btp); + endptr = xfs_dir3_data_endp(args->geo, hdr); sfep = xfs_dir2_sf_firstentry(sfp); /* * Loop over the active and unused entries. @@ -630,7 +628,7 @@ xfs_dir2_sf_check( #endif /* DEBUG */ /* Verify the consistency of an inline directory. */ -int +xfs_failaddr_t xfs_dir2_sf_verify( struct xfs_inode *ip) { @@ -665,7 +663,7 @@ xfs_dir2_sf_verify( */ if (size <= offsetof(struct xfs_dir2_sf_hdr, parent) || size < xfs_dir2_sf_hdr_size(sfp->i8count)) - return -EFSCORRUPTED; + return __this_address; endp = (char *)sfp + size; @@ -674,7 +672,7 @@ xfs_dir2_sf_verify( i8count = ino > XFS_DIR2_MAX_SHORT_INUM; error = xfs_dir_ino_validate(mp, ino); if (error) - return error; + return __this_address; offset = dops->data_first_offset; /* Check all reported entries */ @@ -686,11 +684,11 @@ xfs_dir2_sf_verify( * within the data buffer. */ if (((char *)sfep + sizeof(*sfep)) >= endp) - return -EFSCORRUPTED; + return __this_address; /* Don't allow names with known bad length. */ if (sfep->namelen == 0) - return -EFSCORRUPTED; + return __this_address; /* * Check that the variable-length part of the structure is @@ -699,23 +697,23 @@ xfs_dir2_sf_verify( */ next_sfep = dops->sf_nextentry(sfp, sfep); if (endp < (char *)next_sfep) - return -EFSCORRUPTED; + return __this_address; /* Check that the offsets always increase. */ if (xfs_dir2_sf_get_offset(sfep) < offset) - return -EFSCORRUPTED; + return __this_address; /* Check the inode number. */ ino = dops->sf_get_ino(sfp, sfep); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; error = xfs_dir_ino_validate(mp, ino); if (error) - return error; + return __this_address; /* Check the file type. */ filetype = dops->sf_get_ftype(sfep); if (filetype >= XFS_DIR3_FT_MAX) - return -EFSCORRUPTED; + return __this_address; offset = xfs_dir2_sf_get_offset(sfep) + dops->data_entsize(sfep->namelen); @@ -723,16 +721,16 @@ xfs_dir2_sf_verify( sfep = next_sfep; } if (i8count != sfp->i8count) - return -EFSCORRUPTED; + return __this_address; if ((void *)sfep != (void *)endp) - return -EFSCORRUPTED; + return __this_address; /* Make sure this whole thing ought to be in local format. */ if (offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + (uint)sizeof(xfs_dir2_block_tail_t) > mp->m_dir_geo->blksize) - return -EFSCORRUPTED; + return __this_address; - return 0; + return NULL; } /* diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 747085b4ef44..8b7a6c3cb599 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -42,18 +42,14 @@ xfs_calc_dquots_per_chunk( /* * Do some primitive error checking on ondisk dquot data structures. */ -int -xfs_dqcheck( +xfs_failaddr_t +xfs_dquot_verify( struct xfs_mount *mp, xfs_disk_dquot_t *ddq, xfs_dqid_t id, uint type, /* used only when IO_dorepair is true */ - uint flags, - const char *str) + uint flags) { - xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; - int errs = 0; - /* * We can encounter an uninitialized dquot buffer for 2 reasons: * 1. If we crash while deleting the quotainode(s), and those blks got @@ -69,87 +65,57 @@ xfs_dqcheck( * This is all fine; things are still consistent, and we haven't lost * any quota information. Just don't complain about bad dquot blks. */ - if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", - str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); - errs++; - } - if (ddq->d_version != XFS_DQUOT_VERSION) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", - str, id, ddq->d_version, XFS_DQUOT_VERSION); - errs++; - } + if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) + return __this_address; + if (ddq->d_version != XFS_DQUOT_VERSION) + return __this_address; if (ddq->d_flags != XFS_DQ_USER && ddq->d_flags != XFS_DQ_PROJ && - ddq->d_flags != XFS_DQ_GROUP) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, unknown flags 0x%x", - str, id, ddq->d_flags); - errs++; - } + ddq->d_flags != XFS_DQ_GROUP) + return __this_address; - if (id != -1 && id != be32_to_cpu(ddq->d_id)) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : ondisk-dquot 0x%p, ID mismatch: " - "0x%x expected, found id 0x%x", - str, ddq, id, be32_to_cpu(ddq->d_id)); - errs++; - } + if (id != -1 && id != be32_to_cpu(ddq->d_id)) + return __this_address; - if (!errs && ddq->d_id) { - if (ddq->d_blk_softlimit && - be64_to_cpu(ddq->d_bcount) > - be64_to_cpu(ddq->d_blk_softlimit)) { - if (!ddq->d_btimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_ino_softlimit && - be64_to_cpu(ddq->d_icount) > - be64_to_cpu(ddq->d_ino_softlimit)) { - if (!ddq->d_itimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_rtb_softlimit && - be64_to_cpu(ddq->d_rtbcount) > - be64_to_cpu(ddq->d_rtb_softlimit)) { - if (!ddq->d_rtbtimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - } + if (!ddq->d_id) + return NULL; + + if (ddq->d_blk_softlimit && + be64_to_cpu(ddq->d_bcount) > be64_to_cpu(ddq->d_blk_softlimit) && + !ddq->d_btimer) + return __this_address; + + if (ddq->d_ino_softlimit && + be64_to_cpu(ddq->d_icount) > be64_to_cpu(ddq->d_ino_softlimit) && + !ddq->d_itimer) + return __this_address; - if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) - return errs; + if (ddq->d_rtb_softlimit && + be64_to_cpu(ddq->d_rtbcount) > be64_to_cpu(ddq->d_rtb_softlimit) && + !ddq->d_rtbtimer) + return __this_address; + + return NULL; +} + +/* + * Do some primitive error checking on ondisk dquot data structures. + */ +int +xfs_dquot_repair( + struct xfs_mount *mp, + struct xfs_disk_dquot *ddq, + xfs_dqid_t id, + uint type) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)ddq; - if (flags & XFS_QMOPT_DOWARN) - xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); /* * Typically, a repair is only requested by quotacheck. */ ASSERT(id != -1); - ASSERT(flags & XFS_QMOPT_DQREPAIR); memset(d, 0, sizeof(xfs_dqblk_t)); d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); @@ -163,7 +129,7 @@ xfs_dqcheck( XFS_DQUOT_CRC_OFF); } - return errs; + return 0; } STATIC bool @@ -198,13 +164,13 @@ xfs_dquot_buf_verify_crc( return true; } -STATIC bool +STATIC xfs_failaddr_t xfs_dquot_buf_verify( struct xfs_mount *mp, - struct xfs_buf *bp, - int warn) + struct xfs_buf *bp) { struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + xfs_failaddr_t fa; xfs_dqid_t id = 0; int ndquots; int i; @@ -228,33 +194,43 @@ xfs_dquot_buf_verify( */ for (i = 0; i < ndquots; i++) { struct xfs_disk_dquot *ddq; - int error; ddq = &d[i].dd_diskdq; if (i == 0) id = be32_to_cpu(ddq->d_id); - error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__); - if (error) - return false; + fa = xfs_dquot_verify(mp, ddq, id + i, 0, 0); + if (fa) + return fa; } - return true; + + return NULL; +} + +static xfs_failaddr_t +xfs_dquot_buf_verify_struct( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + return xfs_dquot_buf_verify(mp, bp); } static void xfs_dquot_buf_read_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (!xfs_dquot_buf_verify_crc(mp, bp)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_dquot_buf_verify(mp, bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); + } } /* @@ -270,7 +246,7 @@ xfs_dquot_buf_readahead_verify( struct xfs_mount *mp = bp->b_target->bt_mount; if (!xfs_dquot_buf_verify_crc(mp, bp) || - !xfs_dquot_buf_verify(mp, bp, 0)) { + xfs_dquot_buf_verify(mp, bp) != NULL) { xfs_buf_ioerror(bp, -EIO); bp->b_flags &= ~XBF_DONE; } @@ -283,21 +259,21 @@ xfs_dquot_buf_readahead_verify( */ static void xfs_dquot_buf_write_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; - if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); - return; - } + fa = xfs_dquot_buf_verify(mp, bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); } const struct xfs_buf_ops xfs_dquot_buf_ops = { .name = "xfs_dquot", .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, + .verify_struct = xfs_dquot_buf_verify_struct, }; const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b90924104596..faf1a4edd618 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -233,6 +233,13 @@ typedef struct xfs_fsop_resblks { #define XFS_MAX_LOG_BLOCKS (1024 * 1024ULL) #define XFS_MIN_LOG_BYTES (10 * 1024 * 1024ULL) +/* + * Limits on sb_agblocks/sb_agblklog -- mkfs won't format AGs smaller than + * 16MB or larger than 1TB. + */ +#define XFS_MIN_AG_BYTES (1ULL << 24) /* 16 MB */ +#define XFS_MAX_AG_BYTES (1ULL << 40) /* 1 TB */ + /* keep the maximum size under 2^31 by a small amount */ #define XFS_MAX_LOG_BYTES \ ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index de3f04a98656..0e2cf5f0be1f 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -920,8 +920,7 @@ STATIC xfs_agnumber_t xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ - umode_t mode, /* bits set to indicate file type */ - int okalloc) /* ok to allocate more space */ + umode_t mode) /* bits set to indicate file type */ { xfs_agnumber_t agcount; /* number of ag's in the filesystem */ xfs_agnumber_t agno; /* current ag number */ @@ -978,9 +977,6 @@ xfs_ialloc_ag_select( return agno; } - if (!okalloc) - goto nextag; - if (!pag->pagf_init) { error = xfs_alloc_pagf_init(mp, tp, agno, flags); if (error) @@ -1680,7 +1676,6 @@ xfs_dialloc( struct xfs_trans *tp, xfs_ino_t parent, umode_t mode, - int okalloc, struct xfs_buf **IO_agbp, xfs_ino_t *inop) { @@ -1692,6 +1687,7 @@ xfs_dialloc( int noroom = 0; xfs_agnumber_t start_agno; struct xfs_perag *pag; + int okalloc = 1; if (*IO_agbp) { /* @@ -1707,7 +1703,7 @@ xfs_dialloc( * We do not have an agbp, so select an initial allocation * group for inode allocation. */ - start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + start_agno = xfs_ialloc_ag_select(tp, parent, mode); if (start_agno == NULLAGNUMBER) { *inop = NULLFSINO; return 0; @@ -2495,7 +2491,7 @@ xfs_check_agi_unlinked( #define xfs_check_agi_unlinked(agi) #endif -static bool +static xfs_failaddr_t xfs_agi_verify( struct xfs_buf *bp) { @@ -2504,28 +2500,28 @@ xfs_agi_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn))) - return false; + return __this_address; } /* * Validate the magic number of the agi block. */ if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) - return false; + return __this_address; if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) - return false; + return __this_address; if (be32_to_cpu(agi->agi_level) < 1 || be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) - return false; + return __this_address; if (xfs_sb_version_hasfinobt(&mp->m_sb) && (be32_to_cpu(agi->agi_free_level) < 1 || be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS)) - return false; + return __this_address; /* * during growfs operations, the perag is not fully initialised, @@ -2534,10 +2530,10 @@ xfs_agi_verify( * so we can detect and avoid this problem. */ if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) - return false; + return __this_address; xfs_check_agi_unlinked(agi); - return true; + return NULL; } static void @@ -2545,28 +2541,29 @@ xfs_agi_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp, - XFS_ERRTAG_IALLOC_READ_AGI)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_agi_verify(bp); + if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI)) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void xfs_agi_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_log_item; + xfs_failaddr_t fa; - if (!xfs_agi_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_agi_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -2582,6 +2579,7 @@ const struct xfs_buf_ops xfs_agi_buf_ops = { .name = "xfs_agi", .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, + .verify_struct = xfs_agi_verify, }; /* @@ -2755,3 +2753,102 @@ xfs_verify_dir_ino( return false; return xfs_verify_ino(mp, ino); } + +/* Is there an inode record covering a given range of inode numbers? */ +int +xfs_ialloc_has_inode_record( + struct xfs_btree_cur *cur, + xfs_agino_t low, + xfs_agino_t high, + bool *exists) +{ + struct xfs_inobt_rec_incore irec; + xfs_agino_t agino; + uint16_t holemask; + int has_record; + int i; + int error; + + *exists = false; + error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record); + while (error == 0 && has_record) { + error = xfs_inobt_get_rec(cur, &irec, &has_record); + if (error || irec.ir_startino > high) + break; + + agino = irec.ir_startino; + holemask = irec.ir_holemask; + for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1, + i++, agino += XFS_INODES_PER_HOLEMASK_BIT) { + if (holemask & 1) + continue; + if (agino + XFS_INODES_PER_HOLEMASK_BIT > low && + agino <= high) { + *exists = true; + return 0; + } + } + + error = xfs_btree_increment(cur, 0, &has_record); + } + return error; +} + +/* Is there an inode record covering a given extent? */ +int +xfs_ialloc_has_inodes_at_extent( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool *exists) +{ + xfs_agino_t low; + xfs_agino_t high; + + low = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno, 0); + high = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno + len, 0) - 1; + + return xfs_ialloc_has_inode_record(cur, low, high, exists); +} + +struct xfs_ialloc_count_inodes { + xfs_agino_t count; + xfs_agino_t freecount; +}; + +/* Record inode counts across all inobt records. */ +STATIC int +xfs_ialloc_count_inodes_rec( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_inobt_rec_incore irec; + struct xfs_ialloc_count_inodes *ci = priv; + + xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); + ci->count += irec.ir_count; + ci->freecount += irec.ir_freecount; + + return 0; +} + +/* Count allocated and free inodes under an inobt. */ +int +xfs_ialloc_count_inodes( + struct xfs_btree_cur *cur, + xfs_agino_t *count, + xfs_agino_t *freecount) +{ + struct xfs_ialloc_count_inodes ci = {0}; + int error; + + ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci); + if (error) + return error; + + *count = ci.count; + *freecount = ci.freecount; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index d2bdcd5e7312..c5402bb4ce0c 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -81,7 +81,6 @@ xfs_dialloc( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ umode_t mode, /* mode bits for new inode */ - int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. inode header */ xfs_ino_t *inop); /* inode number allocated */ @@ -171,6 +170,12 @@ int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); +int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, + xfs_agblock_t bno, xfs_extlen_t len, bool *exists); +int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low, + xfs_agino_t high, bool *exists); +int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count, + xfs_agino_t *freecount); int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 317caba9faa6..af197a5f3a82 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -141,21 +141,42 @@ xfs_finobt_alloc_block( union xfs_btree_ptr *new, int *stat) { + if (cur->bc_mp->m_inotbt_nores) + return xfs_inobt_alloc_block(cur, start, new, stat); return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_METADATA); } STATIC int -xfs_inobt_free_block( +__xfs_inobt_free_block( struct xfs_btree_cur *cur, - struct xfs_buf *bp) + struct xfs_buf *bp, + enum xfs_ag_resv_type resv) { struct xfs_owner_info oinfo; xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); return xfs_free_extent(cur->bc_tp, XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, - &oinfo, XFS_AG_RESV_NONE); + &oinfo, resv); +} + +STATIC int +xfs_inobt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_NONE); +} + +STATIC int +xfs_finobt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + if (cur->bc_mp->m_inotbt_nores) + return xfs_inobt_free_block(cur, bp); + return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA); } STATIC int @@ -250,12 +271,13 @@ xfs_inobt_diff_two_keys( be32_to_cpu(k2->inobt.ir_startino); } -static int +static xfs_failaddr_t xfs_inobt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; unsigned int level; /* @@ -271,20 +293,21 @@ xfs_inobt_verify( switch (block->bb_magic) { case cpu_to_be32(XFS_IBT_CRC_MAGIC): case cpu_to_be32(XFS_FIBT_CRC_MAGIC): - if (!xfs_btree_sblock_v5hdr_verify(bp)) - return false; + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; /* fall through */ case cpu_to_be32(XFS_IBT_MAGIC): case cpu_to_be32(XFS_FIBT_MAGIC): break; default: - return 0; + return NULL; } /* level verification */ level = be16_to_cpu(block->bb_level); if (level >= mp->m_in_maxlevels) - return false; + return __this_address; return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]); } @@ -293,25 +316,30 @@ static void xfs_inobt_read_verify( struct xfs_buf *bp) { + xfs_failaddr_t fa; + if (!xfs_btree_sblock_verify_crc(bp)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_inobt_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_inobt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } - if (bp->b_error) { + if (bp->b_error) trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_verifier_error(bp); - } } static void xfs_inobt_write_verify( struct xfs_buf *bp) { - if (!xfs_inobt_verify(bp)) { + xfs_failaddr_t fa; + + fa = xfs_inobt_verify(bp); + if (fa) { trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } xfs_btree_sblock_calc_crc(bp); @@ -322,6 +350,7 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = { .name = "xfs_inobt", .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, + .verify_struct = xfs_inobt_verify, }; STATIC int @@ -372,7 +401,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_finobt_set_root, .alloc_block = xfs_finobt_alloc_block, - .free_block = xfs_inobt_free_block, + .free_block = xfs_finobt_free_block, .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, .init_key_from_rec = xfs_inobt_init_key_from_rec, diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 89bf16b4d937..b0f31791c7e6 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -632,8 +632,6 @@ xfs_iext_insert( struct xfs_iext_leaf *new = NULL; int nr_entries, i; - trace_xfs_iext_insert(ip, cur, state, _RET_IP_); - if (ifp->if_height == 0) xfs_iext_alloc_root(ifp, cur); else if (ifp->if_height == 1) @@ -661,6 +659,8 @@ xfs_iext_insert( xfs_iext_set(cur_rec(cur), irec); ifp->if_bytes += sizeof(struct xfs_iext_rec); + trace_xfs_iext_insert(ip, cur, state, _RET_IP_); + if (new) xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2); } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 6b7989038d75..4fe17b368316 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -32,6 +32,8 @@ #include "xfs_ialloc.h" #include "xfs_dir2.h" +#include <linux/iversion.h> + /* * Check that none of the inode's in the buffer have a next * unlinked field of 0. @@ -113,8 +115,7 @@ xfs_inode_buf_verify( return; } - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); #ifdef DEBUG xfs_alert(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", @@ -264,7 +265,8 @@ xfs_inode_from_disk( to->di_flags = be16_to_cpu(from->di_flags); if (to->di_version == 3) { - inode->i_version = be64_to_cpu(from->di_changecount); + inode_set_iversion_queried(inode, + be64_to_cpu(from->di_changecount)); to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); to->di_flags2 = be64_to_cpu(from->di_flags2); @@ -314,7 +316,7 @@ xfs_inode_to_disk( to->di_flags = cpu_to_be16(from->di_flags); if (from->di_version == 3) { - to->di_changecount = cpu_to_be64(inode->i_version); + to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); @@ -381,7 +383,7 @@ xfs_log_dinode_to_disk( } } -bool +xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, xfs_ino_t ino, @@ -390,53 +392,122 @@ xfs_dinode_verify( uint16_t mode; uint16_t flags; uint64_t flags2; + uint64_t di_size; if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) - return false; + return __this_address; + + /* Verify v3 integrity information first */ + if (dip->di_version >= 3) { + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return __this_address; + if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF)) + return __this_address; + if (be64_to_cpu(dip->di_ino) != ino) + return __this_address; + if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + } /* don't allow invalid i_size */ - if (be64_to_cpu(dip->di_size) & (1ULL << 63)) - return false; + di_size = be64_to_cpu(dip->di_size); + if (di_size & (1ULL << 63)) + return __this_address; mode = be16_to_cpu(dip->di_mode); if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) - return false; + return __this_address; /* No zero-length symlinks/dirs. */ - if ((S_ISLNK(mode) || S_ISDIR(mode)) && dip->di_size == 0) - return false; + if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) + return __this_address; + + /* Fork checks carried over from xfs_iformat_fork */ + if (mode && + be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > + be64_to_cpu(dip->di_nblocks)) + return __this_address; + + if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize) + return __this_address; + + flags = be16_to_cpu(dip->di_flags); + + if (mode && (flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) + return __this_address; + + /* Do we have appropriate data fork formats for the mode? */ + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (dip->di_format != XFS_DINODE_FMT_DEV) + return __this_address; + break; + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch (dip->di_format) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (S_ISREG(mode)) + return __this_address; + if (di_size > XFS_DFORK_DSIZE(dip, mp)) + return __this_address; + /* fall through */ + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return __this_address; + } + break; + case 0: + /* Uninitialized inode ok. */ + break; + default: + return __this_address; + } + + if (XFS_DFORK_Q(dip)) { + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return __this_address; + } + } /* only version 3 or greater inodes are extensively verified here */ if (dip->di_version < 3) - return true; - - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, - XFS_DINODE_CRC_OFF)) - return false; - if (be64_to_cpu(dip->di_ino) != ino) - return false; - if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return NULL; - flags = be16_to_cpu(dip->di_flags); flags2 = be64_to_cpu(dip->di_flags2); /* don't allow reflink/cowextsize if we don't have reflink */ if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && !xfs_sb_version_hasreflink(&mp->m_sb)) - return false; + return __this_address; + + /* only regular files get reflink */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && (mode & S_IFMT) != S_IFREG) + return __this_address; /* don't let reflink and realtime mix */ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) - return false; + return __this_address; /* don't let reflink and dax mix */ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX)) - return false; + return __this_address; - return true; + return NULL; } void @@ -476,6 +547,7 @@ xfs_iread( { xfs_buf_t *bp; xfs_dinode_t *dip; + xfs_failaddr_t fa; int error; /* @@ -507,11 +579,10 @@ xfs_iread( return error; /* even unallocated inodes are verified */ - if (!xfs_dinode_verify(mp, ip->i_ino, dip)) { - xfs_alert(mp, "%s: validation failed for inode %lld", - __func__, ip->i_ino); - - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); + fa = xfs_dinode_verify(mp, ip->i_ino, dip); + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", dip, + sizeof(*dip), fa); error = -EFSCORRUPTED; goto out_brelse; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index a9c97a356c30..8a5e1da52d74 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -82,7 +82,7 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #define xfs_inobp_check(mp, bp) #endif /* DEBUG */ -bool xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, - struct xfs_dinode *dip); +xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, + struct xfs_dinode *dip); #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index c79a1616b79d..866d2861c625 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -35,6 +35,8 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_dir2_priv.h" +#include "xfs_attr_leaf.h" +#include "xfs_shared.h" kmem_zone_t *xfs_ifork_zone; @@ -62,69 +64,11 @@ xfs_iformat_fork( int error = 0; xfs_fsize_t di_size; - if (unlikely(be32_to_cpu(dip->di_nextents) + - be16_to_cpu(dip->di_anextents) > - be64_to_cpu(dip->di_nblocks))) { - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", - (unsigned long long)ip->i_ino, - (int)(be32_to_cpu(dip->di_nextents) + - be16_to_cpu(dip->di_anextents)), - (unsigned long long) - be64_to_cpu(dip->di_nblocks)); - XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return -EFSCORRUPTED; - } - - if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { - xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", - (unsigned long long)ip->i_ino, - dip->di_forkoff); - XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return -EFSCORRUPTED; - } - - if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && - !ip->i_mount->m_rtdev_targp)) { - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, has realtime flag set.", - ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", - XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return -EFSCORRUPTED; - } - - if (unlikely(xfs_is_reflink_inode(ip) && !S_ISREG(inode->i_mode))) { - xfs_warn(ip->i_mount, - "corrupt dinode %llu, wrong file type for reflink.", - ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", - XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return -EFSCORRUPTED; - } - - if (unlikely(xfs_is_reflink_inode(ip) && - (ip->i_d.di_flags & XFS_DIFLAG_REALTIME))) { - xfs_warn(ip->i_mount, - "corrupt dinode %llu, has reflink+realtime flag set.", - ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", - XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return -EFSCORRUPTED; - } - switch (inode->i_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: case S_IFBLK: case S_IFSOCK: - if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { - XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return -EFSCORRUPTED; - } ip->i_d.di_size = 0; inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip)); break; @@ -134,32 +78,7 @@ xfs_iformat_fork( case S_IFDIR: switch (dip->di_format) { case XFS_DINODE_FMT_LOCAL: - /* - * no local regular files yet - */ - if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (local format for regular file).", - (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(4)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return -EFSCORRUPTED; - } - di_size = be64_to_cpu(dip->di_size); - if (unlikely(di_size < 0 || - di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %Ld for local inode).", - (unsigned long long) ip->i_ino, - (long long) di_size); - XFS_CORRUPTION_ERROR("xfs_iformat(5)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return -EFSCORRUPTED; - } - size = (int)di_size; error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); break; @@ -170,28 +89,16 @@ xfs_iformat_fork( error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); break; default: - XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, - ip->i_mount); return -EFSCORRUPTED; } break; default: - XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); return -EFSCORRUPTED; } if (error) return error; - /* Check inline dir contents. */ - if (S_ISDIR(inode->i_mode) && dip->di_format == XFS_DINODE_FMT_LOCAL) { - error = xfs_dir2_sf_verify(ip); - if (error) { - xfs_idestroy_fork(ip, XFS_DATA_FORK); - return error; - } - } - if (xfs_is_reflink_inode(ip)) { ASSERT(ip->i_cowfp == NULL); xfs_ifork_init_cow(ip); @@ -208,18 +115,6 @@ xfs_iformat_fork( atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); size = be16_to_cpu(atp->hdr.totsize); - if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad attr fork size %Ld).", - (unsigned long long) ip->i_ino, - (long long) size); - XFS_CORRUPTION_ERROR("xfs_iformat(8)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - error = -EFSCORRUPTED; - break; - } - error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); break; case XFS_DINODE_FMT_EXTENTS: @@ -403,6 +298,7 @@ xfs_iformat_btree( */ if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= XFS_IFORK_MAXEXT(ip, whichfork) || + nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) || @@ -827,3 +723,45 @@ xfs_ifork_init_cow( ip->i_cformat = XFS_DINODE_FMT_EXTENTS; ip->i_cnextents = 0; } + +/* Default fork content verifiers. */ +struct xfs_ifork_ops xfs_default_ifork_ops = { + .verify_attr = xfs_attr_shortform_verify, + .verify_dir = xfs_dir2_sf_verify, + .verify_symlink = xfs_symlink_shortform_verify, +}; + +/* Verify the inline contents of the data fork of an inode. */ +xfs_failaddr_t +xfs_ifork_verify_data( + struct xfs_inode *ip, + struct xfs_ifork_ops *ops) +{ + /* Non-local data fork, we're done. */ + if (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return NULL; + + /* Check the inline data fork if there is one. */ + switch (VFS_I(ip)->i_mode & S_IFMT) { + case S_IFDIR: + return ops->verify_dir(ip); + case S_IFLNK: + return ops->verify_symlink(ip); + default: + return NULL; + } +} + +/* Verify the inline contents of the attr fork of an inode. */ +xfs_failaddr_t +xfs_ifork_verify_attr( + struct xfs_inode *ip, + struct xfs_ifork_ops *ops) +{ + /* There has to be an attr fork allocated if aformat is local. */ + if (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) + return NULL; + if (!XFS_IFORK_PTR(ip, XFS_ATTR_FORK)) + return __this_address; + return ops->verify_attr(ip); +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index b9f0098e33b8..dd8aba0dd119 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -186,4 +186,18 @@ extern struct kmem_zone *xfs_ifork_zone; extern void xfs_ifork_init_cow(struct xfs_inode *ip); +typedef xfs_failaddr_t (*xfs_ifork_verifier_t)(struct xfs_inode *); + +struct xfs_ifork_ops { + xfs_ifork_verifier_t verify_symlink; + xfs_ifork_verifier_t verify_dir; + xfs_ifork_verifier_t verify_attr; +}; +extern struct xfs_ifork_ops xfs_default_ifork_ops; + +xfs_failaddr_t xfs_ifork_verify_data(struct xfs_inode *ip, + struct xfs_ifork_ops *ops); +xfs_failaddr_t xfs_ifork_verify_attr(struct xfs_inode *ip, + struct xfs_ifork_ops *ops); + #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index c10597973333..cc4cbe290939 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -55,7 +55,7 @@ xfs_log_calc_max_attrsetm_res( * the maximum one in terms of the pre-calculated values which were done * at mount time. */ -STATIC void +void xfs_log_get_max_trans_res( struct xfs_mount *mp, struct xfs_trans_res *max_resp) diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index d69c772271cb..bb1b13a9b5f4 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -112,8 +112,6 @@ typedef uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ -#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ #define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ #define XFS_QMOPT_DQNEXT 0x0008000 /* return next dquot >= this ID */ @@ -153,8 +151,11 @@ typedef uint16_t xfs_qwarncnt_t; (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) -extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, - xfs_dqid_t id, uint type, uint flags, const char *str); +extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, + struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type, + uint flags); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); +extern int xfs_dquot_repair(struct xfs_mount *mp, struct xfs_disk_dquot *ddq, + xfs_dqid_t id, uint type); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 585b35d34142..bee68c23d612 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1488,27 +1488,12 @@ __xfs_refcount_cow_alloc( xfs_extlen_t aglen, struct xfs_defer_ops *dfops) { - int error; - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, agbno, aglen); /* Add refcount btree reservation */ - error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + return xfs_refcount_adjust_cow(rcur, agbno, aglen, XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops); - if (error) - return error; - - /* Add rmap entry */ - if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { - error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops, - rcur->bc_private.a.agno, - agbno, aglen, XFS_RMAP_OWN_COW); - if (error) - return error; - } - - return error; } /* @@ -1521,27 +1506,12 @@ __xfs_refcount_cow_free( xfs_extlen_t aglen, struct xfs_defer_ops *dfops) { - int error; - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, agbno, aglen); /* Remove refcount btree reservation */ - error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + return xfs_refcount_adjust_cow(rcur, agbno, aglen, XFS_REFCOUNT_ADJUST_COW_FREE, dfops); - if (error) - return error; - - /* Remove rmap entry */ - if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { - error = xfs_rmap_free_extent(rcur->bc_mp, dfops, - rcur->bc_private.a.agno, - agbno, aglen, XFS_RMAP_OWN_COW); - if (error) - return error; - } - - return error; } /* Record a CoW staging extent in the refcount btree. */ @@ -1552,11 +1522,19 @@ xfs_refcount_alloc_cow_extent( xfs_fsblock_t fsb, xfs_extlen_t len) { + int error; + if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, + error = __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, fsb, len); + if (error) + return error; + + /* Add rmap entry */ + return xfs_rmap_alloc_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb), + XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. */ @@ -1567,9 +1545,17 @@ xfs_refcount_free_cow_extent( xfs_fsblock_t fsb, xfs_extlen_t len) { + int error; + if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; + /* Remove rmap entry */ + error = xfs_rmap_free_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb), + XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); + if (error) + return error; + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW, fsb, len); } @@ -1710,3 +1696,22 @@ out_cursor: xfs_trans_brelse(tp, agbp); goto out_trans; } + +/* Is there a record covering a given extent? */ +int +xfs_refcount_has_record( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool *exists) +{ + union xfs_btree_irec low; + union xfs_btree_irec high; + + memset(&low, 0, sizeof(low)); + low.rc.rc_startblock = bno; + memset(&high, 0xFF, sizeof(high)); + high.rc.rc_startblock = bno + len - 1; + + return xfs_btree_has_record(cur, &low, &high, exists); +} diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index eafb9d1f3b37..2a731ac68fe4 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -83,4 +83,7 @@ static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; } +extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, + xfs_agblock_t bno, xfs_extlen_t len, bool *exists); + #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 3c59dd3d58d7..8479769e470d 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -223,29 +223,31 @@ xfs_refcountbt_diff_two_keys( be32_to_cpu(k2->refc.rc_startblock); } -STATIC bool +STATIC xfs_failaddr_t xfs_refcountbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; + xfs_failaddr_t fa; unsigned int level; if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) - return false; + return __this_address; if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return false; - if (!xfs_btree_sblock_v5hdr_verify(bp)) - return false; + return __this_address; + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; level = be16_to_cpu(block->bb_level); if (pag && pag->pagf_init) { if (level >= pag->pagf_refcount_level) - return false; + return __this_address; } else if (level >= mp->m_refc_maxlevels) - return false; + return __this_address; return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]); } @@ -254,25 +256,30 @@ STATIC void xfs_refcountbt_read_verify( struct xfs_buf *bp) { + xfs_failaddr_t fa; + if (!xfs_btree_sblock_verify_crc(bp)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_refcountbt_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_refcountbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } - if (bp->b_error) { + if (bp->b_error) trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_verifier_error(bp); - } } STATIC void xfs_refcountbt_write_verify( struct xfs_buf *bp) { - if (!xfs_refcountbt_verify(bp)) { + xfs_failaddr_t fa; + + fa = xfs_refcountbt_verify(bp); + if (fa) { trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } xfs_btree_sblock_calc_crc(bp); @@ -283,6 +290,7 @@ const struct xfs_buf_ops xfs_refcountbt_buf_ops = { .name = "xfs_refcountbt", .verify_read = xfs_refcountbt_read_verify, .verify_write = xfs_refcountbt_write_verify, + .verify_struct = xfs_refcountbt_verify, }; STATIC int diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index dd019cee1b3b..79822cf6ebe3 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -368,6 +368,51 @@ xfs_rmap_lookup_le_range( } /* + * Perform all the relevant owner checks for a removal op. If we're doing an + * unknown-owner removal then we have no owner information to check. + */ +static int +xfs_rmap_free_check_owner( + struct xfs_mount *mp, + uint64_t ltoff, + struct xfs_rmap_irec *rec, + xfs_fsblock_t bno, + xfs_filblks_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + int error = 0; + + if (owner == XFS_RMAP_OWN_UNKNOWN) + return 0; + + /* Make sure the unwritten flag matches. */ + XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == + (rec->rm_flags & XFS_RMAP_UNWRITTEN), out); + + /* Make sure the owner matches what we expect to find in the tree. */ + XFS_WANT_CORRUPTED_GOTO(mp, owner == rec->rm_owner, out); + + /* Check the offset, if necessary. */ + if (XFS_RMAP_NON_INODE_OWNER(owner)) + goto out; + + if (flags & XFS_RMAP_BMBT_BLOCK) { + XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_flags & XFS_RMAP_BMBT_BLOCK, + out); + } else { + XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_offset <= offset, out); + XFS_WANT_CORRUPTED_GOTO(mp, + ltoff + rec->rm_blockcount >= offset + len, + out); + } + +out: + return error; +} + +/* * Find the extent in the rmap btree and remove it. * * The record we find should always be an exact match for the extent that we're @@ -444,33 +489,40 @@ xfs_rmap_unmap( goto out_done; } - /* Make sure the unwritten flag matches. */ - XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == - (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); + /* + * If we're doing an unknown-owner removal for EFI recovery, we expect + * to find the full range in the rmapbt or nothing at all. If we + * don't find any rmaps overlapping either end of the range, we're + * done. Hopefully this means that the EFI creator already queued + * (and finished) a RUI to remove the rmap. + */ + if (owner == XFS_RMAP_OWN_UNKNOWN && + ltrec.rm_startblock + ltrec.rm_blockcount <= bno) { + struct xfs_rmap_irec rtrec; + + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto out_error; + if (i == 0) + goto out_done; + error = xfs_rmap_get_rec(cur, &rtrec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (rtrec.rm_startblock >= bno + len) + goto out_done; + } /* Make sure the extent we found covers the entire freeing range. */ XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && - ltrec.rm_startblock + ltrec.rm_blockcount >= - bno + len, out_error); - - /* Make sure the owner matches what we expect to find in the tree. */ - XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner || - XFS_RMAP_NON_INODE_OWNER(owner), out_error); + ltrec.rm_startblock + ltrec.rm_blockcount >= + bno + len, out_error); - /* Check the offset, if necessary. */ - if (!XFS_RMAP_NON_INODE_OWNER(owner)) { - if (flags & XFS_RMAP_BMBT_BLOCK) { - XFS_WANT_CORRUPTED_GOTO(mp, - ltrec.rm_flags & XFS_RMAP_BMBT_BLOCK, - out_error); - } else { - XFS_WANT_CORRUPTED_GOTO(mp, - ltrec.rm_offset <= offset, out_error); - XFS_WANT_CORRUPTED_GOTO(mp, - ltoff + ltrec.rm_blockcount >= offset + len, - out_error); - } - } + /* Check owner information. */ + error = xfs_rmap_free_check_owner(mp, ltoff, <rec, bno, len, owner, + offset, flags); + if (error) + goto out_error; if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ @@ -664,6 +716,7 @@ xfs_rmap_map( flags |= XFS_RMAP_UNWRITTEN; trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, unwritten, oinfo); + ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); /* * For the initial lookup, look for an exact match or the left-adjacent @@ -2334,3 +2387,70 @@ xfs_rmap_compare( else return 0; } + +/* Is there a record covering a given extent? */ +int +xfs_rmap_has_record( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool *exists) +{ + union xfs_btree_irec low; + union xfs_btree_irec high; + + memset(&low, 0, sizeof(low)); + low.r.rm_startblock = bno; + memset(&high, 0xFF, sizeof(high)); + high.r.rm_startblock = bno + len - 1; + + return xfs_btree_has_record(cur, &low, &high, exists); +} + +/* + * Is there a record for this owner completely covering a given physical + * extent? If so, *has_rmap will be set to true. If there is no record + * or the record only covers part of the range, we set *has_rmap to false. + * This function doesn't perform range lookups or offset checks, so it is + * not suitable for checking data fork blocks. + */ +int +xfs_rmap_record_exists( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo, + bool *has_rmap) +{ + uint64_t owner; + uint64_t offset; + unsigned int flags; + int has_record; + struct xfs_rmap_irec irec; + int error; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) || + (flags & XFS_RMAP_BMBT_BLOCK)); + + error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + &has_record); + if (error) + return error; + if (!has_record) { + *has_rmap = false; + return 0; + } + + error = xfs_rmap_get_rec(cur, &irec, &has_record); + if (error) + return error; + if (!has_record) { + *has_rmap = false; + return 0; + } + + *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno && + irec.rm_startblock + irec.rm_blockcount >= bno + len); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 466ede637080..380e53be98d5 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -61,7 +61,21 @@ static inline void xfs_rmap_skip_owner_update( struct xfs_owner_info *oi) { - oi->oi_owner = XFS_RMAP_OWN_UNKNOWN; + xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_NULL); +} + +static inline bool +xfs_rmap_should_skip_owner_update( + struct xfs_owner_info *oi) +{ + return oi->oi_owner == XFS_RMAP_OWN_NULL; +} + +static inline void +xfs_rmap_any_owner_update( + struct xfs_owner_info *oi) +{ + xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_UNKNOWN); } /* Reverse mapping functions. */ @@ -219,5 +233,10 @@ int xfs_rmap_compare(const struct xfs_rmap_irec *a, union xfs_btree_rec; int xfs_rmap_btrec_to_irec(union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); +int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, bool *exists); +int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, struct xfs_owner_info *oinfo, + bool *has_rmap); #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 9d9c9192584c..e829c3e489ea 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -303,13 +303,14 @@ xfs_rmapbt_diff_two_keys( return 0; } -static bool +static xfs_failaddr_t xfs_rmapbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; + xfs_failaddr_t fa; unsigned int level; /* @@ -325,19 +326,20 @@ xfs_rmapbt_verify( * in this case. */ if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC)) - return false; + return __this_address; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) - return false; - if (!xfs_btree_sblock_v5hdr_verify(bp)) - return false; + return __this_address; + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; level = be16_to_cpu(block->bb_level); if (pag && pag->pagf_init) { if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) - return false; + return __this_address; } else if (level >= mp->m_rmap_maxlevels) - return false; + return __this_address; return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]); } @@ -346,25 +348,30 @@ static void xfs_rmapbt_read_verify( struct xfs_buf *bp) { + xfs_failaddr_t fa; + if (!xfs_btree_sblock_verify_crc(bp)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_rmapbt_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_rmapbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } - if (bp->b_error) { + if (bp->b_error) trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_verifier_error(bp); - } } static void xfs_rmapbt_write_verify( struct xfs_buf *bp) { - if (!xfs_rmapbt_verify(bp)) { + xfs_failaddr_t fa; + + fa = xfs_rmapbt_verify(bp); + if (fa) { trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } xfs_btree_sblock_calc_crc(bp); @@ -375,6 +382,7 @@ const struct xfs_buf_ops xfs_rmapbt_buf_ops = { .name = "xfs_rmapbt", .verify_read = xfs_rmapbt_read_verify, .verify_write = xfs_rmapbt_write_verify, + .verify_struct = xfs_rmapbt_verify, }; STATIC int diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 3fb29a5ea915..106be2d0bb88 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1097,3 +1097,24 @@ xfs_verify_rtbno( { return rtbno < mp->m_sb.sb_rblocks; } + +/* Is the given extent all free? */ +int +xfs_rtalloc_extent_is_free( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_rtblock_t start, + xfs_extlen_t len, + bool *is_free) +{ + xfs_rtblock_t end; + int matches; + int error; + + error = xfs_rtcheck_range(mp, tp, start, len, 1, &end, &matches); + if (error) + return error; + + *is_free = matches; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 9b5aae2bcc0b..46af6aa60a8e 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -40,6 +40,8 @@ #include "xfs_rmap_btree.h" #include "xfs_bmap.h" #include "xfs_refcount_btree.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -116,6 +118,9 @@ xfs_mount_validate_sb( bool check_inprogress, bool check_version) { + u32 agcount = 0; + u32 rem; + if (sbp->sb_magicnum != XFS_SB_MAGIC) { xfs_warn(mp, "bad magic number"); return -EWRONGFS; @@ -226,6 +231,13 @@ xfs_mount_validate_sb( return -EINVAL; } + /* Compute agcount for this number of dblocks and agblocks */ + if (sbp->sb_agblocks) { + agcount = div_u64_rem(sbp->sb_dblocks, sbp->sb_agblocks, &rem); + if (rem) + agcount++; + } + /* * More sanity checking. Most of these were stolen directly from * xfs_repair. @@ -250,6 +262,10 @@ xfs_mount_validate_sb( sbp->sb_inodesize != (1 << sbp->sb_inodelog) || sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE || sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || + XFS_FSB_TO_B(mp, sbp->sb_agblocks) < XFS_MIN_AG_BYTES || + XFS_FSB_TO_B(mp, sbp->sb_agblocks) > XFS_MAX_AG_BYTES || + sbp->sb_agblklog != xfs_highbit32(sbp->sb_agblocks - 1) + 1 || + agcount == 0 || agcount != sbp->sb_agcount || (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || @@ -640,11 +656,10 @@ xfs_sb_read_verify( error = xfs_sb_verify(bp, true); out_error: - if (error) { + if (error == -EFSCORRUPTED || error == -EFSBADCRC) + xfs_verifier_error(bp, error, __this_address); + else if (error) xfs_buf_ioerror(bp, error); - if (error == -EFSCORRUPTED || error == -EFSBADCRC) - xfs_verifier_error(bp); - } } /* @@ -673,13 +688,12 @@ xfs_sb_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; int error; error = xfs_sb_verify(bp, false); if (error) { - xfs_buf_ioerror(bp, error); - xfs_verifier_error(bp); + xfs_verifier_error(bp, error, __this_address); return; } @@ -876,3 +890,88 @@ xfs_sync_sb( xfs_trans_set_sync(tp); return xfs_trans_commit(tp); } + +int +xfs_fs_geometry( + struct xfs_sb *sbp, + struct xfs_fsop_geom *geo, + int struct_version) +{ + memset(geo, 0, sizeof(struct xfs_fsop_geom)); + + geo->blocksize = sbp->sb_blocksize; + geo->rtextsize = sbp->sb_rextsize; + geo->agblocks = sbp->sb_agblocks; + geo->agcount = sbp->sb_agcount; + geo->logblocks = sbp->sb_logblocks; + geo->sectsize = sbp->sb_sectsize; + geo->inodesize = sbp->sb_inodesize; + geo->imaxpct = sbp->sb_imax_pct; + geo->datablocks = sbp->sb_dblocks; + geo->rtblocks = sbp->sb_rblocks; + geo->rtextents = sbp->sb_rextents; + geo->logstart = sbp->sb_logstart; + BUILD_BUG_ON(sizeof(geo->uuid) != sizeof(sbp->sb_uuid)); + memcpy(geo->uuid, &sbp->sb_uuid, sizeof(sbp->sb_uuid)); + + if (struct_version < 2) + return 0; + + geo->sunit = sbp->sb_unit; + geo->swidth = sbp->sb_width; + + if (struct_version < 3) + return 0; + + geo->version = XFS_FSOP_GEOM_VERSION; + geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | + XFS_FSOP_GEOM_FLAGS_DIRV2; + if (xfs_sb_version_hasattr(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; + if (xfs_sb_version_hasquota(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_QUOTA; + if (xfs_sb_version_hasalign(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN; + if (xfs_sb_version_hasdalign(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN; + if (xfs_sb_version_hasextflgbit(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_EXTFLG; + if (xfs_sb_version_hassector(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; + if (xfs_sb_version_hasasciici(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; + if (xfs_sb_version_haslazysbcount(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; + if (xfs_sb_version_hasattr2(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2; + if (xfs_sb_version_hasprojid32bit(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; + if (xfs_sb_version_hascrc(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_V5SB; + if (xfs_sb_version_hasftype(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_FTYPE; + if (xfs_sb_version_hasfinobt(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_FINOBT; + if (xfs_sb_version_hassparseinodes(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_SPINODES; + if (xfs_sb_version_hasrmapbt(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT; + if (xfs_sb_version_hasreflink(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; + if (xfs_sb_version_hassector(sbp)) + geo->logsectsize = sbp->sb_logsectsize; + else + geo->logsectsize = BBSIZE; + geo->rtsectsize = sbp->sb_blocksize; + geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); + + if (struct_version < 4) + return 0; + + if (xfs_sb_version_haslogv2(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2; + + geo->logsunit = sbp->sb_logsunit; + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 961e6475a309..63dcd2a1a657 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -34,4 +34,8 @@ extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); +#define XFS_FS_GEOM_MAX_STRUCT_VER (4) +extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo, + int struct_version); + #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c6f4eb46fe26..d0b84da0cb1e 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -76,6 +76,9 @@ struct xfs_log_item_desc { int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); +struct xfs_trans_res; +void xfs_log_get_max_trans_res(struct xfs_mount *mp, + struct xfs_trans_res *max_resp); /* * Values for t_flags. @@ -143,5 +146,6 @@ bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp); void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp); +xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip); #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index c484877129a0..5ef5f354587e 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -98,7 +98,7 @@ xfs_symlink_hdr_ok( return true; } -static bool +static xfs_failaddr_t xfs_symlink_verify( struct xfs_buf *bp) { @@ -106,22 +106,22 @@ xfs_symlink_verify( struct xfs_dsymlink_hdr *dsl = bp->b_addr; if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; + return __this_address; if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) - return false; + return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) - return false; + return __this_address; if (be32_to_cpu(dsl->sl_offset) + be32_to_cpu(dsl->sl_bytes) >= XFS_SYMLINK_MAXLEN) - return false; + return __this_address; if (dsl->sl_owner == 0) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn))) - return false; + return __this_address; - return true; + return NULL; } static void @@ -129,18 +129,19 @@ xfs_symlink_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_symlink_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_symlink_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void @@ -148,15 +149,16 @@ xfs_symlink_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; + xfs_failaddr_t fa; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_symlink_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_symlink_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -171,6 +173,7 @@ const struct xfs_buf_ops xfs_symlink_buf_ops = { .name = "xfs_symlink", .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, + .verify_struct = xfs_symlink_verify, }; void @@ -207,3 +210,37 @@ xfs_symlink_local_to_remote( xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1); } + +/* Verify the consistency of an inline symlink. */ +xfs_failaddr_t +xfs_symlink_shortform_verify( + struct xfs_inode *ip) +{ + char *sfp; + char *endp; + struct xfs_ifork *ifp; + int size; + + ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + sfp = (char *)ifp->if_u1.if_data; + size = ifp->if_bytes; + endp = sfp + size; + + /* Zero length symlinks can exist while we're deleting a remote one. */ + if (size == 0) + return NULL; + + /* No negative sizes or overly long symlink targets. */ + if (size < 0 || size > XFS_SYMLINK_MAXLEN) + return __this_address; + + /* No NULLs in the target either. */ + if (memchr(sfp, 0, size - 1)) + return __this_address; + + /* We /did/ null-terminate the buffer, right? */ + if (*endp != 0) + return __this_address; + return NULL; +} diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 6bd916bd35e2..5f17641f040f 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -34,6 +34,9 @@ #include "xfs_trans_space.h" #include "xfs_trace.h" +#define _ALLOC true +#define _FREE false + /* * A buffer has a format structure overhead in the log in addition * to the data, so we need to take this into account when reserving @@ -132,43 +135,77 @@ xfs_calc_inode_res( } /* - * The free inode btree is a conditional feature and the log reservation - * requirements differ slightly from that of the traditional inode allocation - * btree. The finobt tracks records for inode chunks with at least one free - * inode. A record can be removed from the tree for an inode allocation - * or free and thus the finobt reservation is unconditional across: + * Inode btree record insertion/removal modifies the inode btree and free space + * btrees (since the inobt does not use the agfl). This requires the following + * reservation: * - * - inode allocation - * - inode free - * - inode chunk allocation + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size * - * The 'modify' param indicates to include the record modification scenario. The - * 'alloc' param indicates to include the reservation for free space btree - * modifications on behalf of finobt modifications. This is required only for - * transactions that do not already account for free space btree modifications. + * The caller must account for SB and AG header modifications, etc. + */ +STATIC uint +xfs_calc_inobt_res( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * The free inode btree is a conditional feature. The behavior differs slightly + * from that of the traditional inode btree in that the finobt tracks records + * for inode chunks with at least one free inode. A record can be removed from + * the tree during individual inode allocation. Therefore the finobt + * reservation is unconditional for both the inode chunk allocation and + * individual inode allocation (modify) cases. * - * the free inode btree: max depth * block size - * the allocation btrees: 2 trees * (max depth - 1) * block size - * the free inode btree entry: block size + * Behavior aside, the reservation for finobt modification is equivalent to the + * traditional inobt: cover a full finobt shape change plus block allocation. */ STATIC uint xfs_calc_finobt_res( - struct xfs_mount *mp, - int alloc, - int modify) + struct xfs_mount *mp) { - uint res; - if (!xfs_sb_version_hasfinobt(&mp->m_sb)) return 0; - res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)); - if (alloc) - res += xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), - XFS_FSB_TO_B(mp, 1)); - if (modify) - res += (uint)XFS_FSB_TO_B(mp, 1); + return xfs_calc_inobt_res(mp); +} +/* + * Calculate the reservation required to allocate or free an inode chunk. This + * includes: + * + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the inode chunk: m_ialloc_blks * N + * + * The size N of the inode chunk reservation depends on whether it is for + * allocation or free and which type of create transaction is in use. An inode + * chunk free always invalidates the buffers and only requires reservation for + * headers (N == 0). An inode chunk allocation requires a chunk sized + * reservation on v4 and older superblocks to initialize the chunk. No chunk + * reservation is required for allocation on v5 supers, which use ordered + * buffers to initialize. + */ +STATIC uint +xfs_calc_inode_chunk_res( + struct xfs_mount *mp, + bool alloc) +{ + uint res, size = 0; + + res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + XFS_FSB_TO_B(mp, 1)); + if (alloc) { + /* icreate tx uses ordered buffers */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return res; + size = XFS_FSB_TO_B(mp, 1); + } + + res += xfs_calc_buf_res(mp->m_ialloc_blks, size); return res; } @@ -232,8 +269,6 @@ xfs_calc_write_reservation( * the super block to reflect the freed blocks: sector size * worst case split in allocation btrees per extent assuming 4 extents: * 4 exts * 2 trees * (2 * max depth - 1) * block size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size */ STATIC uint xfs_calc_itruncate_reservation( @@ -245,12 +280,7 @@ xfs_calc_itruncate_reservation( XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(5, 0) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(2 + mp->m_ialloc_blks + - mp->m_in_maxlevels, 0))); + XFS_FSB_TO_B(mp, 1)))); } /* @@ -282,13 +312,14 @@ xfs_calc_rename_reservation( * For removing an inode from unlinked list at first, we can modify: * the agi hash list and counters: sector size * the on disk inode before ours in the agi hash list: inode cluster size + * the on disk inode in the agi hash list: inode cluster size */ STATIC uint xfs_calc_iunlink_remove_reservation( struct xfs_mount *mp) { return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); + 2 * max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); } /* @@ -320,13 +351,13 @@ xfs_calc_link_reservation( /* * For adding an inode to unlinked list we can modify: * the agi hash list: sector size - * the unlinked inode: inode size + * the on disk inode: inode cluster size */ STATIC uint xfs_calc_iunlink_add_reservation(xfs_mount_t *mp) { return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - xfs_calc_inode_res(mp, 1); + max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); } /* @@ -379,45 +410,16 @@ xfs_calc_create_resv_modify( xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + (uint)XFS_FSB_TO_B(mp, 1) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) + - xfs_calc_finobt_res(mp, 1, 1); -} - -/* - * For create we can allocate some inodes giving: - * the agi and agf of the ag getting the new inodes: 2 * sectorsize - * the superblock for the nlink flag: sector size - * the inode blocks allocated: mp->m_ialloc_blks * blocksize - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -STATIC uint -xfs_calc_create_resv_alloc( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + - mp->m_sb.sb_sectsize + - xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), - XFS_FSB_TO_B(mp, 1)); -} - -STATIC uint -__xfs_calc_create_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX(xfs_calc_create_resv_alloc(mp), - xfs_calc_create_resv_modify(mp)); + xfs_calc_finobt_res(mp); } /* * For icreate we can allocate some inodes giving: * the agi and agf of the ag getting the new inodes: 2 * sectorsize * the superblock for the nlink flag: sector size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - * the finobt (record insertion) + * the inode chunk (allocation, optional init) + * the inobt (record insertion) + * the finobt (optional, record insertion) */ STATIC uint xfs_calc_icreate_resv_alloc( @@ -425,10 +427,9 @@ xfs_calc_icreate_resv_alloc( { return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + mp->m_sb.sb_sectsize + - xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_finobt_res(mp, 0, 0); + xfs_calc_inode_chunk_res(mp, _ALLOC) + + xfs_calc_inobt_res(mp) + + xfs_calc_finobt_res(mp); } STATIC uint @@ -440,26 +441,12 @@ xfs_calc_icreate_reservation(xfs_mount_t *mp) } STATIC uint -xfs_calc_create_reservation( - struct xfs_mount *mp) -{ - if (xfs_sb_version_hascrc(&mp->m_sb)) - return xfs_calc_icreate_reservation(mp); - return __xfs_calc_create_reservation(mp); - -} - -STATIC uint xfs_calc_create_tmpfile_reservation( struct xfs_mount *mp) { uint res = XFS_DQUOT_LOGRES(mp); - if (xfs_sb_version_hascrc(&mp->m_sb)) - res += xfs_calc_icreate_resv_alloc(mp); - else - res += xfs_calc_create_resv_alloc(mp); - + res += xfs_calc_icreate_resv_alloc(mp); return res + xfs_calc_iunlink_add_reservation(mp); } @@ -470,7 +457,7 @@ STATIC uint xfs_calc_mkdir_reservation( struct xfs_mount *mp) { - return xfs_calc_create_reservation(mp); + return xfs_calc_icreate_reservation(mp); } @@ -483,20 +470,24 @@ STATIC uint xfs_calc_symlink_reservation( struct xfs_mount *mp) { - return xfs_calc_create_reservation(mp) + + return xfs_calc_icreate_reservation(mp) + xfs_calc_buf_res(1, XFS_SYMLINK_MAXLEN); } /* * In freeing an inode we can modify: * the inode being freed: inode size - * the super block free inode counter: sector size - * the agi hash list and counters: sector size - * the inode btree entry: block size - * the on disk inode before ours in the agi hash list: inode cluster size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size + * the super block free inode counter, AGF and AGFL: sector size + * the on disk inode (agi unlinked list removal) + * the inode chunk (invalidated, headers only) + * the inode btree * the finobt (record insertion, removal or modification) + * + * Note that the inode chunk res. includes an allocfree res. for freeing of the + * inode chunk. This is technically extraneous because the inode chunk free is + * deferred (it occurs after a transaction roll). Include the extra reservation + * anyways since we've had reports of ifree transaction overruns due to too many + * agfl fixups during inode chunk frees. */ STATIC uint xfs_calc_ifree_reservation( @@ -504,15 +495,11 @@ xfs_calc_ifree_reservation( { return XFS_DQUOT_LOGRES(mp) + xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + xfs_calc_iunlink_remove_reservation(mp) + - xfs_calc_buf_res(1, 0) + - xfs_calc_buf_res(2 + mp->m_ialloc_blks + - mp->m_in_maxlevels, 0) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_finobt_res(mp, 0, 1); + xfs_calc_inode_chunk_res(mp, _FREE) + + xfs_calc_inobt_res(mp) + + xfs_calc_finobt_res(mp); } /* @@ -842,7 +829,7 @@ xfs_trans_resv_calc( resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT; resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_create.tr_logres = xfs_calc_create_reservation(mp); + resp->tr_create.tr_logres = xfs_calc_icreate_reservation(mp); resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 2a9b4f9e93c6..fd975524f460 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -32,30 +32,17 @@ #include "xfs_inode.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" +#include "xfs_rmap.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" /* - * Set up scrub to check all the static metadata in each AG. - * This means the SB, AGF, AGI, and AGFL headers. + * Walk all the blocks in the AGFL. The fn function can return any negative + * error code or XFS_BTREE_QUERY_RANGE_ABORT. */ int -xfs_scrub_setup_ag_header( - struct xfs_scrub_context *sc, - struct xfs_inode *ip) -{ - struct xfs_mount *mp = sc->mp; - - if (sc->sm->sm_agno >= mp->m_sb.sb_agcount || - sc->sm->sm_ino || sc->sm->sm_gen) - return -EINVAL; - return xfs_scrub_setup_fs(sc, ip); -} - -/* Walk all the blocks in the AGFL. */ -int xfs_scrub_walk_agfl( struct xfs_scrub_context *sc, int (*fn)(struct xfs_scrub_context *, @@ -115,6 +102,36 @@ xfs_scrub_walk_agfl( /* Superblock */ +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_superblock_xref( + struct xfs_scrub_context *sc, + struct xfs_buf *bp) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = sc->mp; + xfs_agnumber_t agno = sc->sm->sm_agno; + xfs_agblock_t agbno; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agbno = XFS_SB_BLOCK(mp); + + error = xfs_scrub_ag_init(sc, agno, &sc->sa); + if (!xfs_scrub_xref_process_error(sc, agno, agbno, &error)) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, 1); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); + xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo); + xfs_scrub_xref_is_not_shared(sc, agbno, 1); + + /* scrub teardown will take care of sc->sa for us */ +} + /* * Scrub the filesystem superblock. * @@ -143,6 +160,22 @@ xfs_scrub_superblock( error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); + /* + * The superblock verifier can return several different error codes + * if it thinks the superblock doesn't look right. For a mount these + * would all get bounced back to userspace, but if we're here then the + * fs mounted successfully, which means that this secondary superblock + * is simply incorrect. Treat all these codes the same way we treat + * any corruption. + */ + switch (error) { + case -EINVAL: /* also -EWRONGFS */ + case -ENOSYS: + case -EFBIG: + error = -EFSCORRUPTED; + default: + break; + } if (!xfs_scrub_process_error(sc, agno, XFS_SB_BLOCK(mp), &error)) return error; @@ -387,11 +420,175 @@ xfs_scrub_superblock( BBTOB(bp->b_length) - sizeof(struct xfs_dsb))) xfs_scrub_block_set_corrupt(sc, bp); + xfs_scrub_superblock_xref(sc, bp); + return error; } /* AGF */ +/* Tally freespace record lengths. */ +STATIC int +xfs_scrub_agf_record_bno_lengths( + struct xfs_btree_cur *cur, + struct xfs_alloc_rec_incore *rec, + void *priv) +{ + xfs_extlen_t *blocks = priv; + + (*blocks) += rec->ar_blockcount; + return 0; +} + +/* Check agf_freeblks */ +static inline void +xfs_scrub_agf_xref_freeblks( + struct xfs_scrub_context *sc) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + xfs_extlen_t blocks = 0; + int error; + + if (!sc->sa.bno_cur) + return; + + error = xfs_alloc_query_all(sc->sa.bno_cur, + xfs_scrub_agf_record_bno_lengths, &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.bno_cur)) + return; + if (blocks != be32_to_cpu(agf->agf_freeblks)) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp); +} + +/* Cross reference the AGF with the cntbt (freespace by length btree) */ +static inline void +xfs_scrub_agf_xref_cntbt( + struct xfs_scrub_context *sc) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + xfs_agblock_t agbno; + xfs_extlen_t blocks; + int have; + int error; + + if (!sc->sa.cnt_cur) + return; + + /* Any freespace at all? */ + error = xfs_alloc_lookup_le(sc->sa.cnt_cur, 0, -1U, &have); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.cnt_cur)) + return; + if (!have) { + if (agf->agf_freeblks != be32_to_cpu(0)) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp); + return; + } + + /* Check agf_longest */ + error = xfs_alloc_get_rec(sc->sa.cnt_cur, &agbno, &blocks, &have); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.cnt_cur)) + return; + if (!have || blocks != be32_to_cpu(agf->agf_longest)) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp); +} + +/* Check the btree block counts in the AGF against the btrees. */ +STATIC void +xfs_scrub_agf_xref_btreeblks( + struct xfs_scrub_context *sc) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_mount *mp = sc->mp; + xfs_agblock_t blocks; + xfs_agblock_t btreeblks; + int error; + + /* Check agf_rmap_blocks; set up for agf_btreeblks check */ + if (sc->sa.rmap_cur) { + error = xfs_btree_count_blocks(sc->sa.rmap_cur, &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + btreeblks = blocks - 1; + if (blocks != be32_to_cpu(agf->agf_rmap_blocks)) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp); + } else { + btreeblks = 0; + } + + /* + * No rmap cursor; we can't xref if we have the rmapbt feature. + * We also can't do it if we're missing the free space btree cursors. + */ + if ((xfs_sb_version_hasrmapbt(&mp->m_sb) && !sc->sa.rmap_cur) || + !sc->sa.bno_cur || !sc->sa.cnt_cur) + return; + + /* Check agf_btreeblks */ + error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.bno_cur)) + return; + btreeblks += blocks - 1; + + error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.cnt_cur)) + return; + btreeblks += blocks - 1; + + if (btreeblks != be32_to_cpu(agf->agf_btreeblks)) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp); +} + +/* Check agf_refcount_blocks against tree size */ +static inline void +xfs_scrub_agf_xref_refcblks( + struct xfs_scrub_context *sc) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + xfs_agblock_t blocks; + int error; + + if (!sc->sa.refc_cur) + return; + + error = xfs_btree_count_blocks(sc->sa.refc_cur, &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (blocks != be32_to_cpu(agf->agf_refcount_blocks)) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_agf_xref( + struct xfs_scrub_context *sc) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = sc->mp; + xfs_agblock_t agbno; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agbno = XFS_AGF_BLOCK(mp); + + error = xfs_scrub_ag_btcur_init(sc, &sc->sa); + if (error) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, 1); + xfs_scrub_agf_xref_freeblks(sc); + xfs_scrub_agf_xref_cntbt(sc); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); + xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo); + xfs_scrub_agf_xref_btreeblks(sc); + xfs_scrub_xref_is_not_shared(sc, agbno, 1); + xfs_scrub_agf_xref_refcblks(sc); + + /* scrub teardown will take care of sc->sa for us */ +} + /* Scrub the AGF. */ int xfs_scrub_agf( @@ -414,6 +611,7 @@ xfs_scrub_agf( &sc->sa.agf_bp, &sc->sa.agfl_bp); if (!xfs_scrub_process_error(sc, agno, XFS_AGF_BLOCK(sc->mp), &error)) goto out; + xfs_scrub_buffer_recheck(sc, sc->sa.agf_bp); agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); @@ -470,6 +668,7 @@ xfs_scrub_agf( if (agfl_count != 0 && fl_count != agfl_count) xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + xfs_scrub_agf_xref(sc); out: return error; } @@ -477,11 +676,28 @@ out: /* AGFL */ struct xfs_scrub_agfl_info { + struct xfs_owner_info oinfo; unsigned int sz_entries; unsigned int nr_entries; xfs_agblock_t *entries; }; +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_agfl_block_xref( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + struct xfs_owner_info *oinfo) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, 1); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1); + xfs_scrub_xref_is_owned_by(sc, agbno, 1, oinfo); + xfs_scrub_xref_is_not_shared(sc, agbno, 1); +} + /* Scrub an AGFL block. */ STATIC int xfs_scrub_agfl_block( @@ -499,6 +715,8 @@ xfs_scrub_agfl_block( else xfs_scrub_block_set_corrupt(sc, sc->sa.agfl_bp); + xfs_scrub_agfl_block_xref(sc, agbno, priv); + return 0; } @@ -513,6 +731,37 @@ xfs_scrub_agblock_cmp( return (int)*a - (int)*b; } +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_agfl_xref( + struct xfs_scrub_context *sc) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = sc->mp; + xfs_agblock_t agbno; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agbno = XFS_AGFL_BLOCK(mp); + + error = xfs_scrub_ag_btcur_init(sc, &sc->sa); + if (error) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, 1); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); + xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo); + xfs_scrub_xref_is_not_shared(sc, agbno, 1); + + /* + * Scrub teardown will take care of sc->sa for us. Leave sc->sa + * active so that the agfl block xref can use it too. + */ +} + /* Scrub the AGFL. */ int xfs_scrub_agfl( @@ -532,6 +781,12 @@ xfs_scrub_agfl( goto out; if (!sc->sa.agf_bp) return -EFSCORRUPTED; + xfs_scrub_buffer_recheck(sc, sc->sa.agfl_bp); + + xfs_scrub_agfl_xref(sc); + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; /* Allocate buffer to ensure uniqueness of AGFL entries. */ agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); @@ -548,6 +803,7 @@ xfs_scrub_agfl( } /* Check the blocks in the AGFL. */ + xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG); error = xfs_scrub_walk_agfl(sc, xfs_scrub_agfl_block, &sai); if (error) goto out_free; @@ -575,6 +831,56 @@ out: /* AGI */ +/* Check agi_count/agi_freecount */ +static inline void +xfs_scrub_agi_xref_icounts( + struct xfs_scrub_context *sc) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); + xfs_agino_t icount; + xfs_agino_t freecount; + int error; + + if (!sc->sa.ino_cur) + return; + + error = xfs_ialloc_count_inodes(sc->sa.ino_cur, &icount, &freecount); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.ino_cur)) + return; + if (be32_to_cpu(agi->agi_count) != icount || + be32_to_cpu(agi->agi_freecount) != freecount) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agi_bp); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_agi_xref( + struct xfs_scrub_context *sc) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = sc->mp; + xfs_agblock_t agbno; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agbno = XFS_AGI_BLOCK(mp); + + error = xfs_scrub_ag_btcur_init(sc, &sc->sa); + if (error) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, 1); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1); + xfs_scrub_agi_xref_icounts(sc); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); + xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo); + xfs_scrub_xref_is_not_shared(sc, agbno, 1); + + /* scrub teardown will take care of sc->sa for us */ +} + /* Scrub the AGI. */ int xfs_scrub_agi( @@ -598,6 +904,7 @@ xfs_scrub_agi( &sc->sa.agf_bp, &sc->sa.agfl_bp); if (!xfs_scrub_process_error(sc, agno, XFS_AGI_BLOCK(sc->mp), &error)) goto out; + xfs_scrub_buffer_recheck(sc, sc->sa.agi_bp); agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); @@ -653,6 +960,7 @@ xfs_scrub_agi( if (agi->agi_pad32 != cpu_to_be32(0)) xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + xfs_scrub_agi_xref(sc); out: return error; } diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 059663e13414..517c079d3f68 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -31,6 +31,7 @@ #include "xfs_sb.h" #include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_alloc.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -49,6 +50,64 @@ xfs_scrub_setup_ag_allocbt( } /* Free space btree scrubber. */ +/* + * Ensure there's a corresponding cntbt/bnobt record matching this + * bnobt/cntbt record, respectively. + */ +STATIC void +xfs_scrub_allocbt_xref_other( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_btree_cur **pcur; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int has_otherrec; + int error; + + if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT) + pcur = &sc->sa.cnt_cur; + else + pcur = &sc->sa.bno_cur; + if (!*pcur) + return; + + error = xfs_alloc_lookup_le(*pcur, agbno, len, &has_otherrec); + if (!xfs_scrub_should_check_xref(sc, &error, pcur)) + return; + if (!has_otherrec) { + xfs_scrub_btree_xref_set_corrupt(sc, *pcur, 0); + return; + } + + error = xfs_alloc_get_rec(*pcur, &fbno, &flen, &has_otherrec); + if (!xfs_scrub_should_check_xref(sc, &error, pcur)) + return; + if (!has_otherrec) { + xfs_scrub_btree_xref_set_corrupt(sc, *pcur, 0); + return; + } + + if (fbno != agbno || flen != len) + xfs_scrub_btree_xref_set_corrupt(sc, *pcur, 0); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_allocbt_xref( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xfs_scrub_allocbt_xref_other(sc, agbno, len); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, len); + xfs_scrub_xref_has_no_owner(sc, agbno, len); + xfs_scrub_xref_is_not_shared(sc, agbno, len); +} /* Scrub a bnobt/cntbt record. */ STATIC int @@ -70,6 +129,8 @@ xfs_scrub_allocbt_rec( !xfs_verify_agbno(mp, agno, bno + len - 1)) xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + xfs_scrub_allocbt_xref(bs->sc, bno, len); + return error; } @@ -100,3 +161,23 @@ xfs_scrub_cntbt( { return xfs_scrub_allocbt(sc, XFS_BTNUM_CNT); } + +/* xref check that the extent is not free */ +void +xfs_scrub_xref_is_used_space( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + bool is_freesp; + int error; + + if (!sc->sa.bno_cur) + return; + + error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.bno_cur)) + return; + if (is_freesp) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.bno_cur, 0); +} diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 42fec0bcd9e1..d00282130492 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -37,6 +37,7 @@ #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" #include "xfs_rmap.h" +#include "xfs_refcount.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -99,6 +100,201 @@ struct xfs_scrub_bmap_info { int whichfork; }; +/* Look for a corresponding rmap for this irec. */ +static inline bool +xfs_scrub_bmap_get_rmap( + struct xfs_scrub_bmap_info *info, + struct xfs_bmbt_irec *irec, + xfs_agblock_t agbno, + uint64_t owner, + struct xfs_rmap_irec *rmap) +{ + xfs_fileoff_t offset; + unsigned int rflags = 0; + int has_rmap; + int error; + + if (info->whichfork == XFS_ATTR_FORK) + rflags |= XFS_RMAP_ATTR_FORK; + + /* + * CoW staging extents are owned (on disk) by the refcountbt, so + * their rmaps do not have offsets. + */ + if (info->whichfork == XFS_COW_FORK) + offset = 0; + else + offset = irec->br_startoff; + + /* + * If the caller thinks this could be a shared bmbt extent (IOWs, + * any data fork extent of a reflink inode) then we have to use the + * range rmap lookup to make sure we get the correct owner/offset. + */ + if (info->is_shared) { + error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno, + owner, offset, rflags, rmap, &has_rmap); + if (!xfs_scrub_should_check_xref(info->sc, &error, + &info->sc->sa.rmap_cur)) + return false; + goto out; + } + + /* + * Otherwise, use the (faster) regular lookup. + */ + error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 0, owner, + offset, rflags, &has_rmap); + if (!xfs_scrub_should_check_xref(info->sc, &error, + &info->sc->sa.rmap_cur)) + return false; + if (!has_rmap) + goto out; + + error = xfs_rmap_get_rec(info->sc->sa.rmap_cur, rmap, &has_rmap); + if (!xfs_scrub_should_check_xref(info->sc, &error, + &info->sc->sa.rmap_cur)) + return false; + +out: + if (!has_rmap) + xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return has_rmap; +} + +/* Make sure that we have rmapbt records for this extent. */ +STATIC void +xfs_scrub_bmap_xref_rmap( + struct xfs_scrub_bmap_info *info, + struct xfs_bmbt_irec *irec, + xfs_agblock_t agbno) +{ + struct xfs_rmap_irec rmap; + unsigned long long rmap_end; + uint64_t owner; + + if (!info->sc->sa.rmap_cur) + return; + + if (info->whichfork == XFS_COW_FORK) + owner = XFS_RMAP_OWN_COW; + else + owner = info->sc->ip->i_ino; + + /* Find the rmap record for this irec. */ + if (!xfs_scrub_bmap_get_rmap(info, irec, agbno, owner, &rmap)) + return; + + /* Check the rmap. */ + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; + if (rmap.rm_startblock > agbno || + agbno + irec->br_blockcount > rmap_end) + xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* + * Check the logical offsets if applicable. CoW staging extents + * don't track logical offsets since the mappings only exist in + * memory. + */ + if (info->whichfork != XFS_COW_FORK) { + rmap_end = (unsigned long long)rmap.rm_offset + + rmap.rm_blockcount; + if (rmap.rm_offset > irec->br_startoff || + irec->br_startoff + irec->br_blockcount > rmap_end) + xfs_scrub_fblock_xref_set_corrupt(info->sc, + info->whichfork, irec->br_startoff); + } + + if (rmap.rm_owner != owner) + xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* + * Check for discrepancies between the unwritten flag in the irec and + * the rmap. Note that the (in-memory) CoW fork distinguishes between + * unwritten and written extents, but we don't track that in the rmap + * records because the blocks are owned (on-disk) by the refcountbt, + * which doesn't track unwritten state. + */ + if (owner != XFS_RMAP_OWN_COW && + irec->br_state == XFS_EXT_UNWRITTEN && + !(rmap.rm_flags & XFS_RMAP_UNWRITTEN)) + xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + if (info->whichfork == XFS_ATTR_FORK && + !(rmap.rm_flags & XFS_RMAP_ATTR_FORK)) + xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) + xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); +} + +/* Cross-reference a single rtdev extent record. */ +STATIC void +xfs_scrub_bmap_rt_extent_xref( + struct xfs_scrub_bmap_info *info, + struct xfs_inode *ip, + struct xfs_btree_cur *cur, + struct xfs_bmbt_irec *irec) +{ + if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xfs_scrub_xref_is_used_rt_space(info->sc, irec->br_startblock, + irec->br_blockcount); +} + +/* Cross-reference a single datadev extent record. */ +STATIC void +xfs_scrub_bmap_extent_xref( + struct xfs_scrub_bmap_info *info, + struct xfs_inode *ip, + struct xfs_btree_cur *cur, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = info->sc->mp; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t len; + int error; + + if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); + len = irec->br_blockcount; + + error = xfs_scrub_ag_init(info->sc, agno, &info->sc->sa); + if (!xfs_scrub_fblock_process_error(info->sc, info->whichfork, + irec->br_startoff, &error)) + return; + + xfs_scrub_xref_is_used_space(info->sc, agbno, len); + xfs_scrub_xref_is_not_inode_chunk(info->sc, agbno, len); + xfs_scrub_bmap_xref_rmap(info, irec, agbno); + switch (info->whichfork) { + case XFS_DATA_FORK: + if (xfs_is_reflink_inode(info->sc->ip)) + break; + /* fall through */ + case XFS_ATTR_FORK: + xfs_scrub_xref_is_not_shared(info->sc, agbno, + irec->br_blockcount); + break; + case XFS_COW_FORK: + xfs_scrub_xref_is_cow_staging(info->sc, agbno, + irec->br_blockcount); + break; + } + + xfs_scrub_ag_free(info->sc, &info->sc->sa); +} + /* Scrub a single extent record. */ STATIC int xfs_scrub_bmap_extent( @@ -109,6 +305,7 @@ xfs_scrub_bmap_extent( { struct xfs_mount *mp = info->sc->mp; struct xfs_buf *bp = NULL; + xfs_filblks_t end; int error = 0; if (cur) @@ -136,19 +333,23 @@ xfs_scrub_bmap_extent( irec->br_startoff); /* Make sure the extent points to a valid place. */ + if (irec->br_blockcount > MAXEXTLEN) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); if (irec->br_startblock + irec->br_blockcount <= irec->br_startblock) xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + end = irec->br_startblock + irec->br_blockcount - 1; if (info->is_rt && (!xfs_verify_rtbno(mp, irec->br_startblock) || - !xfs_verify_rtbno(mp, irec->br_startblock + - irec->br_blockcount - 1))) + !xfs_verify_rtbno(mp, end))) xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); if (!info->is_rt && (!xfs_verify_fsbno(mp, irec->br_startblock) || - !xfs_verify_fsbno(mp, irec->br_startblock + - irec->br_blockcount - 1))) + !xfs_verify_fsbno(mp, end) || + XFS_FSB_TO_AGNO(mp, irec->br_startblock) != + XFS_FSB_TO_AGNO(mp, end))) xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -158,6 +359,11 @@ xfs_scrub_bmap_extent( xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + if (info->is_rt) + xfs_scrub_bmap_rt_extent_xref(info, ip, cur, irec); + else + xfs_scrub_bmap_extent_xref(info, ip, cur, irec); + info->lastoff = irec->br_startoff + irec->br_blockcount; return error; } @@ -235,7 +441,6 @@ xfs_scrub_bmap( struct xfs_ifork *ifp; xfs_fileoff_t endoff; struct xfs_iext_cursor icur; - bool found; int error = 0; ifp = XFS_IFORK_PTR(ip, whichfork); @@ -314,9 +519,7 @@ xfs_scrub_bmap( /* Scrub extent records. */ info.lastoff = 0; ifp = XFS_IFORK_PTR(ip, whichfork); - for (found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &irec); - found != 0; - found = xfs_iext_next_extent(ifp, &icur, &irec)) { + for_each_xfs_iext(ifp, &icur, &irec) { if (xfs_scrub_should_terminate(sc, &error)) break; if (isnullstartblock(irec.br_startblock)) diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index df0766132ace..54218168c8f9 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -42,12 +42,14 @@ * Check for btree operation errors. See the section about handling * operational errors in common.c. */ -bool -xfs_scrub_btree_process_error( +static bool +__xfs_scrub_btree_process_error( struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, int level, - int *error) + int *error, + __u32 errflag, + void *ret_ip) { if (*error == 0) return true; @@ -60,36 +62,80 @@ xfs_scrub_btree_process_error( case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ - sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + sc->sm->sm_flags |= errflag; *error = 0; /* fall through */ default: if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) trace_xfs_scrub_ifork_btree_op_error(sc, cur, level, - *error, __return_address); + *error, ret_ip); else trace_xfs_scrub_btree_op_error(sc, cur, level, - *error, __return_address); + *error, ret_ip); break; } return false; } +bool +xfs_scrub_btree_process_error( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + int level, + int *error) +{ + return __xfs_scrub_btree_process_error(sc, cur, level, error, + XFS_SCRUB_OFLAG_CORRUPT, __return_address); +} + +bool +xfs_scrub_btree_xref_process_error( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + int level, + int *error) +{ + return __xfs_scrub_btree_process_error(sc, cur, level, error, + XFS_SCRUB_OFLAG_XFAIL, __return_address); +} + /* Record btree block corruption. */ -void -xfs_scrub_btree_set_corrupt( +static void +__xfs_scrub_btree_set_corrupt( struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, - int level) + int level, + __u32 errflag, + void *ret_ip) { - sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + sc->sm->sm_flags |= errflag; if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) trace_xfs_scrub_ifork_btree_error(sc, cur, level, - __return_address); + ret_ip); else trace_xfs_scrub_btree_error(sc, cur, level, - __return_address); + ret_ip); +} + +void +xfs_scrub_btree_set_corrupt( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + int level) +{ + __xfs_scrub_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT, + __return_address); +} + +void +xfs_scrub_btree_xref_set_corrupt( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + int level) +{ + __xfs_scrub_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT, + __return_address); } /* @@ -268,6 +314,8 @@ xfs_scrub_btree_block_check_sibling( pp = xfs_btree_ptr_addr(ncur, ncur->bc_ptrs[level + 1], pblock); if (!xfs_scrub_btree_ptr_ok(bs, level + 1, pp)) goto out; + if (pbp) + xfs_scrub_buffer_recheck(bs->sc, pbp); if (xfs_btree_diff_two_ptrs(cur, pp, sibling)) xfs_scrub_btree_set_corrupt(bs->sc, cur, level); @@ -315,6 +363,97 @@ out: return error; } +struct check_owner { + struct list_head list; + xfs_daddr_t daddr; + int level; +}; + +/* + * Make sure this btree block isn't in the free list and that there's + * an rmap record for it. + */ +STATIC int +xfs_scrub_btree_check_block_owner( + struct xfs_scrub_btree *bs, + int level, + xfs_daddr_t daddr) +{ + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_btnum_t btnum; + bool init_sa; + int error = 0; + + if (!bs->cur) + return 0; + + btnum = bs->cur->bc_btnum; + agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr); + agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr); + + init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS; + if (init_sa) { + error = xfs_scrub_ag_init(bs->sc, agno, &bs->sc->sa); + if (!xfs_scrub_btree_xref_process_error(bs->sc, bs->cur, + level, &error)) + return error; + } + + xfs_scrub_xref_is_used_space(bs->sc, agbno, 1); + /* + * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we + * have to nullify it (to shut down further block owner checks) if + * self-xref encounters problems. + */ + if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO) + bs->cur = NULL; + + xfs_scrub_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo); + if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP) + bs->cur = NULL; + + if (init_sa) + xfs_scrub_ag_free(bs->sc, &bs->sc->sa); + + return error; +} + +/* Check the owner of a btree block. */ +STATIC int +xfs_scrub_btree_check_owner( + struct xfs_scrub_btree *bs, + int level, + struct xfs_buf *bp) +{ + struct xfs_btree_cur *cur = bs->cur; + struct check_owner *co; + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && bp == NULL) + return 0; + + /* + * We want to cross-reference each btree block with the bnobt + * and the rmapbt. We cannot cross-reference the bnobt or + * rmapbt while scanning the bnobt or rmapbt, respectively, + * because we cannot alter the cursor and we'd prefer not to + * duplicate cursors. Therefore, save the buffer daddr for + * later scanning. + */ + if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { + co = kmem_alloc(sizeof(struct check_owner), + KM_MAYFAIL | KM_NOFS); + if (!co) + return -ENOMEM; + co->level = level; + co->daddr = XFS_BUF_ADDR(bp); + list_add_tail(&co->list, &bs->to_check); + return 0; + } + + return xfs_scrub_btree_check_block_owner(bs, level, XFS_BUF_ADDR(bp)); +} + /* * Grab and scrub a btree block given a btree pointer. Returns block * and buffer pointers (if applicable) if they're ok to use. @@ -349,6 +488,16 @@ xfs_scrub_btree_get_block( xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level); return 0; } + if (*pbp) + xfs_scrub_buffer_recheck(bs->sc, *pbp); + + /* + * Check the block's owner; this function absorbs error codes + * for us. + */ + error = xfs_scrub_btree_check_owner(bs, level, *pbp); + if (error) + return error; /* * Check the block's siblings; this function absorbs error codes @@ -421,6 +570,8 @@ xfs_scrub_btree( struct xfs_btree_block *block; int level; struct xfs_buf *bp; + struct check_owner *co; + struct check_owner *n; int i; int error = 0; @@ -512,5 +663,14 @@ xfs_scrub_btree( } out: + /* Process deferred owner checks on btree blocks. */ + list_for_each_entry_safe(co, n, &bs.to_check, list) { + if (!error && bs.cur) + error = xfs_scrub_btree_check_block_owner(&bs, + co->level, co->daddr); + list_del(&co->list); + kmem_free(co); + } + return error; } diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h index 4de825a626d1..e2b868ede70b 100644 --- a/fs/xfs/scrub/btree.h +++ b/fs/xfs/scrub/btree.h @@ -26,10 +26,19 @@ bool xfs_scrub_btree_process_error(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, int level, int *error); +/* Check for btree xref operation errors. */ +bool xfs_scrub_btree_xref_process_error(struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, int level, + int *error); + /* Check for btree corruption. */ void xfs_scrub_btree_set_corrupt(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, int level); +/* Check for btree xref discrepancies. */ +void xfs_scrub_btree_xref_set_corrupt(struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, int level); + struct xfs_scrub_btree; typedef int (*xfs_scrub_btree_rec_fn)( struct xfs_scrub_btree *bs, diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index ac95fe911d96..8033ab9d8f47 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -78,12 +78,14 @@ */ /* Check for operational errors. */ -bool -xfs_scrub_process_error( +static bool +__xfs_scrub_process_error( struct xfs_scrub_context *sc, xfs_agnumber_t agno, xfs_agblock_t bno, - int *error) + int *error, + __u32 errflag, + void *ret_ip) { switch (*error) { case 0: @@ -95,24 +97,48 @@ xfs_scrub_process_error( case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ - sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + sc->sm->sm_flags |= errflag; *error = 0; /* fall through */ default: trace_xfs_scrub_op_error(sc, agno, bno, *error, - __return_address); + ret_ip); break; } return false; } -/* Check for operational errors for a file offset. */ bool -xfs_scrub_fblock_process_error( +xfs_scrub_process_error( + struct xfs_scrub_context *sc, + xfs_agnumber_t agno, + xfs_agblock_t bno, + int *error) +{ + return __xfs_scrub_process_error(sc, agno, bno, error, + XFS_SCRUB_OFLAG_CORRUPT, __return_address); +} + +bool +xfs_scrub_xref_process_error( + struct xfs_scrub_context *sc, + xfs_agnumber_t agno, + xfs_agblock_t bno, + int *error) +{ + return __xfs_scrub_process_error(sc, agno, bno, error, + XFS_SCRUB_OFLAG_XFAIL, __return_address); +} + +/* Check for operational errors for a file offset. */ +static bool +__xfs_scrub_fblock_process_error( struct xfs_scrub_context *sc, int whichfork, xfs_fileoff_t offset, - int *error) + int *error, + __u32 errflag, + void *ret_ip) { switch (*error) { case 0: @@ -124,17 +150,39 @@ xfs_scrub_fblock_process_error( case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ - sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + sc->sm->sm_flags |= errflag; *error = 0; /* fall through */ default: trace_xfs_scrub_file_op_error(sc, whichfork, offset, *error, - __return_address); + ret_ip); break; } return false; } +bool +xfs_scrub_fblock_process_error( + struct xfs_scrub_context *sc, + int whichfork, + xfs_fileoff_t offset, + int *error) +{ + return __xfs_scrub_fblock_process_error(sc, whichfork, offset, error, + XFS_SCRUB_OFLAG_CORRUPT, __return_address); +} + +bool +xfs_scrub_fblock_xref_process_error( + struct xfs_scrub_context *sc, + int whichfork, + xfs_fileoff_t offset, + int *error) +{ + return __xfs_scrub_fblock_process_error(sc, whichfork, offset, error, + XFS_SCRUB_OFLAG_XFAIL, __return_address); +} + /* * Handling scrub corruption/optimization/warning checks. * @@ -183,6 +231,16 @@ xfs_scrub_block_set_corrupt( trace_xfs_scrub_block_error(sc, bp->b_bn, __return_address); } +/* Record a corruption while cross-referencing. */ +void +xfs_scrub_block_xref_set_corrupt( + struct xfs_scrub_context *sc, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT; + trace_xfs_scrub_block_error(sc, bp->b_bn, __return_address); +} + /* * Record a corrupt inode. The trace data will include the block given * by bp if bp is given; otherwise it will use the block location of the @@ -198,6 +256,17 @@ xfs_scrub_ino_set_corrupt( trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0, __return_address); } +/* Record a corruption while cross-referencing with an inode. */ +void +xfs_scrub_ino_xref_set_corrupt( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT; + trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0, __return_address); +} + /* Record corruption in a block indexed by a file fork. */ void xfs_scrub_fblock_set_corrupt( @@ -209,6 +278,17 @@ xfs_scrub_fblock_set_corrupt( trace_xfs_scrub_fblock_error(sc, whichfork, offset, __return_address); } +/* Record a corruption while cross-referencing a fork block. */ +void +xfs_scrub_fblock_xref_set_corrupt( + struct xfs_scrub_context *sc, + int whichfork, + xfs_fileoff_t offset) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT; + trace_xfs_scrub_fblock_error(sc, whichfork, offset, __return_address); +} + /* * Warn about inodes that need administrative review but is not * incorrect. @@ -245,6 +325,59 @@ xfs_scrub_set_incomplete( } /* + * rmap scrubbing -- compute the number of blocks with a given owner, + * at least according to the reverse mapping data. + */ + +struct xfs_scrub_rmap_ownedby_info { + struct xfs_owner_info *oinfo; + xfs_filblks_t *blocks; +}; + +STATIC int +xfs_scrub_count_rmap_ownedby_irec( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_scrub_rmap_ownedby_info *sroi = priv; + bool irec_attr; + bool oinfo_attr; + + irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK; + oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK; + + if (rec->rm_owner != sroi->oinfo->oi_owner) + return 0; + + if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr) + (*sroi->blocks) += rec->rm_blockcount; + + return 0; +} + +/* + * Calculate the number of blocks the rmap thinks are owned by something. + * The caller should pass us an rmapbt cursor. + */ +int +xfs_scrub_count_rmap_ownedby_ag( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + struct xfs_owner_info *oinfo, + xfs_filblks_t *blocks) +{ + struct xfs_scrub_rmap_ownedby_info sroi; + + sroi.oinfo = oinfo; + *blocks = 0; + sroi.blocks = blocks; + + return xfs_rmap_query_all(cur, xfs_scrub_count_rmap_ownedby_irec, + &sroi); +} + +/* * AG scrubbing * * These helpers facilitate locking an allocation group's header @@ -302,7 +435,7 @@ xfs_scrub_ag_read_headers( error = xfs_alloc_read_agfl(mp, sc->tp, agno, agfl); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) goto out; - + error = 0; out: return error; } @@ -472,7 +605,7 @@ xfs_scrub_setup_ag_btree( return error; } - error = xfs_scrub_setup_ag_header(sc, ip); + error = xfs_scrub_setup_fs(sc, ip); if (error) return error; @@ -503,18 +636,11 @@ xfs_scrub_get_inode( struct xfs_scrub_context *sc, struct xfs_inode *ip_in) { + struct xfs_imap imap; struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = NULL; int error; - /* - * If userspace passed us an AG number or a generation number - * without an inode number, they haven't got a clue so bail out - * immediately. - */ - if (sc->sm->sm_agno || (sc->sm->sm_gen && !sc->sm->sm_ino)) - return -EINVAL; - /* We want to scan the inode we already had opened. */ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { sc->ip = ip_in; @@ -526,10 +652,33 @@ xfs_scrub_get_inode( return -ENOENT; error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip); - if (error == -ENOENT || error == -EINVAL) { - /* inode doesn't exist... */ - return -ENOENT; - } else if (error) { + switch (error) { + case -ENOENT: + /* Inode doesn't exist, just bail out. */ + return error; + case 0: + /* Got an inode, continue. */ + break; + case -EINVAL: + /* + * -EINVAL with IGET_UNTRUSTED could mean one of several + * things: userspace gave us an inode number that doesn't + * correspond to fs space, or doesn't have an inobt entry; + * or it could simply mean that the inode buffer failed the + * read verifiers. + * + * Try just the inode mapping lookup -- if it succeeds, then + * the inode buffer verifier failed and something needs fixing. + * Otherwise, we really couldn't find it so tell userspace + * that it no longer exists. + */ + error = xfs_imap(sc->mp, sc->tp, sc->sm->sm_ino, &imap, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE); + if (error) + return -ENOENT; + error = -EFSCORRUPTED; + /* fall through */ + default: trace_xfs_scrub_op_error(sc, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), @@ -572,3 +721,61 @@ out: /* scrub teardown will unlock and release the inode for us */ return error; } + +/* + * Predicate that decides if we need to evaluate the cross-reference check. + * If there was an error accessing the cross-reference btree, just delete + * the cursor and skip the check. + */ +bool +xfs_scrub_should_check_xref( + struct xfs_scrub_context *sc, + int *error, + struct xfs_btree_cur **curpp) +{ + if (*error == 0) + return true; + + if (curpp) { + /* If we've already given up on xref, just bail out. */ + if (!*curpp) + return false; + + /* xref error, delete cursor and bail out. */ + xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR); + *curpp = NULL; + } + + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; + trace_xfs_scrub_xref_error(sc, *error, __return_address); + + /* + * Errors encountered during cross-referencing with another + * data structure should not cause this scrubber to abort. + */ + *error = 0; + return false; +} + +/* Run the structure verifiers on in-memory buffers to detect bad memory. */ +void +xfs_scrub_buffer_recheck( + struct xfs_scrub_context *sc, + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (bp->b_ops == NULL) { + xfs_scrub_block_set_corrupt(sc, bp); + return; + } + if (bp->b_ops->verify_struct == NULL) { + xfs_scrub_set_incomplete(sc); + return; + } + fa = bp->b_ops->verify_struct(bp); + if (!fa) + return; + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xfs_scrub_block_error(sc, bp->b_bn, fa); +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 5c043855570e..ddb65d22c76a 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -56,6 +56,11 @@ bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno, bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork, xfs_fileoff_t offset, int *error); +bool xfs_scrub_xref_process_error(struct xfs_scrub_context *sc, + xfs_agnumber_t agno, xfs_agblock_t bno, int *error); +bool xfs_scrub_fblock_xref_process_error(struct xfs_scrub_context *sc, + int whichfork, xfs_fileoff_t offset, int *error); + void xfs_scrub_block_set_preen(struct xfs_scrub_context *sc, struct xfs_buf *bp); void xfs_scrub_ino_set_preen(struct xfs_scrub_context *sc, xfs_ino_t ino, @@ -68,6 +73,13 @@ void xfs_scrub_ino_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino, void xfs_scrub_fblock_set_corrupt(struct xfs_scrub_context *sc, int whichfork, xfs_fileoff_t offset); +void xfs_scrub_block_xref_set_corrupt(struct xfs_scrub_context *sc, + struct xfs_buf *bp); +void xfs_scrub_ino_xref_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino, + struct xfs_buf *bp); +void xfs_scrub_fblock_xref_set_corrupt(struct xfs_scrub_context *sc, + int whichfork, xfs_fileoff_t offset); + void xfs_scrub_ino_set_warning(struct xfs_scrub_context *sc, xfs_ino_t ino, struct xfs_buf *bp); void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork, @@ -76,10 +88,12 @@ void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork, void xfs_scrub_set_incomplete(struct xfs_scrub_context *sc); int xfs_scrub_checkpoint_log(struct xfs_mount *mp); +/* Are we set up for a cross-referencing check? */ +bool xfs_scrub_should_check_xref(struct xfs_scrub_context *sc, int *error, + struct xfs_btree_cur **curpp); + /* Setup functions */ int xfs_scrub_setup_fs(struct xfs_scrub_context *sc, struct xfs_inode *ip); -int xfs_scrub_setup_ag_header(struct xfs_scrub_context *sc, - struct xfs_inode *ip); int xfs_scrub_setup_ag_allocbt(struct xfs_scrub_context *sc, struct xfs_inode *ip); int xfs_scrub_setup_ag_iallocbt(struct xfs_scrub_context *sc, @@ -134,11 +148,16 @@ int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc, int (*fn)(struct xfs_scrub_context *, xfs_agblock_t bno, void *), void *priv); +int xfs_scrub_count_rmap_ownedby_ag(struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + struct xfs_owner_info *oinfo, + xfs_filblks_t *blocks); int xfs_scrub_setup_ag_btree(struct xfs_scrub_context *sc, struct xfs_inode *ip, bool force_log); int xfs_scrub_get_inode(struct xfs_scrub_context *sc, struct xfs_inode *ip_in); int xfs_scrub_setup_inode_contents(struct xfs_scrub_context *sc, struct xfs_inode *ip, unsigned int resblks); +void xfs_scrub_buffer_recheck(struct xfs_scrub_context *sc, struct xfs_buf *bp); #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index d94edd93cba8..bffdb7dc09bf 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -233,11 +233,28 @@ xfs_scrub_da_btree_write_verify( return; } } +static void * +xfs_scrub_da_btree_verify( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + return bp->b_ops->verify_struct(bp); + default: + bp->b_ops = &xfs_da3_node_buf_ops; + return bp->b_ops->verify_struct(bp); + } +} static const struct xfs_buf_ops xfs_scrub_da_btree_buf_ops = { .name = "xfs_scrub_da_btree", .verify_read = xfs_scrub_da_btree_read_verify, .verify_write = xfs_scrub_da_btree_write_verify, + .verify_struct = xfs_scrub_da_btree_verify, }; /* Check a block's sibling. */ @@ -276,6 +293,9 @@ xfs_scrub_da_btree_block_check_sibling( xfs_scrub_da_set_corrupt(ds, level); return error; } + if (ds->state->altpath.blk[level].bp) + xfs_scrub_buffer_recheck(ds->sc, + ds->state->altpath.blk[level].bp); /* Compare upper level pointer to sibling pointer. */ if (ds->state->altpath.blk[level].blkno != sibling) @@ -358,6 +378,8 @@ xfs_scrub_da_btree_block( &xfs_scrub_da_btree_buf_ops); if (!xfs_scrub_da_process_error(ds, level, &error)) goto out_nobuf; + if (blk->bp) + xfs_scrub_buffer_recheck(ds->sc, blk->bp); /* * We didn't find a dir btree root block, which means that diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 69e1efdd4019..50b6a26b0299 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -92,7 +92,7 @@ xfs_scrub_dir_check_ftype( * inodes can trigger immediate inactive cleanup of the inode. */ error = xfs_iget(mp, sdc->sc->tp, inum, 0, 0, &ip); - if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, + if (!xfs_scrub_fblock_xref_process_error(sdc->sc, XFS_DATA_FORK, offset, &error)) goto out; @@ -200,6 +200,7 @@ xfs_scrub_dir_rec( struct xfs_inode *dp = ds->dargs.dp; struct xfs_dir2_data_entry *dent; struct xfs_buf *bp; + char *p, *endp; xfs_ino_t ino; xfs_dablk_t rec_bno; xfs_dir2_db_t db; @@ -237,9 +238,37 @@ xfs_scrub_dir_rec( xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); goto out; } + xfs_scrub_buffer_recheck(ds->sc, bp); - /* Retrieve the entry, sanity check it, and compare hashes. */ dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off); + + /* Make sure we got a real directory entry. */ + p = (char *)mp->m_dir_inode_ops->data_entry_p(bp->b_addr); + endp = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr); + if (!endp) { + xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out_relse; + } + while (p < endp) { + struct xfs_dir2_data_entry *dep; + struct xfs_dir2_data_unused *dup; + + dup = (struct xfs_dir2_data_unused *)p; + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + p += be16_to_cpu(dup->length); + continue; + } + dep = (struct xfs_dir2_data_entry *)p; + if (dep == dent) + break; + p += mp->m_dir_inode_ops->data_entsize(dep->namelen); + } + if (p >= endp) { + xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out_relse; + } + + /* Retrieve the entry, sanity check it, and compare hashes. */ ino = be64_to_cpu(dent->inumber); hash = be32_to_cpu(ent->hashval); tag = be16_to_cpup(dp->d_ops->data_entry_tag_p(dent)); @@ -324,6 +353,7 @@ xfs_scrub_directory_data_bestfree( } if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; + xfs_scrub_buffer_recheck(sc, bp); /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */ @@ -361,13 +391,7 @@ xfs_scrub_directory_data_bestfree( /* Make sure the bestfrees are actually the best free spaces. */ ptr = (char *)d_ops->data_entry_p(bp->b_addr); - if (is_block) { - struct xfs_dir2_block_tail *btp; - - btp = xfs_dir2_block_tail_p(mp->m_dir_geo, bp->b_addr); - endptr = (char *)xfs_dir2_block_leaf_p(btp); - } else - endptr = (char *)bp->b_addr + BBTOB(bp->b_length); + endptr = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr); /* Iterate the entries, stopping when we hit or go past the end. */ while (ptr < endptr) { @@ -474,6 +498,7 @@ xfs_scrub_directory_leaf1_bestfree( error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, -1, &bp); if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; + xfs_scrub_buffer_recheck(sc, bp); leaf = bp->b_addr; d_ops->leaf_hdr_from_disk(&leafhdr, leaf); @@ -559,6 +584,7 @@ xfs_scrub_directory_free_bestfree( error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp); if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; + xfs_scrub_buffer_recheck(sc, bp); if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 496d6f2fbb9e..63ab3f98430d 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -58,6 +58,56 @@ xfs_scrub_setup_ag_iallocbt( /* Inode btree scrubber. */ +/* + * If we're checking the finobt, cross-reference with the inobt. + * Otherwise we're checking the inobt; if there is an finobt, make sure + * we have a record or not depending on freecount. + */ +static inline void +xfs_scrub_iallocbt_chunk_xref_other( + struct xfs_scrub_context *sc, + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino) +{ + struct xfs_btree_cur **pcur; + bool has_irec; + int error; + + if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT) + pcur = &sc->sa.ino_cur; + else + pcur = &sc->sa.fino_cur; + if (!(*pcur)) + return; + error = xfs_ialloc_has_inode_record(*pcur, agino, agino, &has_irec); + if (!xfs_scrub_should_check_xref(sc, &error, pcur)) + return; + if (((irec->ir_freecount > 0 && !has_irec) || + (irec->ir_freecount == 0 && has_irec))) + xfs_scrub_btree_xref_set_corrupt(sc, *pcur, 0); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_iallocbt_chunk_xref( + struct xfs_scrub_context *sc, + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_owner_info oinfo; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, len); + xfs_scrub_iallocbt_chunk_xref_other(sc, irec, agino); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); + xfs_scrub_xref_is_owned_by(sc, agbno, len, &oinfo); + xfs_scrub_xref_is_not_shared(sc, agbno, len); +} + /* Is this chunk worth checking? */ STATIC bool xfs_scrub_iallocbt_chunk( @@ -76,6 +126,8 @@ xfs_scrub_iallocbt_chunk( !xfs_verify_agbno(mp, agno, bno + len - 1)) xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + xfs_scrub_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len); + return true; } @@ -190,8 +242,14 @@ xfs_scrub_iallocbt_check_freemask( } /* If any part of this is a hole, skip it. */ - if (ir_holemask) + if (ir_holemask) { + xfs_scrub_xref_is_not_owned_by(bs->sc, agbno, + blks_per_cluster, &oinfo); continue; + } + + xfs_scrub_xref_is_owned_by(bs->sc, agbno, blks_per_cluster, + &oinfo); /* Grab the inode cluster buffer. */ imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno, @@ -227,6 +285,7 @@ xfs_scrub_iallocbt_rec( union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; + xfs_filblks_t *inode_blocks = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; xfs_agnumber_t agno = bs->cur->bc_private.a.agno; @@ -264,6 +323,9 @@ xfs_scrub_iallocbt_rec( (agbno & (xfs_icluster_size_fsb(mp) - 1))) xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + *inode_blocks += XFS_B_TO_FSB(mp, + irec.ir_count * mp->m_sb.sb_inodesize); + /* Handle non-sparse inodes */ if (!xfs_inobt_issparse(irec.ir_holemask)) { len = XFS_B_TO_FSB(mp, @@ -308,6 +370,72 @@ out: return error; } +/* + * Make sure the inode btrees are as large as the rmap thinks they are. + * Don't bother if we're missing btree cursors, as we're already corrupt. + */ +STATIC void +xfs_scrub_iallocbt_xref_rmap_btreeblks( + struct xfs_scrub_context *sc, + int which) +{ + struct xfs_owner_info oinfo; + xfs_filblks_t blocks; + xfs_extlen_t inobt_blocks = 0; + xfs_extlen_t finobt_blocks = 0; + int error; + + if (!sc->sa.ino_cur || !sc->sa.rmap_cur || + (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur)) + return; + + /* Check that we saw as many inobt blocks as the rmap says. */ + error = xfs_btree_count_blocks(sc->sa.ino_cur, &inobt_blocks); + if (!xfs_scrub_process_error(sc, 0, 0, &error)) + return; + + if (sc->sa.fino_cur) { + error = xfs_btree_count_blocks(sc->sa.fino_cur, &finobt_blocks); + if (!xfs_scrub_process_error(sc, 0, 0, &error)) + return; + } + + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); + error = xfs_scrub_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo, + &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != inobt_blocks + finobt_blocks) + xfs_scrub_btree_set_corrupt(sc, sc->sa.ino_cur, 0); +} + +/* + * Make sure that the inobt records point to the same number of blocks as + * the rmap says are owned by inodes. + */ +STATIC void +xfs_scrub_iallocbt_xref_rmap_inodes( + struct xfs_scrub_context *sc, + int which, + xfs_filblks_t inode_blocks) +{ + struct xfs_owner_info oinfo; + xfs_filblks_t blocks; + int error; + + if (!sc->sa.rmap_cur) + return; + + /* Check that we saw as many inode blocks as the rmap knows about. */ + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); + error = xfs_scrub_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo, + &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != inode_blocks) + xfs_scrub_btree_set_corrupt(sc, sc->sa.ino_cur, 0); +} + /* Scrub the inode btrees for some AG. */ STATIC int xfs_scrub_iallocbt( @@ -316,10 +444,29 @@ xfs_scrub_iallocbt( { struct xfs_btree_cur *cur; struct xfs_owner_info oinfo; + xfs_filblks_t inode_blocks = 0; + int error; xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; - return xfs_scrub_btree(sc, cur, xfs_scrub_iallocbt_rec, &oinfo, NULL); + error = xfs_scrub_btree(sc, cur, xfs_scrub_iallocbt_rec, &oinfo, + &inode_blocks); + if (error) + return error; + + xfs_scrub_iallocbt_xref_rmap_btreeblks(sc, which); + + /* + * If we're scrubbing the inode btree, inode_blocks is the number of + * blocks pointed to by all the inode chunk records. Therefore, we + * should compare to the number of inode chunk blocks that the rmap + * knows about. We can't do this for the finobt since it only points + * to inode chunks with free inodes. + */ + if (which == XFS_BTNUM_INO) + xfs_scrub_iallocbt_xref_rmap_inodes(sc, which, inode_blocks); + + return error; } int @@ -335,3 +482,46 @@ xfs_scrub_finobt( { return xfs_scrub_iallocbt(sc, XFS_BTNUM_FINO); } + +/* See if an inode btree has (or doesn't have) an inode chunk record. */ +static inline void +xfs_scrub_xref_inode_check( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len, + struct xfs_btree_cur **icur, + bool should_have_inodes) +{ + bool has_inodes; + int error; + + if (!(*icur)) + return; + + error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes); + if (!xfs_scrub_should_check_xref(sc, &error, icur)) + return; + if (has_inodes != should_have_inodes) + xfs_scrub_btree_xref_set_corrupt(sc, *icur, 0); +} + +/* xref check that the extent is not covered by inodes */ +void +xfs_scrub_xref_is_not_inode_chunk( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + xfs_scrub_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, false); + xfs_scrub_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, false); +} + +/* xref check that the extent is covered by inodes */ +void +xfs_scrub_xref_is_inode_chunk( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + xfs_scrub_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, true); +} diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 637b7a892313..21297bef8df1 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -36,9 +36,13 @@ #include "xfs_ialloc.h" #include "xfs_da_format.h" #include "xfs_reflink.h" +#include "xfs_rmap.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/btree.h" #include "scrub/trace.h" /* @@ -64,7 +68,7 @@ xfs_scrub_setup_inode( break; case -EFSCORRUPTED: case -EFSBADCRC: - return 0; + return xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); default: return error; } @@ -318,8 +322,20 @@ xfs_scrub_dinode( /* di_mode */ mode = be16_to_cpu(dip->di_mode); - if (mode & ~(S_IALLUGO | S_IFMT)) + switch (mode & S_IFMT) { + case S_IFLNK: + case S_IFREG: + case S_IFDIR: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + /* mode is recognized */ + break; + default: xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + } /* v1/v2 fields */ switch (dip->di_version) { @@ -380,6 +396,14 @@ xfs_scrub_dinode( break; } + /* di_[amc]time.nsec */ + if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + /* * di_size. xfs_dinode_verify checks for things that screw up * the VFS such as the upper bit being set and zero-length @@ -483,6 +507,8 @@ xfs_scrub_dinode( } if (dip->di_version >= 3) { + if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC) + xfs_scrub_ino_set_corrupt(sc, ino, bp); xfs_scrub_inode_flags2(sc, bp, dip, ino, mode, flags, flags2); xfs_scrub_inode_cowextsize(sc, bp, dip, ino, mode, flags, flags2); @@ -534,7 +560,7 @@ xfs_scrub_inode_map_raw( */ bp->b_ops = &xfs_inode_buf_ops; dip = xfs_buf_offset(bp, imap.im_boffset); - if (!xfs_dinode_verify(mp, ino, dip) || + if (xfs_dinode_verify(mp, ino, dip) != NULL || !xfs_dinode_good_version(mp, dip->di_version)) { xfs_scrub_ino_set_corrupt(sc, ino, bp); goto out_buf; @@ -555,18 +581,155 @@ out_buf: return error; } +/* + * Make sure the finobt doesn't think this inode is free. + * We don't have to check the inobt ourselves because we got the inode via + * IGET_UNTRUSTED, which checks the inobt for us. + */ +static void +xfs_scrub_inode_xref_finobt( + struct xfs_scrub_context *sc, + xfs_ino_t ino) +{ + struct xfs_inobt_rec_incore rec; + xfs_agino_t agino; + int has_record; + int error; + + if (!sc->sa.fino_cur) + return; + + agino = XFS_INO_TO_AGINO(sc->mp, ino); + + /* + * Try to get the finobt record. If we can't get it, then we're + * in good shape. + */ + error = xfs_inobt_lookup(sc->sa.fino_cur, agino, XFS_LOOKUP_LE, + &has_record); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.fino_cur) || + !has_record) + return; + + error = xfs_inobt_get_rec(sc->sa.fino_cur, &rec, &has_record); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.fino_cur) || + !has_record) + return; + + /* + * Otherwise, make sure this record either doesn't cover this inode, + * or that it does but it's marked present. + */ + if (rec.ir_startino > agino || + rec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + return; + + if (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.fino_cur, 0); +} + +/* Cross reference the inode fields with the forks. */ +STATIC void +xfs_scrub_inode_xref_bmap( + struct xfs_scrub_context *sc, + struct xfs_dinode *dip) +{ + xfs_extnum_t nextents; + xfs_filblks_t count; + xfs_filblks_t acount; + int error; + + /* Walk all the extents to check nextents/naextents/nblocks. */ + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, + &nextents, &count); + if (!xfs_scrub_should_check_xref(sc, &error, NULL)) + return; + if (nextents < be32_to_cpu(dip->di_nextents)) + xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL); + + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, + &nextents, &acount); + if (!xfs_scrub_should_check_xref(sc, &error, NULL)) + return; + if (nextents != be16_to_cpu(dip->di_anextents)) + xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL); + + /* Check nblocks against the inode. */ + if (count + acount != be64_to_cpu(dip->di_nblocks)) + xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_inode_xref( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_dinode *dip) +{ + struct xfs_owner_info oinfo; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agno = XFS_INO_TO_AGNO(sc->mp, ino); + agbno = XFS_INO_TO_AGBNO(sc->mp, ino); + + error = xfs_scrub_ag_init(sc, agno, &sc->sa); + if (!xfs_scrub_xref_process_error(sc, agno, agbno, &error)) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, 1); + xfs_scrub_inode_xref_finobt(sc, ino); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); + xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo); + xfs_scrub_xref_is_not_shared(sc, agbno, 1); + xfs_scrub_inode_xref_bmap(sc, dip); + + xfs_scrub_ag_free(sc, &sc->sa); +} + +/* + * If the reflink iflag disagrees with a scan for shared data fork extents, + * either flag an error (shared extents w/ no flag) or a preen (flag set w/o + * any shared extents). We already checked for reflink iflag set on a non + * reflink filesystem. + */ +static void +xfs_scrub_inode_check_reflink_iflag( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = sc->mp; + bool has_shared; + int error; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return; + + error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, + &has_shared); + if (!xfs_scrub_xref_process_error(sc, XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGBNO(mp, ino), &error)) + return; + if (xfs_is_reflink_inode(sc->ip) && !has_shared) + xfs_scrub_ino_set_preen(sc, ino, bp); + else if (!xfs_is_reflink_inode(sc->ip) && has_shared) + xfs_scrub_ino_set_corrupt(sc, ino, bp); +} + /* Scrub an inode. */ int xfs_scrub_inode( struct xfs_scrub_context *sc) { struct xfs_dinode di; - struct xfs_mount *mp = sc->mp; struct xfs_buf *bp = NULL; struct xfs_dinode *dip; xfs_ino_t ino; - - bool has_shared; int error = 0; /* Did we get the in-core inode, or are we doing this manually? */ @@ -591,19 +754,14 @@ xfs_scrub_inode( goto out; /* - * Does this inode have the reflink flag set but no shared extents? - * Set the preening flag if this is the case. + * Look for discrepancies between file's data blocks and the reflink + * iflag. We already checked the iflag against the file mode when + * we scrubbed the dinode. */ - if (xfs_is_reflink_inode(sc->ip)) { - error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, - &has_shared); - if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino), - XFS_INO_TO_AGBNO(mp, ino), &error)) - goto out; - if (!has_shared) - xfs_scrub_ino_set_preen(sc, ino, bp); - } + if (S_ISREG(VFS_I(sc->ip)->i_mode)) + xfs_scrub_inode_check_reflink_iflag(sc, ino, bp); + xfs_scrub_inode_xref(sc, ino, dip); out: if (bp) xfs_trans_brelse(sc->tp, bp); diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 63a25334fc83..0d3851410c74 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -169,9 +169,9 @@ xfs_scrub_parent_validate( * immediate inactive cleanup of the inode. */ error = xfs_iget(mp, sc->tp, dnum, 0, 0, &dp); - if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + if (!xfs_scrub_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; - if (dp == sc->ip) { + if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) { xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); goto out_rele; } @@ -185,7 +185,7 @@ xfs_scrub_parent_validate( */ if (xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) { error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink); - if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, + if (!xfs_scrub_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out_unlock; if (nlink != expected_nlink) @@ -205,7 +205,7 @@ xfs_scrub_parent_validate( /* Go looking for our dentry. */ error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink); - if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + if (!xfs_scrub_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out_unlock; /* Drop the parent lock, relock this inode. */ diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 8e58ba842946..51daa4ae2627 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -67,13 +67,6 @@ xfs_scrub_setup_quota( { uint dqtype; - /* - * If userspace gave us an AG number or inode data, they don't - * know what they're doing. Get out. - */ - if (sc->sm->sm_agno || sc->sm->sm_ino || sc->sm->sm_gen) - return -EINVAL; - dqtype = xfs_scrub_quota_to_dqtype(sc); if (dqtype == 0) return -EINVAL; @@ -107,7 +100,7 @@ xfs_scrub_quota_item( unsigned long long rcount; xfs_ino_t fs_icount; - offset = id * qi->qi_dqperchunk; + offset = id / qi->qi_dqperchunk; /* * We fed $id and DQNEXT into the xfs_qm_dqget call, which means @@ -207,7 +200,7 @@ xfs_scrub_quota( xfs_dqid_t id = 0; uint dqtype; int nimaps; - int error; + int error = 0; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return -ENOENT; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 2f88a8d44bd0..400f1561cd3d 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -31,6 +31,7 @@ #include "xfs_sb.h" #include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_refcount.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -50,6 +51,307 @@ xfs_scrub_setup_ag_refcountbt( /* Reference count btree scrubber. */ +/* + * Confirming Reference Counts via Reverse Mappings + * + * We want to count the reverse mappings overlapping a refcount record + * (bno, len, refcount), allowing for the possibility that some of the + * overlap may come from smaller adjoining reverse mappings, while some + * comes from single extents which overlap the range entirely. The + * outer loop is as follows: + * + * 1. For all reverse mappings overlapping the refcount extent, + * a. If a given rmap completely overlaps, mark it as seen. + * b. Otherwise, record the fragment (in agbno order) for later + * processing. + * + * Once we've seen all the rmaps, we know that for all blocks in the + * refcount record we want to find $refcount owners and we've already + * visited $seen extents that overlap all the blocks. Therefore, we + * need to find ($refcount - $seen) owners for every block in the + * extent; call that quantity $target_nr. Proceed as follows: + * + * 2. Pull the first $target_nr fragments from the list; all of them + * should start at or before the start of the extent. + * Call this subset of fragments the working set. + * 3. Until there are no more unprocessed fragments, + * a. Find the shortest fragments in the set and remove them. + * b. Note the block number of the end of these fragments. + * c. Pull the same number of fragments from the list. All of these + * fragments should start at the block number recorded in the + * previous step. + * d. Put those fragments in the set. + * 4. Check that there are $target_nr fragments remaining in the list, + * and that they all end at or beyond the end of the refcount extent. + * + * If the refcount is correct, all the check conditions in the algorithm + * should always hold true. If not, the refcount is incorrect. + */ +struct xfs_scrub_refcnt_frag { + struct list_head list; + struct xfs_rmap_irec rm; +}; + +struct xfs_scrub_refcnt_check { + struct xfs_scrub_context *sc; + struct list_head fragments; + + /* refcount extent we're examining */ + xfs_agblock_t bno; + xfs_extlen_t len; + xfs_nlink_t refcount; + + /* number of owners seen */ + xfs_nlink_t seen; +}; + +/* + * Decide if the given rmap is large enough that we can redeem it + * towards refcount verification now, or if it's a fragment, in + * which case we'll hang onto it in the hopes that we'll later + * discover that we've collected exactly the correct number of + * fragments as the refcountbt says we should have. + */ +STATIC int +xfs_scrub_refcountbt_rmap_check( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_scrub_refcnt_check *refchk = priv; + struct xfs_scrub_refcnt_frag *frag; + xfs_agblock_t rm_last; + xfs_agblock_t rc_last; + int error = 0; + + if (xfs_scrub_should_terminate(refchk->sc, &error)) + return error; + + rm_last = rec->rm_startblock + rec->rm_blockcount - 1; + rc_last = refchk->bno + refchk->len - 1; + + /* Confirm that a single-owner refc extent is a CoW stage. */ + if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) { + xfs_scrub_btree_xref_set_corrupt(refchk->sc, cur, 0); + return 0; + } + + if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) { + /* + * The rmap overlaps the refcount record, so we can confirm + * one refcount owner seen. + */ + refchk->seen++; + } else { + /* + * This rmap covers only part of the refcount record, so + * save the fragment for later processing. If the rmapbt + * is healthy each rmap_irec we see will be in agbno order + * so we don't need insertion sort here. + */ + frag = kmem_alloc(sizeof(struct xfs_scrub_refcnt_frag), + KM_MAYFAIL | KM_NOFS); + if (!frag) + return -ENOMEM; + memcpy(&frag->rm, rec, sizeof(frag->rm)); + list_add_tail(&frag->list, &refchk->fragments); + } + + return 0; +} + +/* + * Given a bunch of rmap fragments, iterate through them, keeping + * a running tally of the refcount. If this ever deviates from + * what we expect (which is the refcountbt's refcount minus the + * number of extents that totally covered the refcountbt extent), + * we have a refcountbt error. + */ +STATIC void +xfs_scrub_refcountbt_process_rmap_fragments( + struct xfs_scrub_refcnt_check *refchk) +{ + struct list_head worklist; + struct xfs_scrub_refcnt_frag *frag; + struct xfs_scrub_refcnt_frag *n; + xfs_agblock_t bno; + xfs_agblock_t rbno; + xfs_agblock_t next_rbno; + xfs_nlink_t nr; + xfs_nlink_t target_nr; + + target_nr = refchk->refcount - refchk->seen; + if (target_nr == 0) + return; + + /* + * There are (refchk->rc.rc_refcount - refchk->nr refcount) + * references we haven't found yet. Pull that many off the + * fragment list and figure out where the smallest rmap ends + * (and therefore the next rmap should start). All the rmaps + * we pull off should start at or before the beginning of the + * refcount record's range. + */ + INIT_LIST_HEAD(&worklist); + rbno = NULLAGBLOCK; + nr = 1; + + /* Make sure the fragments actually /are/ in agbno order. */ + bno = 0; + list_for_each_entry(frag, &refchk->fragments, list) { + if (frag->rm.rm_startblock < bno) + goto done; + bno = frag->rm.rm_startblock; + } + + /* + * Find all the rmaps that start at or before the refc extent, + * and put them on the worklist. + */ + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + if (frag->rm.rm_startblock > refchk->bno) + goto done; + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno < rbno) + rbno = bno; + list_move_tail(&frag->list, &worklist); + if (nr == target_nr) + break; + nr++; + } + + /* + * We should have found exactly $target_nr rmap fragments starting + * at or before the refcount extent. + */ + if (nr != target_nr) + goto done; + + while (!list_empty(&refchk->fragments)) { + /* Discard any fragments ending at rbno from the worklist. */ + nr = 0; + next_rbno = NULLAGBLOCK; + list_for_each_entry_safe(frag, n, &worklist, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno != rbno) { + if (bno < next_rbno) + next_rbno = bno; + continue; + } + list_del(&frag->list); + kmem_free(frag); + nr++; + } + + /* Try to add nr rmaps starting at rbno to the worklist. */ + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (frag->rm.rm_startblock != rbno) + goto done; + list_move_tail(&frag->list, &worklist); + if (next_rbno > bno) + next_rbno = bno; + nr--; + if (nr == 0) + break; + } + + /* + * If we get here and nr > 0, this means that we added fewer + * items to the worklist than we discarded because the fragment + * list ran out of items. Therefore, we cannot maintain the + * required refcount. Something is wrong, so we're done. + */ + if (nr) + goto done; + + rbno = next_rbno; + } + + /* + * Make sure the last extent we processed ends at or beyond + * the end of the refcount extent. + */ + if (rbno < refchk->bno + refchk->len) + goto done; + + /* Actually record us having seen the remaining refcount. */ + refchk->seen = refchk->refcount; +done: + /* Delete fragments and work list. */ + list_for_each_entry_safe(frag, n, &worklist, list) { + list_del(&frag->list); + kmem_free(frag); + } + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + list_del(&frag->list); + kmem_free(frag); + } +} + +/* Use the rmap entries covering this extent to verify the refcount. */ +STATIC void +xfs_scrub_refcountbt_xref_rmap( + struct xfs_scrub_context *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + xfs_nlink_t refcount) +{ + struct xfs_scrub_refcnt_check refchk = { + .sc = sc, + .bno = bno, + .len = len, + .refcount = refcount, + .seen = 0, + }; + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + struct xfs_scrub_refcnt_frag *frag; + struct xfs_scrub_refcnt_frag *n; + int error; + + if (!sc->sa.rmap_cur) + return; + + /* Cross-reference with the rmapbt to confirm the refcount. */ + memset(&low, 0, sizeof(low)); + low.rm_startblock = bno; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = bno + len - 1; + + INIT_LIST_HEAD(&refchk.fragments); + error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high, + &xfs_scrub_refcountbt_rmap_check, &refchk); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + goto out_free; + + xfs_scrub_refcountbt_process_rmap_fragments(&refchk); + if (refcount != refchk.seen) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + +out_free: + list_for_each_entry_safe(frag, n, &refchk.fragments, list) { + list_del(&frag->list); + kmem_free(frag); + } +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_refcountbt_xref( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len, + xfs_nlink_t refcount) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, len); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, len); + xfs_scrub_refcountbt_xref_rmap(sc, agbno, len, refcount); +} + /* Scrub a refcountbt record. */ STATIC int xfs_scrub_refcountbt_rec( @@ -57,6 +359,7 @@ xfs_scrub_refcountbt_rec( union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; + xfs_agblock_t *cow_blocks = bs->private; xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t bno; xfs_extlen_t len; @@ -72,6 +375,8 @@ xfs_scrub_refcountbt_rec( has_cowflag = (bno & XFS_REFC_COW_START); if ((refcount == 1 && !has_cowflag) || (refcount != 1 && has_cowflag)) xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + if (has_cowflag) + (*cow_blocks) += len; /* Check the extent. */ bno &= ~XFS_REFC_COW_START; @@ -83,17 +388,128 @@ xfs_scrub_refcountbt_rec( if (refcount == 0) xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + xfs_scrub_refcountbt_xref(bs->sc, bno, len, refcount); + return error; } +/* Make sure we have as many refc blocks as the rmap says. */ +STATIC void +xfs_scrub_refcount_xref_rmap( + struct xfs_scrub_context *sc, + struct xfs_owner_info *oinfo, + xfs_filblks_t cow_blocks) +{ + xfs_extlen_t refcbt_blocks = 0; + xfs_filblks_t blocks; + int error; + + if (!sc->sa.rmap_cur) + return; + + /* Check that we saw as many refcbt blocks as the rmap knows about. */ + error = xfs_btree_count_blocks(sc->sa.refc_cur, &refcbt_blocks); + if (!xfs_scrub_btree_process_error(sc, sc->sa.refc_cur, 0, &error)) + return; + error = xfs_scrub_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo, + &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != refcbt_blocks) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + + /* Check that we saw as many cow blocks as the rmap knows about. */ + xfs_rmap_ag_owner(oinfo, XFS_RMAP_OWN_COW); + error = xfs_scrub_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo, + &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != cow_blocks) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); +} + /* Scrub the refcount btree for some AG. */ int xfs_scrub_refcountbt( struct xfs_scrub_context *sc) { struct xfs_owner_info oinfo; + xfs_agblock_t cow_blocks = 0; + int error; xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC); - return xfs_scrub_btree(sc, sc->sa.refc_cur, xfs_scrub_refcountbt_rec, - &oinfo, NULL); + error = xfs_scrub_btree(sc, sc->sa.refc_cur, xfs_scrub_refcountbt_rec, + &oinfo, &cow_blocks); + if (error) + return error; + + xfs_scrub_refcount_xref_rmap(sc, &oinfo, cow_blocks); + + return 0; +} + +/* xref check that a cow staging extent is marked in the refcountbt. */ +void +xfs_scrub_xref_is_cow_staging( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_refcount_irec rc; + bool has_cowflag; + int has_refcount; + int error; + + if (!sc->sa.refc_cur) + return; + + /* Find the CoW staging extent. */ + error = xfs_refcount_lookup_le(sc->sa.refc_cur, + agbno + XFS_REFC_COW_START, &has_refcount); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (!has_refcount) { + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); + return; + } + + error = xfs_refcount_get_rec(sc->sa.refc_cur, &rc, &has_refcount); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (!has_refcount) { + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); + return; + } + + /* CoW flag must be set, refcount must be 1. */ + has_cowflag = (rc.rc_startblock & XFS_REFC_COW_START); + if (!has_cowflag || rc.rc_refcount != 1) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); + + /* Must be at least as long as what was passed in */ + if (rc.rc_blockcount < len) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); +} + +/* + * xref check that the extent is not shared. Only file data blocks + * can have multiple owners. + */ +void +xfs_scrub_xref_is_not_shared( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + bool shared; + int error; + + if (!sc->sa.refc_cur) + return; + + error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (shared) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); } diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 97846c424690..8f2a7c3ff455 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -32,6 +32,7 @@ #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_rmap.h" +#include "xfs_refcount.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -51,6 +52,61 @@ xfs_scrub_setup_ag_rmapbt( /* Reverse-mapping scrubber. */ +/* Cross-reference a rmap against the refcount btree. */ +STATIC void +xfs_scrub_rmapbt_xref_refc( + struct xfs_scrub_context *sc, + struct xfs_rmap_irec *irec) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + bool non_inode; + bool is_bmbt; + bool is_attr; + bool is_unwritten; + int error; + + if (!sc->sa.refc_cur) + return; + + non_inode = XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); + is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK; + is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK; + is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN; + + /* If this is shared, must be a data fork extent. */ + error = xfs_refcount_find_shared(sc->sa.refc_cur, irec->rm_startblock, + irec->rm_blockcount, &fbno, &flen, false); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (flen != 0 && (non_inode || is_attr || is_bmbt || is_unwritten)) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_rmapbt_xref( + struct xfs_scrub_context *sc, + struct xfs_rmap_irec *irec) +{ + xfs_agblock_t agbno = irec->rm_startblock; + xfs_extlen_t len = irec->rm_blockcount; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, len); + if (irec->rm_owner == XFS_RMAP_OWN_INODES) + xfs_scrub_xref_is_inode_chunk(sc, agbno, len); + else + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, len); + if (irec->rm_owner == XFS_RMAP_OWN_COW) + xfs_scrub_xref_is_cow_staging(sc, irec->rm_startblock, + irec->rm_blockcount); + else + xfs_scrub_rmapbt_xref_refc(sc, irec); +} + /* Scrub an rmapbt record. */ STATIC int xfs_scrub_rmapbt_rec( @@ -121,6 +177,8 @@ xfs_scrub_rmapbt_rec( irec.rm_owner > XFS_RMAP_OWN_FS) xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); } + + xfs_scrub_rmapbt_xref(bs->sc, &irec); out: return error; } @@ -136,3 +194,68 @@ xfs_scrub_rmapbt( return xfs_scrub_btree(sc, sc->sa.rmap_cur, xfs_scrub_rmapbt_rec, &oinfo, NULL); } + +/* xref check that the extent is owned by a given owner */ +static inline void +xfs_scrub_xref_check_owner( + struct xfs_scrub_context *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo, + bool should_have_rmap) +{ + bool has_rmap; + int error; + + if (!sc->sa.rmap_cur) + return; + + error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo, + &has_rmap); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (has_rmap != should_have_rmap) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); +} + +/* xref check that the extent is owned by a given owner */ +void +xfs_scrub_xref_is_owned_by( + struct xfs_scrub_context *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo) +{ + xfs_scrub_xref_check_owner(sc, bno, len, oinfo, true); +} + +/* xref check that the extent is not owned by a given owner */ +void +xfs_scrub_xref_is_not_owned_by( + struct xfs_scrub_context *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo) +{ + xfs_scrub_xref_check_owner(sc, bno, len, oinfo, false); +} + +/* xref check that the extent has no reverse mapping at all */ +void +xfs_scrub_xref_has_no_owner( + struct xfs_scrub_context *sc, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + bool has_rmap; + int error; + + if (!sc->sa.rmap_cur) + return; + + error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (has_rmap) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); +} diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index c6fedb698008..26390991369a 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -43,22 +43,14 @@ xfs_scrub_setup_rt( struct xfs_scrub_context *sc, struct xfs_inode *ip) { - struct xfs_mount *mp = sc->mp; - int error = 0; - - /* - * If userspace gave us an AG number or inode data, they don't - * know what they're doing. Get out. - */ - if (sc->sm->sm_agno || sc->sm->sm_ino || sc->sm->sm_gen) - return -EINVAL; + int error; error = xfs_scrub_setup_fs(sc, ip); if (error) return error; sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP; - sc->ip = mp->m_rbmip; + sc->ip = sc->mp->m_rbmip; xfs_ilock(sc->ip, sc->ilock_flags); return 0; @@ -106,3 +98,26 @@ xfs_scrub_rtsummary( /* XXX: implement this some day */ return -ENOENT; } + + +/* xref check that the extent is not free in the rtbitmap */ +void +xfs_scrub_xref_is_used_rt_space( + struct xfs_scrub_context *sc, + xfs_rtblock_t fsbno, + xfs_extlen_t len) +{ + bool is_free; + int error; + + xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, fsbno, len, + &is_free); + if (!xfs_scrub_should_check_xref(sc, &error, NULL)) + goto out_unlock; + if (is_free) + xfs_scrub_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino, + NULL); +out_unlock: + xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 9c42c4efd01e..26c75967a072 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -46,7 +46,6 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" -#include "scrub/scrub.h" #include "scrub/btree.h" /* @@ -111,6 +110,16 @@ * structure itself is corrupt, the CORRUPT flag will be set. If * the metadata is correct but otherwise suboptimal, the PREEN flag * will be set. + * + * We perform secondary validation of filesystem metadata by + * cross-referencing every record with all other available metadata. + * For example, for block mapping extents, we verify that there are no + * records in the free space and inode btrees corresponding to that + * space extent and that there is a corresponding entry in the reverse + * mapping btree. Inconsistent metadata is noted by setting the + * XCORRUPT flag; btree query function errors are noted by setting the + * XFAIL flag and deleting the cursor to prevent further attempts to + * cross-reference with a defective btree. */ /* @@ -129,8 +138,6 @@ xfs_scrub_probe( { int error = 0; - if (sc->sm->sm_ino || sc->sm->sm_agno) - return -EINVAL; if (xfs_scrub_should_terminate(sc, &error)) return error; @@ -152,7 +159,8 @@ xfs_scrub_teardown( sc->tp = NULL; } if (sc->ip) { - xfs_iunlock(sc->ip, sc->ilock_flags); + if (sc->ilock_flags) + xfs_iunlock(sc->ip, sc->ilock_flags); if (sc->ip != ip_in && !xfs_internal_inum(sc->mp, sc->ip->i_ino)) iput(VFS_I(sc->ip)); @@ -168,106 +176,130 @@ xfs_scrub_teardown( /* Scrubbing dispatch. */ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = { - { /* ioctl presence test */ + [XFS_SCRUB_TYPE_PROBE] = { /* ioctl presence test */ + .type = ST_NONE, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_probe, }, - { /* superblock */ - .setup = xfs_scrub_setup_ag_header, + [XFS_SCRUB_TYPE_SB] = { /* superblock */ + .type = ST_PERAG, + .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_superblock, }, - { /* agf */ - .setup = xfs_scrub_setup_ag_header, + [XFS_SCRUB_TYPE_AGF] = { /* agf */ + .type = ST_PERAG, + .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agf, }, - { /* agfl */ - .setup = xfs_scrub_setup_ag_header, + [XFS_SCRUB_TYPE_AGFL]= { /* agfl */ + .type = ST_PERAG, + .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agfl, }, - { /* agi */ - .setup = xfs_scrub_setup_ag_header, + [XFS_SCRUB_TYPE_AGI] = { /* agi */ + .type = ST_PERAG, + .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agi, }, - { /* bnobt */ + [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ + .type = ST_PERAG, .setup = xfs_scrub_setup_ag_allocbt, .scrub = xfs_scrub_bnobt, }, - { /* cntbt */ + [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ + .type = ST_PERAG, .setup = xfs_scrub_setup_ag_allocbt, .scrub = xfs_scrub_cntbt, }, - { /* inobt */ + [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ + .type = ST_PERAG, .setup = xfs_scrub_setup_ag_iallocbt, .scrub = xfs_scrub_inobt, }, - { /* finobt */ + [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ + .type = ST_PERAG, .setup = xfs_scrub_setup_ag_iallocbt, .scrub = xfs_scrub_finobt, .has = xfs_sb_version_hasfinobt, }, - { /* rmapbt */ + [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ + .type = ST_PERAG, .setup = xfs_scrub_setup_ag_rmapbt, .scrub = xfs_scrub_rmapbt, .has = xfs_sb_version_hasrmapbt, }, - { /* refcountbt */ + [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ + .type = ST_PERAG, .setup = xfs_scrub_setup_ag_refcountbt, .scrub = xfs_scrub_refcountbt, .has = xfs_sb_version_hasreflink, }, - { /* inode record */ + [XFS_SCRUB_TYPE_INODE] = { /* inode record */ + .type = ST_INODE, .setup = xfs_scrub_setup_inode, .scrub = xfs_scrub_inode, }, - { /* inode data fork */ + [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ + .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_data, }, - { /* inode attr fork */ + [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ + .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_attr, }, - { /* inode CoW fork */ + [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ + .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_cow, }, - { /* directory */ + [XFS_SCRUB_TYPE_DIR] = { /* directory */ + .type = ST_INODE, .setup = xfs_scrub_setup_directory, .scrub = xfs_scrub_directory, }, - { /* extended attributes */ + [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */ + .type = ST_INODE, .setup = xfs_scrub_setup_xattr, .scrub = xfs_scrub_xattr, }, - { /* symbolic link */ + [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */ + .type = ST_INODE, .setup = xfs_scrub_setup_symlink, .scrub = xfs_scrub_symlink, }, - { /* parent pointers */ + [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */ + .type = ST_INODE, .setup = xfs_scrub_setup_parent, .scrub = xfs_scrub_parent, }, - { /* realtime bitmap */ + [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ + .type = ST_FS, .setup = xfs_scrub_setup_rt, .scrub = xfs_scrub_rtbitmap, .has = xfs_sb_version_hasrealtime, }, - { /* realtime summary */ + [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ + .type = ST_FS, .setup = xfs_scrub_setup_rt, .scrub = xfs_scrub_rtsummary, .has = xfs_sb_version_hasrealtime, }, - { /* user quota */ - .setup = xfs_scrub_setup_quota, - .scrub = xfs_scrub_quota, + [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ + .type = ST_FS, + .setup = xfs_scrub_setup_quota, + .scrub = xfs_scrub_quota, }, - { /* group quota */ - .setup = xfs_scrub_setup_quota, - .scrub = xfs_scrub_quota, + [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ + .type = ST_FS, + .setup = xfs_scrub_setup_quota, + .scrub = xfs_scrub_quota, }, - { /* project quota */ - .setup = xfs_scrub_setup_quota, - .scrub = xfs_scrub_quota, + [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ + .type = ST_FS, + .setup = xfs_scrub_setup_quota, + .scrub = xfs_scrub_quota, }, }; @@ -285,44 +317,56 @@ xfs_scrub_experimental_warning( "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); } -/* Dispatch metadata scrubbing. */ -int -xfs_scrub_metadata( - struct xfs_inode *ip, +static int +xfs_scrub_validate_inputs( + struct xfs_mount *mp, struct xfs_scrub_metadata *sm) { - struct xfs_scrub_context sc; - struct xfs_mount *mp = ip->i_mount; + int error; const struct xfs_scrub_meta_ops *ops; - bool try_harder = false; - int error = 0; - - trace_xfs_scrub_start(ip, sm, error); - - /* Forbidden if we are shut down or mounted norecovery. */ - error = -ESHUTDOWN; - if (XFS_FORCED_SHUTDOWN(mp)) - goto out; - error = -ENOTRECOVERABLE; - if (mp->m_flags & XFS_MOUNT_NORECOVERY) - goto out; - /* Check our inputs. */ error = -EINVAL; + /* Check our inputs. */ sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN) goto out; + /* sm_reserved[] must be zero */ if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved))) goto out; - /* Do we know about this type of metadata? */ error = -ENOENT; + /* Do we know about this type of metadata? */ if (sm->sm_type >= XFS_SCRUB_TYPE_NR) goto out; ops = &meta_scrub_ops[sm->sm_type]; - if (ops->scrub == NULL) + if (ops->setup == NULL || ops->scrub == NULL) + goto out; + /* Does this fs even support this type of metadata? */ + if (ops->has && !ops->has(&mp->m_sb)) + goto out; + + error = -EINVAL; + /* restricting fields must be appropriate for type */ + switch (ops->type) { + case ST_NONE: + case ST_FS: + if (sm->sm_ino || sm->sm_gen || sm->sm_agno) + goto out; + break; + case ST_PERAG: + if (sm->sm_ino || sm->sm_gen || + sm->sm_agno >= mp->m_sb.sb_agcount) + goto out; + break; + case ST_INODE: + if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino)) + goto out; + break; + default: goto out; + } + error = -EOPNOTSUPP; /* * We won't scrub any filesystem that doesn't have the ability * to record unwritten extents. The option was made default in @@ -332,20 +376,46 @@ xfs_scrub_metadata( * We also don't support v1-v3 filesystems, which aren't * mountable. */ - error = -EOPNOTSUPP; if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) goto out; - /* Does this fs even support this type of metadata? */ - error = -ENOENT; - if (ops->has && !ops->has(&mp->m_sb)) - goto out; - /* We don't know how to repair anything yet. */ - error = -EOPNOTSUPP; if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) goto out; + error = 0; +out: + return error; +} + +/* Dispatch metadata scrubbing. */ +int +xfs_scrub_metadata( + struct xfs_inode *ip, + struct xfs_scrub_metadata *sm) +{ + struct xfs_scrub_context sc; + struct xfs_mount *mp = ip->i_mount; + bool try_harder = false; + int error = 0; + + BUILD_BUG_ON(sizeof(meta_scrub_ops) != + (sizeof(struct xfs_scrub_meta_ops) * XFS_SCRUB_TYPE_NR)); + + trace_xfs_scrub_start(ip, sm, error); + + /* Forbidden if we are shut down or mounted norecovery. */ + error = -ESHUTDOWN; + if (XFS_FORCED_SHUTDOWN(mp)) + goto out; + error = -ENOTRECOVERABLE; + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + goto out; + + error = xfs_scrub_validate_inputs(mp, sm); + if (error) + goto out; + xfs_scrub_experimental_warning(mp); retry_op: @@ -353,7 +423,7 @@ retry_op: memset(&sc, 0, sizeof(sc)); sc.mp = ip->i_mount; sc.sm = sm; - sc.ops = ops; + sc.ops = &meta_scrub_ops[sm->sm_type]; sc.try_harder = try_harder; sc.sa.agno = NULLAGNUMBER; error = sc.ops->setup(&sc, ip); diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index e9ec041cf713..0d92af86f67a 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -22,6 +22,14 @@ struct xfs_scrub_context; +/* Type info and names for the scrub types. */ +enum xfs_scrub_type { + ST_NONE = 1, /* disabled */ + ST_PERAG, /* per-AG metadata */ + ST_FS, /* per-FS metadata */ + ST_INODE, /* per-inode metadata */ +}; + struct xfs_scrub_meta_ops { /* Acquire whatever resources are needed for the operation. */ int (*setup)(struct xfs_scrub_context *, @@ -32,6 +40,9 @@ struct xfs_scrub_meta_ops { /* Decide if we even have this piece of metadata. */ bool (*has)(struct xfs_sb *); + + /* type describing required/allowed inputs */ + enum xfs_scrub_type type; }; /* Buffer pointers and btree cursors for an entire AG. */ @@ -112,4 +123,30 @@ xfs_scrub_quota(struct xfs_scrub_context *sc) } #endif +/* cross-referencing helpers */ +void xfs_scrub_xref_is_used_space(struct xfs_scrub_context *sc, + xfs_agblock_t agbno, xfs_extlen_t len); +void xfs_scrub_xref_is_not_inode_chunk(struct xfs_scrub_context *sc, + xfs_agblock_t agbno, xfs_extlen_t len); +void xfs_scrub_xref_is_inode_chunk(struct xfs_scrub_context *sc, + xfs_agblock_t agbno, xfs_extlen_t len); +void xfs_scrub_xref_is_owned_by(struct xfs_scrub_context *sc, + xfs_agblock_t agbno, xfs_extlen_t len, + struct xfs_owner_info *oinfo); +void xfs_scrub_xref_is_not_owned_by(struct xfs_scrub_context *sc, + xfs_agblock_t agbno, xfs_extlen_t len, + struct xfs_owner_info *oinfo); +void xfs_scrub_xref_has_no_owner(struct xfs_scrub_context *sc, + xfs_agblock_t agbno, xfs_extlen_t len); +void xfs_scrub_xref_is_cow_staging(struct xfs_scrub_context *sc, + xfs_agblock_t bno, xfs_extlen_t len); +void xfs_scrub_xref_is_not_shared(struct xfs_scrub_context *sc, + xfs_agblock_t bno, xfs_extlen_t len); +#ifdef CONFIG_XFS_RT +void xfs_scrub_xref_is_used_rt_space(struct xfs_scrub_context *sc, + xfs_rtblock_t rtbno, xfs_extlen_t len); +#else +# define xfs_scrub_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) +#endif + #endif /* __XFS_SCRUB_SCRUB_H__ */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 472080e75788..86daed0e3a45 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -26,7 +26,6 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_da_format.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_trans.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index c4ebfb5c1ee8..4dc896852bf0 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -50,7 +50,7 @@ DECLARE_EVENT_CLASS(xfs_scrub_class, __entry->flags = sm->sm_flags; __entry->error = error; ), - TP_printk("dev %d:%d ino %llu type %u agno %u inum %llu gen %u flags 0x%x error %d", + TP_printk("dev %d:%d ino 0x%llx type %u agno %u inum %llu gen %u flags 0x%x error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->type, @@ -90,7 +90,7 @@ TRACE_EVENT(xfs_scrub_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pF", + TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->agno, @@ -121,7 +121,7 @@ TRACE_EVENT(xfs_scrub_file_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino %llu fork %d type %u offset %llu error %d ret_ip %pF", + TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, @@ -156,7 +156,7 @@ DECLARE_EVENT_CLASS(xfs_scrub_block_error_class, __entry->bno = bno; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pF", + TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->agno, @@ -207,7 +207,7 @@ DECLARE_EVENT_CLASS(xfs_scrub_ino_error_class, __entry->bno = bno; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino %llu type %u agno %u agbno %u ret_ip %pF", + TP_printk("dev %d:%d ino 0x%llx type %u agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->type, @@ -246,7 +246,7 @@ DECLARE_EVENT_CLASS(xfs_scrub_fblock_error_class, __entry->offset = offset; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino %llu fork %d type %u offset %llu ret_ip %pF", + TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, @@ -277,7 +277,7 @@ TRACE_EVENT(xfs_scrub_incomplete, __entry->type = sc->sm->sm_type; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u ret_ip %pF", + TP_printk("dev %d:%d type %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->ret_ip) @@ -311,7 +311,7 @@ TRACE_EVENT(xfs_scrub_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pF", + TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->btnum, @@ -354,7 +354,7 @@ TRACE_EVENT(xfs_scrub_ifork_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino %llu fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pF", + TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, @@ -393,7 +393,7 @@ TRACE_EVENT(xfs_scrub_btree_error, __entry->ptr = cur->bc_ptrs[level]; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pF", + TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->btnum, @@ -433,7 +433,7 @@ TRACE_EVENT(xfs_scrub_ifork_btree_error, __entry->ptr = cur->bc_ptrs[level]; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino %llu fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pF", + TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, @@ -491,6 +491,28 @@ DEFINE_EVENT(xfs_scrub_sbtree_class, name, \ DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_rec); DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_key); +TRACE_EVENT(xfs_scrub_xref_error, + TP_PROTO(struct xfs_scrub_context *sc, int error, void *ret_ip), + TP_ARGS(sc, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %u xref error %d ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->error, + __entry->ret_ip) +); + #endif /* _TRACE_XFS_SCRUB_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a3eeaba156c5..9c6a830da0ee 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -390,6 +390,19 @@ xfs_map_blocks( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; + /* + * Truncate can race with writeback since writeback doesn't take the + * iolock and truncate decreases the file size before it starts + * truncating the pages between new_size and old_size. Therefore, we + * can end up in the situation where writeback gets a CoW fork mapping + * but the truncate makes the mapping invalid and we end up in here + * trying to get a new mapping. Bail out here so that we simply never + * get a valid mapping and so we drop the write altogether. The page + * truncation will kill the contents anyway. + */ + if (type == XFS_IO_COW && offset > i_size_read(inode)) + return 0; + ASSERT(type != XFS_IO_COW); if (type == XFS_IO_UNWRITTEN) bmapi_flags |= XFS_BMAPI_IGSTATE; @@ -399,7 +412,7 @@ xfs_map_blocks( (ip->i_df.if_flags & XFS_IFEXTENTS)); ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset + count > mp->m_super->s_maxbytes) + if (offset > mp->m_super->s_maxbytes - count) count = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -791,7 +804,7 @@ xfs_aops_discard_page( goto out_invalidate; xfs_alert(ip->i_mount, - "page discard on page %p, inode 0x%llx, offset %llu.", + "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", page, ip->i_ino, offset); xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -896,13 +909,13 @@ xfs_writepage_map( struct writeback_control *wbc, struct inode *inode, struct page *page, - loff_t offset, - uint64_t end_offset) + uint64_t end_offset) { LIST_HEAD(submit_list); struct xfs_ioend *ioend, *next; struct buffer_head *bh, *head; ssize_t len = i_blocksize(inode); + uint64_t offset; int error = 0; int count = 0; int uptodate = 1; @@ -1146,7 +1159,7 @@ xfs_do_writepage( end_offset = offset; } - return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset); + return xfs_writepage_map(wpc, wbc, inode, page, end_offset); redirty: redirty_page_for_writepage(wbc, page); @@ -1265,7 +1278,7 @@ xfs_map_trim_size( if (mapping_size > size) mapping_size = size; if (offset < i_size_read(inode) && - offset + mapping_size >= i_size_read(inode)) { + (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) { /* limit mapping to block that spans EOF */ mapping_size = roundup_64(i_size_read(inode) - offset, i_blocksize(inode)); @@ -1312,7 +1325,7 @@ xfs_get_blocks( lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset + size > mp->m_super->s_maxbytes) + if (offset > mp->m_super->s_maxbytes - size) size = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index dd136f7275e4..e5fb008d75e8 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -389,7 +389,8 @@ xfs_bud_init( int xfs_bui_recover( struct xfs_mount *mp, - struct xfs_bui_log_item *buip) + struct xfs_bui_log_item *buip, + struct xfs_defer_ops *dfops) { int error = 0; unsigned int bui_type; @@ -404,9 +405,7 @@ xfs_bui_recover( xfs_exntst_t state; struct xfs_trans *tp; struct xfs_inode *ip = NULL; - struct xfs_defer_ops dfops; struct xfs_bmbt_irec irec; - xfs_fsblock_t firstfsb; ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); @@ -464,7 +463,6 @@ xfs_bui_recover( if (VFS_I(ip)->i_nlink == 0) xfs_iflags_set(ip, XFS_IRECOVERY); - xfs_defer_init(&dfops, &firstfsb); /* Process deferred bmap item. */ state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? @@ -479,16 +477,16 @@ xfs_bui_recover( break; default: error = -EFSCORRUPTED; - goto err_dfops; + goto err_inode; } xfs_trans_ijoin(tp, ip, 0); count = bmap->me_len; - error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, + error = xfs_trans_log_finish_bmap_update(tp, budp, dfops, type, ip, whichfork, bmap->me_startoff, bmap->me_startblock, &count, state); if (error) - goto err_dfops; + goto err_inode; if (count > 0) { ASSERT(type == XFS_BMAP_UNMAP); @@ -496,16 +494,11 @@ xfs_bui_recover( irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff; irec.br_state = state; - error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); + error = xfs_bmap_unmap_extent(tp->t_mountp, dfops, ip, &irec); if (error) - goto err_dfops; + goto err_inode; } - /* Finish transaction, free inodes. */ - error = xfs_defer_finish(&tp, &dfops); - if (error) - goto err_dfops; - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -513,8 +506,6 @@ xfs_bui_recover( return error; -err_dfops: - xfs_defer_cancel(&dfops); err_inode: xfs_trans_cancel(tp); if (ip) { diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index c867daae4a3c..24b354a2c836 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -93,6 +93,7 @@ struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *, struct xfs_bui_log_item *); void xfs_bui_item_free(struct xfs_bui_log_item *); void xfs_bui_release(struct xfs_bui_log_item *); -int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip); +int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip, + struct xfs_defer_ops *dfops); #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 6d37ab43195f..c83f549dc17b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1872,7 +1872,7 @@ xfs_swap_extents( */ lock_two_nondirectories(VFS_I(ip), VFS_I(tip)); lock_flags = XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); + xfs_lock_two_inodes(ip, XFS_MMAPLOCK_EXCL, tip, XFS_MMAPLOCK_EXCL); /* Verify that both files have the same format */ if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { @@ -1919,7 +1919,7 @@ xfs_swap_extents( * Lock and join the inodes to the tansaction so that transaction commit * or cancel will unlock the inodes from this point onwards. */ - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL); lock_flags |= XFS_ILOCK_EXCL; xfs_trans_ijoin(tp, ip, 0); xfs_trans_ijoin(tp, tip, 0); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4db6e8d780f6..d1da2ee9e6db 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -236,6 +236,7 @@ _xfs_buf_alloc( init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); + INIT_LIST_HEAD(&bp->b_li_list); sema_init(&bp->b_sema, 0); /* held, no waiters */ spin_lock_init(&bp->b_lock); XB_SET_OWNER(bp); @@ -585,7 +586,7 @@ _xfs_buf_find( * returning a specific error on buffer lookup failures. */ xfs_alert(btp->bt_mount, - "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", + "%s: daddr 0x%llx out of range, EOFS 0x%llx", __func__, cmap.bm_bn, eofs); WARN_ON(1); return NULL; @@ -1180,13 +1181,14 @@ xfs_buf_ioend_async( } void -xfs_buf_ioerror( +__xfs_buf_ioerror( xfs_buf_t *bp, - int error) + int error, + xfs_failaddr_t failaddr) { ASSERT(error <= 0 && error >= -1000); bp->b_error = error; - trace_xfs_buf_ioerror(bp, error, _RET_IP_); + trace_xfs_buf_ioerror(bp, error, failaddr); } void @@ -1195,8 +1197,9 @@ xfs_buf_ioerror_alert( const char *func) { xfs_alert(bp->b_target->bt_mount, -"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", - (uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length); +"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d", + func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, + -bp->b_error); } int @@ -1378,9 +1381,10 @@ _xfs_buf_ioapply( */ if (xfs_sb_version_hascrc(&mp->m_sb)) { xfs_warn(mp, - "%s: no ops on block 0x%llx/0x%x", + "%s: no buf ops on daddr 0x%llx len %d", __func__, bp->b_bn, bp->b_length); - xfs_hex_dump(bp->b_addr, 64); + xfs_hex_dump(bp->b_addr, + XFS_CORRUPTION_DUMP_LEN); dump_stack(); } } @@ -1671,7 +1675,7 @@ xfs_wait_buftarg( list_del_init(&bp->b_lru); if (bp->b_flags & XBF_WRITE_FAIL) { xfs_alert(btp->bt_mount, -"Corruption Alert: Buffer at block 0x%llx had permanent write failures!", +"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", (long long)bp->b_bn); xfs_alert(btp->bt_mount, "Please run xfs_repair to determine the extent of the problem."); @@ -1815,22 +1819,27 @@ xfs_alloc_buftarg( btp->bt_daxdev = dax_dev; if (xfs_setsize_buftarg_early(btp, bdev)) - goto error; + goto error_free; if (list_lru_init(&btp->bt_lru)) - goto error; + goto error_free; if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) - goto error; + goto error_lru; btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - register_shrinker(&btp->bt_shrinker); + if (register_shrinker(&btp->bt_shrinker)) + goto error_pcpu; return btp; -error: +error_pcpu: + percpu_counter_destroy(&btp->bt_io_count); +error_lru: + list_lru_destroy(&btp->bt_lru); +error_free: kmem_free(btp); return NULL; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index f873bb786824..2f4c91452861 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -140,6 +140,7 @@ struct xfs_buf_ops { char *name; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); + xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp); }; typedef struct xfs_buf { @@ -175,7 +176,8 @@ typedef struct xfs_buf { struct workqueue_struct *b_ioend_wq; /* I/O completion wq */ xfs_buf_iodone_t b_iodone; /* I/O completion function */ struct completion b_iowait; /* queue for I/O waiters */ - void *b_fspriv; + void *b_log_item; + struct list_head b_li_list; /* Log items list head */ struct xfs_trans *b_transp; struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ @@ -315,7 +317,9 @@ extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); extern void xfs_buf_ioend(struct xfs_buf *bp); -extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, + xfs_failaddr_t failaddr); +#define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); extern void xfs_buf_submit(struct xfs_buf *bp); extern int xfs_buf_submit_wait(struct xfs_buf *bp); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index e0a0af0946f2..270ddb4d2313 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -61,14 +61,14 @@ xfs_buf_log_format_size( */ STATIC void xfs_buf_item_size_segment( - struct xfs_buf_log_item *bip, - struct xfs_buf_log_format *blfp, - int *nvecs, - int *nbytes) + struct xfs_buf_log_item *bip, + struct xfs_buf_log_format *blfp, + int *nvecs, + int *nbytes) { - struct xfs_buf *bp = bip->bli_buf; - int next_bit; - int last_bit; + struct xfs_buf *bp = bip->bli_buf; + int next_bit; + int last_bit; last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (last_bit == -1) @@ -218,12 +218,12 @@ xfs_buf_item_format_segment( uint offset, struct xfs_buf_log_format *blfp) { - struct xfs_buf *bp = bip->bli_buf; - uint base_size; - int first_bit; - int last_bit; - int next_bit; - uint nbits; + struct xfs_buf *bp = bip->bli_buf; + uint base_size; + int first_bit; + int last_bit; + int next_bit; + uint nbits; /* copy the flags across from the base format item */ blfp->blf_flags = bip->__bli_format.blf_flags; @@ -406,12 +406,12 @@ xfs_buf_item_unpin( int remove) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); - xfs_buf_t *bp = bip->bli_buf; - struct xfs_ail *ailp = lip->li_ailp; - int stale = bip->bli_flags & XFS_BLI_STALE; - int freed; + xfs_buf_t *bp = bip->bli_buf; + struct xfs_ail *ailp = lip->li_ailp; + int stale = bip->bli_flags & XFS_BLI_STALE; + int freed; - ASSERT(bp->b_fspriv == bip); + ASSERT(bp->b_log_item == bip); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_buf_item_unpin(bip); @@ -456,13 +456,14 @@ xfs_buf_item_unpin( */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_do_callbacks(bp); - bp->b_fspriv = NULL; + bp->b_log_item = NULL; + list_del_init(&bp->b_li_list); bp->b_iodone = NULL; } else { spin_lock(&ailp->xa_lock); xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - ASSERT(bp->b_fspriv == NULL); + ASSERT(bp->b_log_item == NULL); } xfs_buf_relse(bp); } else if (freed && remove) { @@ -722,18 +723,15 @@ xfs_buf_item_free_format( /* * Allocate a new buf log item to go with the given buffer. - * Set the buffer's b_fsprivate field to point to the new - * buf log item. If there are other item's attached to the - * buffer (see xfs_buf_attach_iodone() below), then put the - * buf log item at the front. + * Set the buffer's b_log_item field to point to the new + * buf log item. */ int xfs_buf_item_init( struct xfs_buf *bp, struct xfs_mount *mp) { - struct xfs_log_item *lip = bp->b_fspriv; - struct xfs_buf_log_item *bip; + struct xfs_buf_log_item *bip = bp->b_log_item; int chunks; int map_size; int error; @@ -741,13 +739,14 @@ xfs_buf_item_init( /* * Check to see if there is already a buf log item for - * this buffer. If there is, it is guaranteed to be - * the first. If we do already have one, there is + * this buffer. If we do already have one, there is * nothing to do here so return. */ ASSERT(bp->b_target->bt_mount == mp); - if (lip != NULL && lip->li_type == XFS_LI_BUF) + if (bip != NULL) { + ASSERT(bip->bli_item.li_type == XFS_LI_BUF); return 0; + } bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); @@ -781,13 +780,7 @@ xfs_buf_item_init( bip->bli_formats[i].blf_map_size = map_size; } - /* - * Put the buf item into the list of items attached to the - * buffer at the front. - */ - if (bp->b_fspriv) - bip->bli_item.li_bio_list = bp->b_fspriv; - bp->b_fspriv = bip; + bp->b_log_item = bip; xfs_buf_hold(bp); return 0; } @@ -880,7 +873,7 @@ xfs_buf_item_log_segment( */ void xfs_buf_item_log( - xfs_buf_log_item_t *bip, + struct xfs_buf_log_item *bip, uint first, uint last) { @@ -943,7 +936,7 @@ xfs_buf_item_dirty_format( STATIC void xfs_buf_item_free( - xfs_buf_log_item_t *bip) + struct xfs_buf_log_item *bip) { xfs_buf_item_free_format(bip); kmem_free(bip->bli_item.li_lv_shadow); @@ -961,13 +954,13 @@ void xfs_buf_item_relse( xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; trace_xfs_buf_item_relse(bp, _RET_IP_); ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); - bp->b_fspriv = bip->bli_item.li_bio_list; - if (bp->b_fspriv == NULL) + bp->b_log_item = NULL; + if (list_empty(&bp->b_li_list)) bp->b_iodone = NULL; xfs_buf_rele(bp); @@ -980,9 +973,7 @@ xfs_buf_item_relse( * to be called when the buffer's I/O completes. If it is not set * already, set the buffer's b_iodone() routine to be * xfs_buf_iodone_callbacks() and link the log item into the list of - * items rooted at b_fsprivate. Items are always added as the second - * entry in the list if there is a first, because the buf item code - * assumes that the buf log item is first. + * items rooted at b_li_list. */ void xfs_buf_attach_iodone( @@ -990,18 +981,10 @@ xfs_buf_attach_iodone( void (*cb)(xfs_buf_t *, xfs_log_item_t *), xfs_log_item_t *lip) { - xfs_log_item_t *head_lip; - ASSERT(xfs_buf_islocked(bp)); lip->li_cb = cb; - head_lip = bp->b_fspriv; - if (head_lip) { - lip->li_bio_list = head_lip->li_bio_list; - head_lip->li_bio_list = lip; - } else { - bp->b_fspriv = lip; - } + list_add_tail(&lip->li_bio_list, &bp->b_li_list); ASSERT(bp->b_iodone == NULL || bp->b_iodone == xfs_buf_iodone_callbacks); @@ -1011,12 +994,12 @@ xfs_buf_attach_iodone( /* * We can have many callbacks on a buffer. Running the callbacks individually * can cause a lot of contention on the AIL lock, so we allow for a single - * callback to be able to scan the remaining lip->li_bio_list for other items - * of the same type and callback to be processed in the first call. + * callback to be able to scan the remaining items in bp->b_li_list for other + * items of the same type and callback to be processed in the first call. * * As a result, the loop walking the callback list below will also modify the * list. it removes the first item from the list and then runs the callback. - * The loop then restarts from the new head of the list. This allows the + * The loop then restarts from the new first item int the list. This allows the * callback to scan and modify the list attached to the buffer and we don't * have to care about maintaining a next item pointer. */ @@ -1024,18 +1007,26 @@ STATIC void xfs_buf_do_callbacks( struct xfs_buf *bp) { + struct xfs_buf_log_item *blip = bp->b_log_item; struct xfs_log_item *lip; - while ((lip = bp->b_fspriv) != NULL) { - bp->b_fspriv = lip->li_bio_list; - ASSERT(lip->li_cb != NULL); + /* If there is a buf_log_item attached, run its callback */ + if (blip) { + lip = &blip->bli_item; + lip->li_cb(bp, lip); + } + + while (!list_empty(&bp->b_li_list)) { + lip = list_first_entry(&bp->b_li_list, struct xfs_log_item, + li_bio_list); + /* - * Clear the next pointer so we don't have any + * Remove the item from the list, so we don't have any * confusion if the item is added to another buf. * Don't touch the log item after calling its * callback, because it could have freed itself. */ - lip->li_bio_list = NULL; + list_del_init(&lip->li_bio_list); lip->li_cb(bp, lip); } } @@ -1052,13 +1043,22 @@ STATIC void xfs_buf_do_callbacks_fail( struct xfs_buf *bp) { - struct xfs_log_item *next; - struct xfs_log_item *lip = bp->b_fspriv; - struct xfs_ail *ailp = lip->li_ailp; + struct xfs_log_item *lip; + struct xfs_ail *ailp; + /* + * Buffer log item errors are handled directly by xfs_buf_item_push() + * and xfs_buf_iodone_callback_error, and they have no IO error + * callbacks. Check only for items in b_li_list. + */ + if (list_empty(&bp->b_li_list)) + return; + + lip = list_first_entry(&bp->b_li_list, struct xfs_log_item, + li_bio_list); + ailp = lip->li_ailp; spin_lock(&ailp->xa_lock); - for (; lip; lip = next) { - next = lip->li_bio_list; + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_ops->iop_error) lip->li_ops->iop_error(lip, bp); } @@ -1069,13 +1069,23 @@ static bool xfs_buf_iodone_callback_error( struct xfs_buf *bp) { - struct xfs_log_item *lip = bp->b_fspriv; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_log_item *lip; + struct xfs_mount *mp; static ulong lasttime; static xfs_buftarg_t *lasttarg; struct xfs_error_cfg *cfg; /* + * The failed buffer might not have a buf_log_item attached or the + * log_item list might be empty. Get the mp from the available + * xfs_log_item + */ + lip = list_first_entry_or_null(&bp->b_li_list, struct xfs_log_item, + li_bio_list); + mp = lip ? lip->li_mountp : bip->bli_item.li_mountp; + + /* * If we've already decided to shutdown the filesystem because of * I/O errors, there's no point in giving this a retry. */ @@ -1183,7 +1193,8 @@ xfs_buf_iodone_callbacks( bp->b_first_retry_time = 0; xfs_buf_do_callbacks(bp); - bp->b_fspriv = NULL; + bp->b_log_item = NULL; + list_del_init(&bp->b_li_list); bp->b_iodone = NULL; xfs_buf_ioend(bp); } @@ -1228,10 +1239,9 @@ xfs_buf_iodone( bool xfs_buf_resubmit_failed_buffers( struct xfs_buf *bp, - struct xfs_log_item *lip, struct list_head *buffer_list) { - struct xfs_log_item *next; + struct xfs_log_item *lip; /* * Clear XFS_LI_FAILED flag from all items before resubmit @@ -1239,10 +1249,8 @@ xfs_buf_resubmit_failed_buffers( * XFS_LI_FAILED set/clear is protected by xa_lock, caller this * function already have it acquired */ - for (; lip; lip = next) { - next = lip->li_bio_list; + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) xfs_clear_li_failed(lip); - } /* Add this buffer back to the delayed write list */ return xfs_buf_delwri_queue(bp, buffer_list); diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 9690ce62c9a7..643f53dcfe51 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -50,7 +50,7 @@ struct xfs_buf_log_item; * needed to log buffers. It tracks how many times the lock has been * locked, and which 128 byte chunks of the buffer are dirty. */ -typedef struct xfs_buf_log_item { +struct xfs_buf_log_item { xfs_log_item_t bli_item; /* common item structure */ struct xfs_buf *bli_buf; /* real buffer pointer */ unsigned int bli_flags; /* misc flags */ @@ -59,11 +59,11 @@ typedef struct xfs_buf_log_item { int bli_format_count; /* count of headers */ struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ struct xfs_buf_log_format __bli_format; /* embedded in-log header */ -} xfs_buf_log_item_t; +}; int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); -void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); +void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, xfs_log_item_t *), @@ -71,7 +71,6 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, - struct xfs_log_item *, struct list_head *); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 0c58918bc0ad..b6ae3597bfb0 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -152,7 +152,6 @@ xfs_dir2_block_getdents( struct xfs_inode *dp = args->dp; /* incore directory inode */ xfs_dir2_data_hdr_t *hdr; /* block header */ struct xfs_buf *bp; /* buffer for block */ - xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_dir2_data_unused_t *dup; /* block unused entry */ char *endptr; /* end of the data entries */ @@ -185,9 +184,8 @@ xfs_dir2_block_getdents( /* * Set up values for the loop. */ - btp = xfs_dir2_block_tail_p(geo, hdr); ptr = (char *)dp->d_ops->data_entry_p(hdr); - endptr = (char *)xfs_dir2_block_leaf_p(btp); + endptr = xfs_dir3_data_endp(geo, hdr); /* * Loop over the data portion of the block. diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index d57c2db64e59..43572f8a1b8e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -399,52 +399,6 @@ error0: return error; } -STATIC int -xfs_qm_dqrepair( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_dquot *dqp, - xfs_dqid_t firstid, - struct xfs_buf **bpp) -{ - int error; - struct xfs_disk_dquot *ddq; - struct xfs_dqblk *d; - int i; - - /* - * Read the buffer without verification so we get the corrupted - * buffer returned to us. make sure we verify it on write, though. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, - 0, bpp, NULL); - - if (error) { - ASSERT(*bpp == NULL); - return error; - } - (*bpp)->b_ops = &xfs_dquot_buf_ops; - - ASSERT(xfs_buf_islocked(*bpp)); - d = (struct xfs_dqblk *)(*bpp)->b_addr; - - /* Do the actual repair of dquots in this buffer */ - for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { - ddq = &d[i].dd_diskdq; - error = xfs_dqcheck(mp, ddq, firstid + i, - dqp->dq_flags & XFS_DQ_ALLTYPES, - XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); - if (error) { - /* repair failed, we're screwed */ - xfs_trans_brelse(tp, *bpp); - return -EIO; - } - } - - return 0; -} - /* * Maps a dquot to the buffer containing its on-disk version. * This returns a ptr to the buffer containing the on-disk dquot @@ -526,14 +480,6 @@ xfs_qm_dqtobp( dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); - - if (error == -EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { - xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * - mp->m_quotainfo->qi_dqperchunk; - ASSERT(bp == NULL); - error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp); - } - if (error) { ASSERT(bp == NULL); return error; @@ -970,14 +916,22 @@ xfs_qm_dqflush_done( * holding the lock before removing the dquot from the AIL. */ if ((lip->li_flags & XFS_LI_IN_AIL) && - lip->li_lsn == qip->qli_flush_lsn) { + ((lip->li_lsn == qip->qli_flush_lsn) || + (lip->li_flags & XFS_LI_FAILED))) { /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->xa_lock); - if (lip->li_lsn == qip->qli_flush_lsn) + if (lip->li_lsn == qip->qli_flush_lsn) { xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); - else + } else { + /* + * Clear the failed state since we are about to drop the + * flush lock + */ + if (lip->li_flags & XFS_LI_FAILED) + xfs_clear_li_failed(lip); spin_unlock(&ailp->xa_lock); + } } /* @@ -1002,6 +956,7 @@ xfs_qm_dqflush( struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; struct xfs_disk_dquot *ddqp; + xfs_failaddr_t fa; int error; ASSERT(XFS_DQ_IS_LOCKED(dqp)); @@ -1048,9 +1003,10 @@ xfs_qm_dqflush( /* * A simple sanity check in case we got a corrupted dquot.. */ - error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, - XFS_QMOPT_DOWARN, "dqflush (incore copy)"); - if (error) { + fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 0); + if (fa) { + xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", + be32_to_cpu(ddqp->d_id), fa); xfs_buf_relse(bp); xfs_dqfunlock(dqp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 2c7a1629e064..96eaa6933709 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -137,6 +137,23 @@ xfs_qm_dqunpin_wait( wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } +/* + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer + * have been failed during writeback + * + * this informs the AIL that the dquot is already flush locked on the next push, + * and acquires a hold on the buffer to ensure that it isn't reclaimed before + * dirty data makes it to disk. + */ +STATIC void +xfs_dquot_item_error( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + ASSERT(!completion_done(&DQUOT_ITEM(lip)->qli_dquot->q_flush)); + xfs_set_li_failed(lip, bp); +} + STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, @@ -144,13 +161,28 @@ xfs_qm_dquot_logitem_push( __acquires(&lip->li_ailp->xa_lock) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp = lip->li_buf; uint rval = XFS_ITEM_SUCCESS; int error; if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; + /* + * The buffer containing this item failed to be written back + * previously. Resubmit the buffer for IO + */ + if (lip->li_flags & XFS_LI_FAILED) { + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + + xfs_buf_unlock(bp); + return rval; + } + if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; @@ -177,7 +209,7 @@ xfs_qm_dquot_logitem_push( error = xfs_qm_dqflush(dqp, &bp); if (error) { - xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", + xfs_warn(dqp->q_mount, "%s: push error %d on dqp "PTR_FMT, __func__, error, dqp); } else { if (!xfs_buf_delwri_queue(bp, buffer_list)) @@ -242,7 +274,8 @@ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_unlock = xfs_qm_dquot_logitem_unlock, .iop_committed = xfs_qm_dquot_logitem_committed, .iop_push = xfs_qm_dquot_logitem_push, - .iop_committing = xfs_qm_dquot_logitem_committing + .iop_committing = xfs_qm_dquot_logitem_committing, + .iop_error = xfs_dquot_item_error }; /* diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 4c9f35d983b2..ccf520f0b00d 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -24,6 +24,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_sysfs.h" +#include "xfs_inode.h" #ifdef DEBUG @@ -314,12 +315,12 @@ xfs_error_report( struct xfs_mount *mp, const char *filename, int linenum, - void *ra) + xfs_failaddr_t failaddr) { if (level <= xfs_error_level) { xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, "Internal error %s at line %d of file %s. Caller %pS", - tag, linenum, filename, ra); + tag, linenum, filename, failaddr); xfs_stack_trace(); } @@ -333,11 +334,11 @@ xfs_corruption_error( void *p, const char *filename, int linenum, - void *ra) + xfs_failaddr_t failaddr) { if (level <= xfs_error_level) - xfs_hex_dump(p, 64); - xfs_error_report(tag, level, mp, filename, linenum, ra); + xfs_hex_dump(p, XFS_CORRUPTION_DUMP_LEN); + xfs_error_report(tag, level, mp, filename, linenum, failaddr); xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); } @@ -347,19 +348,62 @@ xfs_corruption_error( */ void xfs_verifier_error( - struct xfs_buf *bp) + struct xfs_buf *bp, + int error, + xfs_failaddr_t failaddr) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; + + fa = failaddr ? failaddr : __return_address; + __xfs_buf_ioerror(bp, error, fa); xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", - __return_address, bp->b_ops->name, bp->b_bn); + fa, bp->b_ops->name, bp->b_bn); xfs_alert(mp, "Unmount and run xfs_repair"); if (xfs_error_level >= XFS_ERRLEVEL_LOW) { - xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:"); - xfs_hex_dump(xfs_buf_offset(bp, 0), 64); + xfs_alert(mp, "First %d bytes of corrupted metadata buffer:", + XFS_CORRUPTION_DUMP_LEN); + xfs_hex_dump(xfs_buf_offset(bp, 0), XFS_CORRUPTION_DUMP_LEN); + } + + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} + +/* + * Warnings for inode corruption problems. Don't bother with the stack + * trace unless the error level is turned up high. + */ +void +xfs_inode_verifier_error( + struct xfs_inode *ip, + int error, + const char *name, + void *buf, + size_t bufsz, + xfs_failaddr_t failaddr) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_failaddr_t fa; + int sz; + + fa = failaddr ? failaddr : __return_address; + + xfs_alert(mp, "Metadata %s detected at %pS, inode 0x%llx %s", + error == -EFSBADCRC ? "CRC error" : "corruption", + fa, ip->i_ino, name); + + xfs_alert(mp, "Unmount and run xfs_repair"); + + if (buf && xfs_error_level >= XFS_ERRLEVEL_LOW) { + sz = min_t(size_t, XFS_CORRUPTION_DUMP_LEN, bufsz); + xfs_alert(mp, "First %d bytes of corrupted metadata buffer:", + sz); + xfs_hex_dump(buf, sz); } if (xfs_error_level >= XFS_ERRLEVEL_HIGH) diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index ea816c1bf8db..7e728c5a46b8 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -21,11 +21,16 @@ struct xfs_mount; extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, - const char *filename, int linenum, void *ra); + const char *filename, int linenum, + xfs_failaddr_t failaddr); extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, void *p, const char *filename, - int linenum, void *ra); -extern void xfs_verifier_error(struct xfs_buf *bp); + int linenum, xfs_failaddr_t failaddr); +extern void xfs_verifier_error(struct xfs_buf *bp, int error, + xfs_failaddr_t failaddr); +extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, + const char *name, void *buf, size_t bufsz, + xfs_failaddr_t failaddr); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) @@ -37,6 +42,9 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERRLEVEL_LOW 1 #define XFS_ERRLEVEL_HIGH 5 +/* Dump 128 bytes of any corrupt buffer */ +#define XFS_CORRUPTION_DUMP_LEN (128) + /* * Macros to set EFSCORRUPTED & return/branch. */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 44f8c5451210..64da90655e95 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -538,7 +538,7 @@ xfs_efi_recover( return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - xfs_rmap_skip_owner_update(&oinfo); + xfs_rmap_any_owner_update(&oinfo); for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &efip->efi_format.efi_extents[i]; error = xfs_trans_free_extent(tp, efdp, extp->ext_start, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 8f22fc579dbb..8b4545623e25 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -49,83 +49,6 @@ * File system operations */ -int -xfs_fs_geometry( - xfs_mount_t *mp, - xfs_fsop_geom_t *geo, - int new_version) -{ - - memset(geo, 0, sizeof(*geo)); - - geo->blocksize = mp->m_sb.sb_blocksize; - geo->rtextsize = mp->m_sb.sb_rextsize; - geo->agblocks = mp->m_sb.sb_agblocks; - geo->agcount = mp->m_sb.sb_agcount; - geo->logblocks = mp->m_sb.sb_logblocks; - geo->sectsize = mp->m_sb.sb_sectsize; - geo->inodesize = mp->m_sb.sb_inodesize; - geo->imaxpct = mp->m_sb.sb_imax_pct; - geo->datablocks = mp->m_sb.sb_dblocks; - geo->rtblocks = mp->m_sb.sb_rblocks; - geo->rtextents = mp->m_sb.sb_rextents; - geo->logstart = mp->m_sb.sb_logstart; - ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid)); - memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid)); - if (new_version >= 2) { - geo->sunit = mp->m_sb.sb_unit; - geo->swidth = mp->m_sb.sb_width; - } - if (new_version >= 3) { - geo->version = XFS_FSOP_GEOM_VERSION; - geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | - XFS_FSOP_GEOM_FLAGS_DIRV2 | - (xfs_sb_version_hasattr(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_ATTR : 0) | - (xfs_sb_version_hasquota(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_QUOTA : 0) | - (xfs_sb_version_hasalign(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_IALIGN : 0) | - (xfs_sb_version_hasdalign(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_DALIGN : 0) | - (xfs_sb_version_hasextflgbit(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) | - (xfs_sb_version_hassector(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | - (xfs_sb_version_hasasciici(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) | - (xfs_sb_version_haslazysbcount(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | - (xfs_sb_version_hasattr2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | - (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) | - (xfs_sb_version_hascrc(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_V5SB : 0) | - (xfs_sb_version_hasftype(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | - (xfs_sb_version_hasfinobt(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_FINOBT : 0) | - (xfs_sb_version_hassparseinodes(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_SPINODES : 0) | - (xfs_sb_version_hasrmapbt(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_RMAPBT : 0) | - (xfs_sb_version_hasreflink(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_REFLINK : 0); - geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? - mp->m_sb.sb_logsectsize : BBSIZE; - geo->rtsectsize = mp->m_sb.sb_blocksize; - geo->dirblocksize = mp->m_dir_geo->blksize; - } - if (new_version >= 4) { - geo->flags |= - (xfs_sb_version_haslogv2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_LOGV2 : 0); - geo->logsunit = mp->m_sb.sb_logsunit; - } - return 0; -} - static struct xfs_buf * xfs_growfs_get_hdr_buf( struct xfs_mount *mp, @@ -571,6 +494,11 @@ xfs_growfs_data_private( * this doesn't actually exist in the rmap btree. */ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); + error = xfs_rmap_free(tp, bp, agno, + be32_to_cpu(agf->agf_length) - new, + new, &oinfo); + if (error) + goto error0; error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, be32_to_cpu(agf->agf_length) - new), @@ -950,7 +878,7 @@ xfs_do_force_shutdown( if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { xfs_notice(mp, - "%s(0x%x) called from line %d of file %s. Return address = 0x%p", + "%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT, __func__, flags, lnnum, fname, __return_address); } /* diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 2954c13a3acd..20484ed5e919 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -18,7 +18,6 @@ #ifndef __XFS_FSOPS_H__ #define __XFS_FSOPS_H__ -extern int xfs_fs_geometry(xfs_mount_t *mp, xfs_fsop_geom_t *geo, int nversion); extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in); extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in); extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 43005fbe8b1e..d53a316162d6 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -37,6 +37,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/iversion.h> /* * Allocate and initialise an xfs_inode. @@ -293,15 +294,17 @@ xfs_reinit_inode( int error; uint32_t nlink = inode->i_nlink; uint32_t generation = inode->i_generation; - uint64_t version = inode->i_version; + uint64_t version = inode_peek_iversion(inode); umode_t mode = inode->i_mode; + dev_t dev = inode->i_rdev; error = inode_init_always(mp->m_super, inode); set_nlink(inode, nlink); inode->i_generation = generation; - inode->i_version = version; + inode_set_iversion_queried(inode, version); inode->i_mode = mode; + inode->i_rdev = dev; return error; } @@ -473,6 +476,11 @@ xfs_iget_cache_miss( if (error) goto out_destroy; + if (!xfs_inode_verify_forks(ip)) { + error = -EFSCORRUPTED; + goto out_destroy; + } + trace_xfs_iget_miss(ip); if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) { @@ -870,7 +878,7 @@ xfs_eofblocks_worker( * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). * (We'll just piggyback on the post-EOF prealloc space workqueue.) */ -STATIC void +void xfs_queue_cowblocks( struct xfs_mount *mp) { @@ -1536,8 +1544,23 @@ xfs_inode_free_quota_eofblocks( return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); } +static inline unsigned long +xfs_iflag_for_tag( + int tag) +{ + switch (tag) { + case XFS_ICI_EOFBLOCKS_TAG: + return XFS_IEOFBLOCKS; + case XFS_ICI_COWBLOCKS_TAG: + return XFS_ICOWBLOCKS; + default: + ASSERT(0); + return 0; + } +} + static void -__xfs_inode_set_eofblocks_tag( +__xfs_inode_set_blocks_tag( xfs_inode_t *ip, void (*execute)(struct xfs_mount *mp), void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -1552,10 +1575,10 @@ __xfs_inode_set_eofblocks_tag( * Don't bother locking the AG and looking up in the radix trees * if we already know that we have the tag set. */ - if (ip->i_flags & XFS_IEOFBLOCKS) + if (ip->i_flags & xfs_iflag_for_tag(tag)) return; spin_lock(&ip->i_flags_lock); - ip->i_flags |= XFS_IEOFBLOCKS; + ip->i_flags |= xfs_iflag_for_tag(tag); spin_unlock(&ip->i_flags_lock); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -1587,13 +1610,13 @@ xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_eofblocks_tag(ip); - return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks, + return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks, trace_xfs_perag_set_eofblocks, XFS_ICI_EOFBLOCKS_TAG); } static void -__xfs_inode_clear_eofblocks_tag( +__xfs_inode_clear_blocks_tag( xfs_inode_t *ip, void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, int error, unsigned long caller_ip), @@ -1603,7 +1626,7 @@ __xfs_inode_clear_eofblocks_tag( struct xfs_perag *pag; spin_lock(&ip->i_flags_lock); - ip->i_flags &= ~XFS_IEOFBLOCKS; + ip->i_flags &= ~xfs_iflag_for_tag(tag); spin_unlock(&ip->i_flags_lock); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -1630,33 +1653,20 @@ xfs_inode_clear_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_eofblocks_tag(ip); - return __xfs_inode_clear_eofblocks_tag(ip, + return __xfs_inode_clear_blocks_tag(ip, trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); } /* - * Automatic CoW Reservation Freeing - * - * These functions automatically garbage collect leftover CoW reservations - * that were made on behalf of a cowextsize hint when we start to run out - * of quota or when the reservations sit around for too long. If the file - * has dirty pages or is undergoing writeback, its CoW reservations will - * be retained. - * - * The actual garbage collection piggybacks off the same code that runs - * the speculative EOF preallocation garbage collector. + * Set ourselves up to free CoW blocks from this file. If it's already clean + * then we can bail out quickly, but otherwise we must back off if the file + * is undergoing some kind of write. */ -STATIC int -xfs_inode_free_cowblocks( +static bool +xfs_prep_free_cowblocks( struct xfs_inode *ip, - int flags, - void *args) + struct xfs_ifork *ifp) { - int ret; - struct xfs_eofblocks *eofb = args; - int match; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - /* * Just clear the tag if we have an empty cow fork or none at all. It's * possible the inode was fully unshared since it was originally tagged. @@ -1664,7 +1674,7 @@ xfs_inode_free_cowblocks( if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) { trace_xfs_inode_free_cowblocks_invalid(ip); xfs_inode_clear_cowblocks_tag(ip); - return 0; + return false; } /* @@ -1675,6 +1685,35 @@ xfs_inode_free_cowblocks( mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || atomic_read(&VFS_I(ip)->i_dio_count)) + return false; + + return true; +} + +/* + * Automatic CoW Reservation Freeing + * + * These functions automatically garbage collect leftover CoW reservations + * that were made on behalf of a cowextsize hint when we start to run out + * of quota or when the reservations sit around for too long. If the file + * has dirty pages or is undergoing writeback, its CoW reservations will + * be retained. + * + * The actual garbage collection piggybacks off the same code that runs + * the speculative EOF preallocation garbage collector. + */ +STATIC int +xfs_inode_free_cowblocks( + struct xfs_inode *ip, + int flags, + void *args) +{ + struct xfs_eofblocks *eofb = args; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + int match; + int ret = 0; + + if (!xfs_prep_free_cowblocks(ip, ifp)) return 0; if (eofb) { @@ -1695,7 +1734,12 @@ xfs_inode_free_cowblocks( xfs_ilock(ip, XFS_IOLOCK_EXCL); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); + /* + * Check again, nobody else should be able to dirty blocks or change + * the reflink iflag now that we have the first two locks held. + */ + if (xfs_prep_free_cowblocks(ip, ifp)) + ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -1724,7 +1768,7 @@ xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_cowblocks_tag(ip); - return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks, + return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks, trace_xfs_perag_set_cowblocks, XFS_ICI_COWBLOCKS_TAG); } @@ -1734,6 +1778,6 @@ xfs_inode_clear_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_cowblocks_tag(ip); - return __xfs_inode_clear_eofblocks_tag(ip, + return __xfs_inode_clear_blocks_tag(ip, trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index bff4d85e5498..d4a77588eca1 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -81,6 +81,7 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); void xfs_cowblocks_worker(struct work_struct *); +void xfs_queue_cowblocks(struct xfs_mount *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 61d1cb7dc10d..604ee384a00a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -16,6 +16,7 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/log2.h> +#include <linux/iversion.h> #include "xfs.h" #include "xfs_fs.h" @@ -546,23 +547,36 @@ again: /* * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - - * the iolock, the mmaplock or the ilock, but not more than one at a time. If we - * lock more than one at a time, lockdep will report false positives saying we - * have violated locking orders. + * the mmaplock or the ilock, but not more than one type at a time. If we lock + * more than one at a time, lockdep will report false positives saying we have + * violated locking orders. The iolock must be double-locked separately since + * we use i_rwsem for that. We now support taking one lock EXCL and the other + * SHARED. */ void xfs_lock_two_inodes( - xfs_inode_t *ip0, - xfs_inode_t *ip1, - uint lock_mode) + struct xfs_inode *ip0, + uint ip0_mode, + struct xfs_inode *ip1, + uint ip1_mode) { - xfs_inode_t *temp; + struct xfs_inode *temp; + uint mode_temp; int attempts = 0; xfs_log_item_t *lp; - ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); - if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) - ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + ASSERT(hweight32(ip0_mode) == 1); + ASSERT(hweight32(ip1_mode) == 1); + ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); + ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); + ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || + !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || + !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || + !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || + !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); ASSERT(ip0->i_ino != ip1->i_ino); @@ -570,10 +584,13 @@ xfs_lock_two_inodes( temp = ip0; ip0 = ip1; ip1 = temp; + mode_temp = ip0_mode; + ip0_mode = ip1_mode; + ip1_mode = mode_temp; } again: - xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); + xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0)); /* * If the first lock we have locked is in the AIL, we must TRY to get @@ -582,18 +599,17 @@ xfs_lock_two_inodes( */ lp = (xfs_log_item_t *)ip0->i_itemp; if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { - if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { - xfs_iunlock(ip0, lock_mode); + if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) { + xfs_iunlock(ip0, ip0_mode); if ((++attempts % 5) == 0) delay(1); /* Don't just spin the CPU */ goto again; } } else { - xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); + xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1)); } } - void __xfs_iflock( struct xfs_inode *ip) @@ -749,7 +765,6 @@ xfs_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, - int okalloc, xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { @@ -765,7 +780,7 @@ xfs_ialloc( * Call the space management code to pick * the on-disk inode to be allocated. */ - error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, + error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, ialloc_context, &ino); if (error) return error; @@ -833,7 +848,7 @@ xfs_ialloc( ip->i_d.di_flags = 0; if (ip->i_d.di_version == 3) { - inode->i_version = 1; + inode_set_iversion(inode, 1); ip->i_d.di_flags2 = 0; ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec; @@ -957,7 +972,6 @@ xfs_dir_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, /* project id */ - int okalloc, /* ok to allocate new space */ xfs_inode_t **ipp, /* pointer to inode; it will be locked. */ int *committed) @@ -988,8 +1002,8 @@ xfs_dir_ialloc( * transaction commit so that no other process can steal * the inode(s) that we've just allocated. */ - code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, - &ialloc_context, &ip); + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context, + &ip); /* * Return an error if we were unable to allocate a new inode. @@ -1061,7 +1075,7 @@ xfs_dir_ialloc( * this call should always succeed. */ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, - okalloc, &ialloc_context, &ip); + &ialloc_context, &ip); /* * If we get an error at this point, return to the caller @@ -1182,11 +1196,6 @@ xfs_create( xfs_flush_inodes(mp); error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); } - if (error == -ENOSPC) { - /* No space at all so try a "no-allocation" reservation */ - resblks = 0; - error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); - } if (error) goto out_release_inode; @@ -1203,19 +1212,13 @@ xfs_create( if (error) goto out_trans_cancel; - if (!resblks) { - error = xfs_dir_canenter(tp, dp, name); - if (error) - goto out_trans_cancel; - } - /* * A newly created regular or special file just has one directory * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, resblks > 0, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip, + NULL); if (error) goto out_trans_cancel; @@ -1340,11 +1343,6 @@ xfs_create_tmpfile( tres = &M_RES(mp)->tr_create_tmpfile; error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - /* No space at all so try a "no-allocation" reservation */ - resblks = 0; - error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); - } if (error) goto out_release_inode; @@ -1353,8 +1351,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, - prid, resblks > 0, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip, NULL); if (error) goto out_trans_cancel; @@ -1440,7 +1437,7 @@ xfs_link( if (error) goto std_return; - xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); + xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); @@ -1506,6 +1503,24 @@ xfs_link( return error; } +/* Clear the reflink flag and the cowblocks tag if possible. */ +static void +xfs_itruncate_clear_reflink_flags( + struct xfs_inode *ip) +{ + struct xfs_ifork *dfork; + struct xfs_ifork *cfork; + + if (!xfs_is_reflink_inode(ip)) + return; + dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK); + if (dfork->if_bytes == 0 && cfork->if_bytes == 0) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + if (cfork->if_bytes == 0) + xfs_inode_clear_cowblocks_tag(ip); +} + /* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and @@ -1602,15 +1617,7 @@ xfs_itruncate_extents( if (error) goto out; - /* - * Clear the reflink flag if there are no data fork blocks and - * there are no extents staged in the cow fork. - */ - if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) { - if (ip->i_d.di_nblocks == 0) - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; - xfs_inode_clear_cowblocks_tag(ip); - } + xfs_itruncate_clear_reflink_flags(ip); /* * Always re-log the inode so that our permanent transaction can keep @@ -2223,7 +2230,7 @@ xfs_ifree_cluster( xfs_buf_t *bp; xfs_inode_t *ip; xfs_inode_log_item_t *iip; - xfs_log_item_t *lip; + struct xfs_log_item *lip; struct xfs_perag *pag; xfs_ino_t inum; @@ -2281,8 +2288,7 @@ xfs_ifree_cluster( * stale first, we will not attempt to lock them in the loop * below as the XFS_ISTALE flag will be set. */ - lip = bp->b_fspriv; - while (lip) { + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_type == XFS_LI_INODE) { iip = (xfs_inode_log_item_t *)lip; ASSERT(iip->ili_logged == 1); @@ -2292,7 +2298,6 @@ xfs_ifree_cluster( &iip->ili_item.li_lsn); xfs_iflags_set(iip->ili_inode, XFS_ISTALE); } - lip = lip->li_bio_list; } @@ -2401,6 +2406,24 @@ retry: } /* + * Free any local-format buffers sitting around before we reset to + * extents format. + */ +static inline void +xfs_ifree_local_data( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return; + + ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); +} + +/* * This is called to return an inode to the inode free list. * The inode should already be truncated to 0 length and have * no pages associated with it. This routine also assumes that @@ -2437,8 +2460,12 @@ xfs_ifree( if (error) return error; + xfs_ifree_local_data(ip, XFS_DATA_FORK); + xfs_ifree_local_data(ip, XFS_ATTR_FORK); + VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; + ip->i_d.di_flags2 = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; @@ -2574,7 +2601,7 @@ xfs_remove( goto std_return; } - xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); + xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -3467,6 +3494,36 @@ abort_out: return error; } +/* + * If there are inline format data / attr forks attached to this inode, + * make sure they're not corrupt. + */ +bool +xfs_inode_verify_forks( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp; + xfs_failaddr_t fa; + + fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops); + if (fa) { + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", + ifp->if_u1.if_data, ifp->if_bytes, fa); + return false; + } + + fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops); + if (fa) { + ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", + ifp ? ifp->if_u1.if_data : NULL, + ifp ? ifp->if_bytes : 0, fa); + return false; + } + return true; +} + STATIC int xfs_iflush_int( struct xfs_inode *ip, @@ -3489,7 +3546,7 @@ xfs_iflush_int( if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", + "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto corrupt_out; } @@ -3499,7 +3556,7 @@ xfs_iflush_int( (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad regular inode %Lu, ptr 0x%p", + "%s: Bad regular inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto corrupt_out; } @@ -3510,7 +3567,7 @@ xfs_iflush_int( (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad directory inode %Lu, ptr 0x%p", + "%s: Bad directory inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto corrupt_out; } @@ -3519,7 +3576,7 @@ xfs_iflush_int( ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %Lu, " - "total extents = %d, nblocks = %Ld, ptr 0x%p", + "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_d.di_nextents + ip->i_d.di_anextents, ip->i_d.di_nblocks, ip); @@ -3528,7 +3585,7 @@ xfs_iflush_int( if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", + "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_d.di_forkoff, ip); goto corrupt_out; } @@ -3545,10 +3602,8 @@ xfs_iflush_int( if (ip->i_d.di_version < 3) ip->i_d.di_flushiter++; - /* Check the inline directory data. */ - if (S_ISDIR(VFS_I(ip)->i_mode) && - ip->i_d.di_format == XFS_DINODE_FMT_LOCAL && - xfs_dir2_sf_verify(ip)) + /* Check the inline fork data before we write out. */ + if (!xfs_inode_verify_forks(ip)) goto corrupt_out; /* @@ -3611,7 +3666,7 @@ xfs_iflush_int( /* generate the checksum. */ xfs_dinode_calc_crc(mp, dip); - ASSERT(bp->b_fspriv != NULL); + ASSERT(!list_empty(&bp->b_li_list)); ASSERT(bp->b_iodone != NULL); return 0; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index cc13c3763721..3e8dc990d41c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -232,6 +232,7 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) * log recovery to replay a bmap operation on the inode. */ #define XFS_IRECOVERY (1 << 11) +#define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */ /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -422,13 +423,14 @@ void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) int xfs_iflush(struct xfs_inode *, struct xfs_buf **); -void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); +void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, + struct xfs_inode *ip1, uint ip1_mode); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, - xfs_nlink_t, dev_t, prid_t, int, + xfs_nlink_t, dev_t, prid_t, struct xfs_inode **, int *); /* from xfs_file.c */ @@ -490,4 +492,6 @@ extern struct kmem_zone *xfs_inode_zone; /* The default CoW extent size hint. */ #define XFS_DEFAULT_COWEXTSZ_HINT 32 +bool xfs_inode_verify_forks(struct xfs_inode *ip); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6ee5c3bf19ad..d5037f060d6f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -30,6 +30,7 @@ #include "xfs_buf_item.h" #include "xfs_log.h" +#include <linux/iversion.h> kmem_zone_t *xfs_ili_zone; /* inode log item zone */ @@ -354,7 +355,7 @@ xfs_inode_to_log_dinode( to->di_next_unlinked = NULLAGINO; if (from->di_version == 3) { - to->di_changecount = inode->i_version; + to->di_changecount = inode_peek_iversion(inode); to->di_crtime.t_sec = from->di_crtime.t_sec; to->di_crtime.t_nsec = from->di_crtime.t_nsec; to->di_flags2 = from->di_flags2; @@ -521,7 +522,7 @@ xfs_inode_item_push( if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; - if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_unlock(bp); @@ -712,37 +713,23 @@ xfs_iflush_done( struct xfs_log_item *lip) { struct xfs_inode_log_item *iip; - struct xfs_log_item *blip; - struct xfs_log_item *next; - struct xfs_log_item *prev; + struct xfs_log_item *blip, *n; struct xfs_ail *ailp = lip->li_ailp; int need_ail = 0; + LIST_HEAD(tmp); /* * Scan the buffer IO completions for other inodes being completed and * attach them to the current inode log item. */ - blip = bp->b_fspriv; - prev = NULL; - while (blip != NULL) { - if (blip->li_cb != xfs_iflush_done) { - prev = blip; - blip = blip->li_bio_list; - continue; - } - /* remove from list */ - next = blip->li_bio_list; - if (!prev) { - bp->b_fspriv = next; - } else { - prev->li_bio_list = next; - } + list_add_tail(&lip->li_bio_list, &tmp); - /* add to current list */ - blip->li_bio_list = lip->li_bio_list; - lip->li_bio_list = blip; + list_for_each_entry_safe(blip, n, &bp->b_li_list, li_bio_list) { + if (lip->li_cb != xfs_iflush_done) + continue; + list_move_tail(&blip->li_bio_list, &tmp); /* * while we have the item, do the unlocked check for needing * the AIL lock. @@ -751,8 +738,6 @@ xfs_iflush_done( if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || (blip->li_flags & XFS_LI_FAILED)) need_ail++; - - blip = next; } /* make sure we capture the state of the initial inode. */ @@ -775,7 +760,7 @@ xfs_iflush_done( /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->xa_lock); - for (blip = lip; blip; blip = blip->li_bio_list) { + list_for_each_entry(blip, &tmp, li_bio_list) { if (INODE_ITEM(blip)->ili_logged && blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) mlip_changed |= xfs_ail_delete_one(ailp, blip); @@ -801,15 +786,14 @@ xfs_iflush_done( * ili_last_fields bits now that we know that the data corresponding to * them is safely on disk. */ - for (blip = lip; blip; blip = next) { - next = blip->li_bio_list; - blip->li_bio_list = NULL; - + list_for_each_entry_safe(blip, n, &tmp, li_bio_list) { + list_del_init(&blip->li_bio_list); iip = INODE_ITEM(blip); iip->ili_logged = 0; iip->ili_last_fields = 0; xfs_ifunlock(iip->ili_inode); } + list_del(&tmp); } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 20dc65fef6a4..89fb1eb80aae 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -45,6 +45,7 @@ #include <linux/fsmap.h> #include "xfs_fsmap.h" #include "scrub/xfs_scrub.h" +#include "xfs_sb.h" #include <linux/capability.h> #include <linux/cred.h> @@ -809,7 +810,7 @@ xfs_ioc_fsgeometry_v1( xfs_fsop_geom_t fsgeo; int error; - error = xfs_fs_geometry(mp, &fsgeo, 3); + error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3); if (error) return error; @@ -831,7 +832,7 @@ xfs_ioc_fsgeometry( xfs_fsop_geom_t fsgeo; int error; - error = xfs_fs_geometry(mp, &fsgeo, 4); + error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 4); if (error) return error; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 35c79e246fde..10fbde359649 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -37,6 +37,7 @@ #include "xfs_ioctl.h" #include "xfs_ioctl32.h" #include "xfs_trace.h" +#include "xfs_sb.h" #define _NATIVE_IOC(cmd, type) \ _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) @@ -66,7 +67,7 @@ xfs_compat_ioc_fsgeometry_v1( xfs_fsop_geom_t fsgeo; int error; - error = xfs_fs_geometry(mp, &fsgeo, 3); + error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3); if (error) return error; /* The 32-bit variant simply has some padding at the end */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 33eb4fb2e3fd..66e1edbfb2b2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1006,7 +1006,7 @@ xfs_file_iomap_begin( } ASSERT(offset <= mp->m_super->s_maxbytes); - if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) + if (offset > mp->m_super->s_maxbytes - length) length = mp->m_super->s_maxbytes - offset; offset_fsb = XFS_B_TO_FSBT(mp, offset); end_fsb = XFS_B_TO_FSB(mp, offset + length); @@ -1213,7 +1213,7 @@ xfs_xattr_iomap_begin( ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, - &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK); + &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: xfs_iunlock(ip, lockmode); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 99562ec0de56..bee51a14a906 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -285,8 +285,22 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) #define XFS_IS_REALTIME_INODE(ip) \ (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) && \ (ip)->i_mount->m_rtdev_targp) +#define XFS_IS_REALTIME_MOUNT(mp) ((mp)->m_rtdev_targp ? 1 : 0) #else #define XFS_IS_REALTIME_INODE(ip) (0) +#define XFS_IS_REALTIME_MOUNT(mp) (0) +#endif + +/* + * Starting in Linux 4.15, the %p (raw pointer value) printk modifier + * prints a hashed version of the pointer to avoid leaking kernel + * pointers into dmesg. If we're trying to debug the kernel we want the + * raw values, so override this behavior as best we can. + */ +#ifdef DEBUG +# define PTR_FMT "%px" +#else +# define PTR_FMT "%p" #endif #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 38d4227895ae..3e5ba1ecc080 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -781,17 +781,17 @@ xfs_log_mount_finish( * something to an unlinked inode, the irele won't cause * premature truncation and freeing of the inode, which results * in log recovery failure. We have to evict the unreferenced - * lru inodes after clearing MS_ACTIVE because we don't + * lru inodes after clearing SB_ACTIVE because we don't * otherwise clean up the lru if there's a subsequent failure in * xfs_mountfs, which leads to us leaking the inodes if nothing * else (e.g. quotacheck) references the inodes before the * mount failure occurs. */ - mp->m_super->s_flags |= MS_ACTIVE; + mp->m_super->s_flags |= SB_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); - mp->m_super->s_flags &= ~MS_ACTIVE; + mp->m_super->s_flags &= ~SB_ACTIVE; evict_inodes(mp->m_super); /* @@ -1047,6 +1047,7 @@ xfs_log_item_init( INIT_LIST_HEAD(&item->li_ail); INIT_LIST_HEAD(&item->li_cil); + INIT_LIST_HEAD(&item->li_bio_list); } /* @@ -1242,7 +1243,7 @@ xlog_space_left( static void xlog_iodone(xfs_buf_t *bp) { - struct xlog_in_core *iclog = bp->b_fspriv; + struct xlog_in_core *iclog = bp->b_log_item; struct xlog *l = iclog->ic_log; int aborted = 0; @@ -1773,7 +1774,7 @@ STATIC int xlog_bdstrat( struct xfs_buf *bp) { - struct xlog_in_core *iclog = bp->b_fspriv; + struct xlog_in_core *iclog = bp->b_log_item; xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { @@ -1919,7 +1920,7 @@ xlog_sync( } bp->b_io_length = BTOBB(count); - bp->b_fspriv = iclog; + bp->b_log_item = iclog; bp->b_flags &= ~XBF_FLUSH; bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); @@ -1958,7 +1959,7 @@ xlog_sync( XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ xfs_buf_associate_memory(bp, (char *)&iclog->ic_header + count, split); - bp->b_fspriv = iclog; + bp->b_log_item = iclog; bp->b_flags &= ~XBF_FLUSH; bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); @@ -2117,7 +2118,9 @@ xlog_print_trans( /* dump core transaction and ticket info */ xfs_warn(mp, "transaction summary:"); - xfs_warn(mp, " flags = 0x%x", tp->t_flags); + xfs_warn(mp, " log res = %d", tp->t_log_res); + xfs_warn(mp, " log count = %d", tp->t_log_count); + xfs_warn(mp, " flags = 0x%x", tp->t_flags); xlog_print_tic_res(mp, tp->t_ticket); @@ -2242,7 +2245,7 @@ xlog_write_setup_ophdr( break; default: xfs_warn(log->l_mp, - "Bad XFS transaction clientid 0x%x in ticket 0x%p", + "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT, ophdr->oh_clientid, ticket); return NULL; } @@ -3924,7 +3927,7 @@ xlog_verify_iclog( } if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) xfs_warn(log->l_mp, - "%s: invalid clientid %d op 0x%p offset 0x%lx", + "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx", __func__, clientid, ophead, (unsigned long)field_offset); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 87b1c331f9eb..00240c9ee72e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -24,6 +24,7 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" @@ -399,9 +400,9 @@ xlog_recover_iodone( * On v5 supers, a bli could be attached to update the metadata LSN. * Clean it up. */ - if (bp->b_fspriv) + if (bp->b_log_item) xfs_buf_item_relse(bp); - ASSERT(bp->b_fspriv == NULL); + ASSERT(bp->b_log_item == NULL); bp->b_iodone = NULL; xfs_buf_ioend(bp); @@ -2217,7 +2218,7 @@ xlog_recover_do_inode_buffer( next_unlinked_offset - reg_buf_offset; if (unlikely(*logged_nextp == 0)) { xfs_alert(mp, - "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). " + "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). " "Trying to replay bad (0) inode di_next_unlinked field.", item, bp); XFS_ERROR_REPORT("xlog_recover_do_inode_buf", @@ -2629,7 +2630,7 @@ xlog_recover_validate_buf_type( ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); bp->b_iodone = xlog_recover_iodone; xfs_buf_item_init(bp, mp); - bip = bp->b_fspriv; + bip = bp->b_log_item; bip->bli_item.li_lsn = current_lsn; } } @@ -2651,7 +2652,7 @@ xlog_recover_do_reg_buffer( int i; int bit; int nbits; - int error; + xfs_failaddr_t fa; trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); @@ -2686,7 +2687,7 @@ xlog_recover_do_reg_buffer( * the first dquot in the buffer should do. XXXThis is * probably a good thing to do for other buf types also. */ - error = 0; + fa = NULL; if (buf_f->blf_flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { if (item->ri_buf[i].i_addr == NULL) { @@ -2700,11 +2701,14 @@ xlog_recover_do_reg_buffer( item->ri_buf[i].i_len, __func__); goto next; } - error = xfs_dqcheck(mp, item->ri_buf[i].i_addr, - -1, 0, XFS_QMOPT_DOWARN, - "dquot_buf_recover"); - if (error) + fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, + -1, 0, 0); + if (fa) { + xfs_alert(mp, + "dquot corrupt at %pS trying to replay into block 0x%llx", + fa, bp->b_bn); goto next; + } } memcpy(xfs_buf_offset(bp, @@ -2956,6 +2960,10 @@ xfs_recover_inode_owner_change( if (error) goto out_free_ip; + if (!xfs_inode_verify_forks(ip)) { + error = -EFSCORRUPTED; + goto out_free_ip; + } if (in_f->ilf_fields & XFS_ILOG_DOWNER) { ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); @@ -3041,7 +3049,7 @@ xlog_recover_inode_pass2( */ if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { xfs_alert(mp, - "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", + "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", __func__, dip, bp, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); @@ -3051,7 +3059,7 @@ xlog_recover_inode_pass2( ldip = item->ri_buf[1].i_addr; if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, - "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", + "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", __func__, item, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); @@ -3109,8 +3117,8 @@ xlog_recover_inode_pass2( XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, - "%s: Bad regular inode log record, rec ptr 0x%p, " - "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + "%s: Bad regular inode log record, rec ptr "PTR_FMT", " + "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; @@ -3122,8 +3130,8 @@ xlog_recover_inode_pass2( XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, - "%s: Bad dir inode log record, rec ptr 0x%p, " - "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + "%s: Bad dir inode log record, rec ptr "PTR_FMT", " + "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; @@ -3133,8 +3141,8 @@ xlog_recover_inode_pass2( XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, - "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " - "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", + "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " + "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", __func__, item, dip, bp, in_f->ilf_ino, ldip->di_nextents + ldip->di_anextents, ldip->di_nblocks); @@ -3145,8 +3153,8 @@ xlog_recover_inode_pass2( XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, - "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " - "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, + "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " + "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); error = -EFSCORRUPTED; goto out_release; @@ -3156,7 +3164,7 @@ xlog_recover_inode_pass2( XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, - "%s: Bad inode log record length %d, rec ptr 0x%p", + "%s: Bad inode log record length %d, rec ptr "PTR_FMT, __func__, item->ri_buf[1].i_len, item); error = -EFSCORRUPTED; goto out_release; @@ -3302,6 +3310,7 @@ xlog_recover_dquot_pass2( xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; struct xfs_disk_dquot *ddq, *recddq; + xfs_failaddr_t fa; int error; xfs_dq_logformat_t *dq_f; uint type; @@ -3344,10 +3353,12 @@ xlog_recover_dquot_pass2( */ dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); - error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_dquot_pass2 (log copy)"); - if (error) + fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0, 0); + if (fa) { + xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", + dq_f->qlf_id, fa); return -EIO; + } ASSERT(dq_f->qlf_len == 1); /* @@ -4716,7 +4727,8 @@ STATIC int xlog_recover_process_cui( struct xfs_mount *mp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct xfs_defer_ops *dfops) { struct xfs_cui_log_item *cuip; int error; @@ -4729,7 +4741,7 @@ xlog_recover_process_cui( return 0; spin_unlock(&ailp->xa_lock); - error = xfs_cui_recover(mp, cuip); + error = xfs_cui_recover(mp, cuip, dfops); spin_lock(&ailp->xa_lock); return error; @@ -4756,7 +4768,8 @@ STATIC int xlog_recover_process_bui( struct xfs_mount *mp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct xfs_defer_ops *dfops) { struct xfs_bui_log_item *buip; int error; @@ -4769,7 +4782,7 @@ xlog_recover_process_bui( return 0; spin_unlock(&ailp->xa_lock); - error = xfs_bui_recover(mp, buip); + error = xfs_bui_recover(mp, buip, dfops); spin_lock(&ailp->xa_lock); return error; @@ -4805,6 +4818,46 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip) } } +/* Take all the collected deferred ops and finish them in order. */ +static int +xlog_finish_defer_ops( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops) +{ + struct xfs_trans *tp; + int64_t freeblks; + uint resblks; + int error; + + /* + * We're finishing the defer_ops that accumulated as a result of + * recovering unfinished intent items during log recovery. We + * reserve an itruncate transaction because it is the largest + * permanent transaction type. Since we're the only user of the fs + * right now, take 93% (15/16) of the available free blocks. Use + * weird math to avoid a 64-bit division. + */ + freeblks = percpu_counter_sum(&mp->m_fdblocks); + if (freeblks <= 0) + return -ENOSPC; + resblks = min_t(int64_t, UINT_MAX, freeblks); + resblks = (resblks * 15) >> 4; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, + 0, XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + error = xfs_defer_finish(&tp, dfops); + if (error) + goto out_cancel; + + return xfs_trans_commit(tp); + +out_cancel: + xfs_trans_cancel(tp); + return error; +} + /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. What we do now @@ -4825,10 +4878,12 @@ STATIC int xlog_recover_process_intents( struct xlog *log) { - struct xfs_log_item *lip; - int error = 0; + struct xfs_defer_ops dfops; struct xfs_ail_cursor cur; + struct xfs_log_item *lip; struct xfs_ail *ailp; + xfs_fsblock_t firstfsb; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; #endif @@ -4839,6 +4894,7 @@ xlog_recover_process_intents( #if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif + xfs_defer_init(&dfops, &firstfsb); while (lip != NULL) { /* * We're done when we see something other than an intent. @@ -4859,6 +4915,12 @@ xlog_recover_process_intents( */ ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); + /* + * NOTE: If your intent processing routine can create more + * deferred ops, you /must/ attach them to the dfops in this + * routine or else those subsequent intents will get + * replayed in the wrong order! + */ switch (lip->li_type) { case XFS_LI_EFI: error = xlog_recover_process_efi(log->l_mp, ailp, lip); @@ -4867,10 +4929,12 @@ xlog_recover_process_intents( error = xlog_recover_process_rui(log->l_mp, ailp, lip); break; case XFS_LI_CUI: - error = xlog_recover_process_cui(log->l_mp, ailp, lip); + error = xlog_recover_process_cui(log->l_mp, ailp, lip, + &dfops); break; case XFS_LI_BUI: - error = xlog_recover_process_bui(log->l_mp, ailp, lip); + error = xlog_recover_process_bui(log->l_mp, ailp, lip, + &dfops); break; } if (error) @@ -4880,6 +4944,11 @@ xlog_recover_process_intents( out: xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); + if (error) + xfs_defer_cancel(&dfops); + else + error = xlog_finish_defer_ops(log->l_mp, &dfops); + return error; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c879b517cc94..98fd41cbb9e1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -162,6 +162,7 @@ xfs_free_perag( ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); xfs_buf_hash_destroy(pag); + mutex_destroy(&pag->pag_ici_reclaim_lock); call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -248,6 +249,7 @@ xfs_initialize_perag( out_hash_destroy: xfs_buf_hash_destroy(pag); out_free_pag: + mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ @@ -256,6 +258,7 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); + mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); } return error; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 010a13a201aa..5b848f4b637f 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -48,7 +48,7 @@ STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); - +STATIC void xfs_qm_destroy_quotainos(xfs_quotainfo_t *qi); STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); /* * We use the batch lookup interface to iterate over the dquots as it @@ -162,7 +162,7 @@ xfs_qm_dqpurge( */ error = xfs_qm_dqflush(dqp, &bp); if (error) { - xfs_warn(mp, "%s: dquot %p flush failed", + xfs_warn(mp, "%s: dquot "PTR_FMT" flush failed", __func__, dqp); } else { error = xfs_bwrite(bp); @@ -291,8 +291,7 @@ xfs_qm_dqattach_one( * exist on disk and we didn't ask it to allocate; ESRCH if quotas got * turned off suddenly. */ - error = xfs_qm_dqget(ip->i_mount, ip, id, type, - doalloc | XFS_QMOPT_DOWARN, &dqp); + error = xfs_qm_dqget(ip->i_mount, ip, id, type, doalloc, &dqp); if (error) return error; @@ -481,7 +480,7 @@ xfs_qm_dquot_isolate( error = xfs_qm_dqflush(dqp, &bp); if (error) { - xfs_warn(dqp->q_mount, "%s: dquot %p flush failed", + xfs_warn(dqp->q_mount, "%s: dquot "PTR_FMT" flush failed", __func__, dqp); goto out_unlock_dirty; } @@ -574,7 +573,7 @@ xfs_qm_set_defquota( struct xfs_def_quota *defq; int error; - error = xfs_qm_dqread(mp, 0, type, XFS_QMOPT_DOWARN, &dqp); + error = xfs_qm_dqread(mp, 0, type, 0, &dqp); if (!error) { xfs_disk_dquot_t *ddqp = &dqp->q_core; @@ -652,7 +651,7 @@ xfs_qm_init_quotainfo( XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : XFS_DQ_PROJ), - XFS_QMOPT_DOWARN, &dqp); + 0, &dqp); if (!error) { xfs_disk_dquot_t *ddqp = &dqp->q_core; @@ -695,9 +694,17 @@ xfs_qm_init_quotainfo( qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; qinf->qi_shrinker.seeks = DEFAULT_SEEKS; qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; - register_shrinker(&qinf->qi_shrinker); + + error = register_shrinker(&qinf->qi_shrinker); + if (error) + goto out_free_inos; + return 0; +out_free_inos: + mutex_destroy(&qinf->qi_quotaofflock); + mutex_destroy(&qinf->qi_tree_lock); + xfs_qm_destroy_quotainos(qinf); out_free_lru: list_lru_destroy(&qinf->qi_lru); out_free_qinf: @@ -706,7 +713,6 @@ out_free_qinf: return error; } - /* * Gets called when unmounting a filesystem or when all quotas get * turned off. @@ -723,19 +729,8 @@ xfs_qm_destroy_quotainfo( unregister_shrinker(&qi->qi_shrinker); list_lru_destroy(&qi->qi_lru); - - if (qi->qi_uquotaip) { - IRELE(qi->qi_uquotaip); - qi->qi_uquotaip = NULL; /* paranoia */ - } - if (qi->qi_gquotaip) { - IRELE(qi->qi_gquotaip); - qi->qi_gquotaip = NULL; - } - if (qi->qi_pquotaip) { - IRELE(qi->qi_pquotaip); - qi->qi_pquotaip = NULL; - } + xfs_qm_destroy_quotainos(qi); + mutex_destroy(&qi->qi_tree_lock); mutex_destroy(&qi->qi_quotaofflock); kmem_free(qi); mp->m_quotainfo = NULL; @@ -793,8 +788,8 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { - error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, - &committed); + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip, + &committed); if (error) { xfs_trans_cancel(tp); return error; @@ -847,6 +842,7 @@ xfs_qm_reset_dqcounts( { struct xfs_dqblk *dqb; int j; + xfs_failaddr_t fa; trace_xfs_reset_dqcounts(bp, _RET_IP_); @@ -868,10 +864,13 @@ xfs_qm_reset_dqcounts( /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to - * find uninitialised dquot blks. See comment in xfs_dqcheck. + * find uninitialised dquot blks. See comment in + * xfs_dquot_verify. */ - xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, - "xfs_quotacheck"); + fa = xfs_dquot_verify(mp, ddq, id + j, type, 0); + if (fa) + xfs_dquot_repair(mp, ddq, id + j, type); + /* * Reset type in case we are reusing group quota file for * project quotas or vice versa @@ -1078,8 +1077,7 @@ xfs_qm_quotacheck_dqadjust( struct xfs_dquot *dqp; int error; - error = xfs_qm_dqget(mp, ip, id, type, - XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp); + error = xfs_qm_dqget(mp, ip, id, type, XFS_QMOPT_DQALLOC, &dqp); if (error) { /* * Shouldn't be able to turn off quotas here. @@ -1600,6 +1598,24 @@ error_rele: } STATIC void +xfs_qm_destroy_quotainos( + xfs_quotainfo_t *qi) +{ + if (qi->qi_uquotaip) { + IRELE(qi->qi_uquotaip); + qi->qi_uquotaip = NULL; /* paranoia */ + } + if (qi->qi_gquotaip) { + IRELE(qi->qi_gquotaip); + qi->qi_gquotaip = NULL; + } + if (qi->qi_pquotaip) { + IRELE(qi->qi_pquotaip); + qi->qi_pquotaip = NULL; + } +} + +STATIC void xfs_qm_dqfree_one( struct xfs_dquot *dqp) { @@ -1682,8 +1698,7 @@ xfs_qm_vop_dqalloc( xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, NULL, uid, XFS_DQ_USER, - XFS_QMOPT_DQALLOC | - XFS_QMOPT_DOWARN, + XFS_QMOPT_DQALLOC, &uq); if (error) { ASSERT(error != -ENOENT); @@ -1709,8 +1724,7 @@ xfs_qm_vop_dqalloc( xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, NULL, gid, XFS_DQ_GROUP, - XFS_QMOPT_DQALLOC | - XFS_QMOPT_DOWARN, + XFS_QMOPT_DQALLOC, &gq); if (error) { ASSERT(error != -ENOENT); @@ -1729,8 +1743,7 @@ xfs_qm_vop_dqalloc( xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, XFS_DQ_PROJ, - XFS_QMOPT_DQALLOC | - XFS_QMOPT_DOWARN, + XFS_QMOPT_DQALLOC, &pq); if (error) { ASSERT(error != -ENOENT); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 8f2e2fac4255..3a55d6fc271b 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -393,7 +393,8 @@ xfs_cud_init( int xfs_cui_recover( struct xfs_mount *mp, - struct xfs_cui_log_item *cuip) + struct xfs_cui_log_item *cuip, + struct xfs_defer_ops *dfops) { int i; int error = 0; @@ -405,11 +406,9 @@ xfs_cui_recover( struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; enum xfs_refcount_intent_type type; - xfs_fsblock_t firstfsb; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; struct xfs_bmbt_irec irec; - struct xfs_defer_ops dfops; bool requeue_only = false; ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); @@ -465,7 +464,6 @@ xfs_cui_recover( return error; cudp = xfs_trans_get_cud(tp, cuip); - xfs_defer_init(&dfops, &firstfsb); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { refc = &cuip->cui_format.cui_extents[i]; refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; @@ -485,7 +483,7 @@ xfs_cui_recover( new_len = refc->pe_len; } else error = xfs_trans_log_finish_refcount_update(tp, cudp, - &dfops, type, refc->pe_startblock, refc->pe_len, + dfops, type, refc->pe_startblock, refc->pe_len, &new_fsb, &new_len, &rcur); if (error) goto abort_error; @@ -497,21 +495,21 @@ xfs_cui_recover( switch (type) { case XFS_REFCOUNT_INCREASE: error = xfs_refcount_increase_extent( - tp->t_mountp, &dfops, &irec); + tp->t_mountp, dfops, &irec); break; case XFS_REFCOUNT_DECREASE: error = xfs_refcount_decrease_extent( - tp->t_mountp, &dfops, &irec); + tp->t_mountp, dfops, &irec); break; case XFS_REFCOUNT_ALLOC_COW: error = xfs_refcount_alloc_cow_extent( - tp->t_mountp, &dfops, + tp->t_mountp, dfops, irec.br_startblock, irec.br_blockcount); break; case XFS_REFCOUNT_FREE_COW: error = xfs_refcount_free_cow_extent( - tp->t_mountp, &dfops, + tp->t_mountp, dfops, irec.br_startblock, irec.br_blockcount); break; @@ -525,17 +523,12 @@ xfs_cui_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - error = xfs_defer_finish(&tp, &dfops); - if (error) - goto abort_defer; set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); error = xfs_trans_commit(tp); return error; abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); -abort_defer: - xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index 5b74dddfa64b..0e5327349a13 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -96,6 +96,7 @@ struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *, struct xfs_cui_log_item *); void xfs_cui_item_free(struct xfs_cui_log_item *); void xfs_cui_release(struct xfs_cui_log_item *); -int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip); +int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip, + struct xfs_defer_ops *dfops); #endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cc041a29eb70..270246943a06 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -49,8 +49,6 @@ #include "xfs_alloc.h" #include "xfs_quota_defs.h" #include "xfs_quota.h" -#include "xfs_btree.h" -#include "xfs_bmap_btree.h" #include "xfs_reflink.h" #include "xfs_iomap.h" #include "xfs_rmap_btree.h" @@ -456,6 +454,8 @@ retry: if (error) goto out_bmap_cancel; + xfs_inode_set_cowblocks_tag(ip); + /* Finish up. */ error = xfs_defer_finish(&tp, &dfops); if (error) @@ -464,6 +464,13 @@ retry: error = xfs_trans_commit(tp); if (error) return error; + + /* + * Allocation succeeded but the requested range was not even partially + * satisfied? Bail out! + */ + if (nimaps == 0) + return -ENOSPC; convert: return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb, &dfops); @@ -492,8 +499,9 @@ xfs_reflink_find_cow_mapping( struct xfs_iext_cursor icur; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); - ASSERT(xfs_is_reflink_inode(ip)); + if (!xfs_is_reflink_inode(ip)) + return false; offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got)) return false; @@ -598,10 +606,6 @@ xfs_reflink_cancel_cow_blocks( del.br_startblock, del.br_blockcount, NULL); - /* Update quota accounting */ - xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT, - -(long)del.br_blockcount); - /* Roll the transaction */ xfs_defer_ijoin(&dfops, ip); error = xfs_defer_finish(tpp, &dfops); @@ -612,6 +616,16 @@ xfs_reflink_cancel_cow_blocks( /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); + + /* Remove the quota reservation */ + error = xfs_trans_reserve_quota_nblks(NULL, ip, + -(long)del.br_blockcount, 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + break; + } else { + /* Didn't do anything, push cursor back. */ + xfs_iext_prev(ifp, &icur); } next_extent: if (!xfs_iext_get_extent(ifp, &icur, &got)) @@ -727,7 +741,7 @@ xfs_reflink_end_cow( (unsigned int)(end_fsb - offset_fsb), XFS_DATA_FORK); error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, - resblks, 0, 0, &tp); + resblks, 0, XFS_TRANS_RESERVE, &tp); if (error) goto out; @@ -791,6 +805,10 @@ xfs_reflink_end_cow( if (error) goto out_defer; + /* Charge this new data fork mapping to the on-disk quota. */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, + (long)del.br_blockcount); + /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); @@ -940,7 +958,7 @@ xfs_reflink_set_inode_flag( if (src->i_ino == dest->i_ino) xfs_ilock(src, XFS_ILOCK_EXCL); else - xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL); + xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL); if (!xfs_is_reflink_inode(src)) { trace_xfs_reflink_set_inode_flag(src); @@ -1198,13 +1216,16 @@ xfs_reflink_remap_blocks( /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ while (len) { + uint lock_mode; + trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, dest, destoff); + /* Read extent from the source file */ nimaps = 1; - xfs_ilock(src, XFS_ILOCK_EXCL); + lock_mode = xfs_ilock_data_map_shared(src); error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); - xfs_iunlock(src, XFS_ILOCK_EXCL); + xfs_iunlock(src, lock_mode); if (error) goto err; ASSERT(nimaps == 1); @@ -1241,6 +1262,50 @@ err: } /* + * Grab the exclusive iolock for a data copy from src to dest, making + * sure to abide vfs locking order (lowest pointer value goes first) and + * breaking the pnfs layout leases on dest before proceeding. The loop + * is needed because we cannot call the blocking break_layout() with the + * src iolock held, and therefore have to back out both locks. + */ +static int +xfs_iolock_two_inodes_and_break_layout( + struct inode *src, + struct inode *dest) +{ + int error; + +retry: + if (src < dest) { + inode_lock_shared(src); + inode_lock_nested(dest, I_MUTEX_NONDIR2); + } else { + /* src >= dest */ + inode_lock(dest); + } + + error = break_layout(dest, false); + if (error == -EWOULDBLOCK) { + inode_unlock(dest); + if (src < dest) + inode_unlock_shared(src); + error = break_layout(dest, true); + if (error) + return error; + goto retry; + } + if (error) { + inode_unlock(dest); + if (src < dest) + inode_unlock_shared(src); + return error; + } + if (src > dest) + inode_lock_shared_nested(src, I_MUTEX_NONDIR2); + return 0; +} + +/* * Link a range of blocks from one file to another. */ int @@ -1270,11 +1335,14 @@ xfs_reflink_remap_range( return -EIO; /* Lock both files against IO */ - lock_two_nondirectories(inode_in, inode_out); + ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out); + if (ret) + return ret; if (same_inode) xfs_ilock(src, XFS_MMAPLOCK_EXCL); else - xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); + xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest, + XFS_MMAPLOCK_EXCL); /* Check file eligibility and prepare for block sharing. */ ret = -EINVAL; @@ -1291,8 +1359,24 @@ xfs_reflink_remap_range( if (ret <= 0) goto out_unlock; + /* Attach dquots to dest inode before changing block map */ + ret = xfs_qm_dqattach(dest, 0); + if (ret) + goto out_unlock; + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + /* + * Clear out post-eof preallocations because we don't have page cache + * backing the delayed allocations and they'll never get freed on + * their own. + */ + if (xfs_can_free_eofblocks(dest, true)) { + ret = xfs_free_eofblocks(dest); + if (ret) + goto out_unlock; + } + /* Set flags and remap blocks. */ ret = xfs_reflink_set_inode_flag(src, dest); if (ret) @@ -1326,10 +1410,12 @@ xfs_reflink_remap_range( is_dedupe); out_unlock: - xfs_iunlock(src, XFS_MMAPLOCK_EXCL); + xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); + if (!same_inode) + xfs_iunlock(src, XFS_MMAPLOCK_SHARED); + inode_unlock(inode_out); if (!same_inode) - xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - unlock_two_nondirectories(inode_in, inode_out); + inode_unlock_shared(inode_in); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return ret; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 3f30f846d7f2..dfee3c991155 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -139,6 +139,9 @@ int xfs_rtalloc_query_all(struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); +int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, + bool *is_free); #else # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) @@ -148,6 +151,7 @@ bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); # define xfs_rtalloc_query_all(t,f,p) (ENOSYS) # define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) # define xfs_verify_rtbno(m, r) (false) +# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f663022353c0..f3e0001f9992 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -212,9 +212,9 @@ xfs_parseargs( */ if (sb_rdonly(sb)) mp->m_flags |= XFS_MOUNT_RDONLY; - if (sb->s_flags & MS_DIRSYNC) + if (sb->s_flags & SB_DIRSYNC) mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) mp->m_flags |= XFS_MOUNT_WSYNC; /* @@ -1153,6 +1153,14 @@ xfs_fs_statfs( ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) xfs_qm_statvfs(ip, statp); + + if (XFS_IS_REALTIME_MOUNT(mp) && + (ip->i_d.di_flags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) { + statp->f_blocks = sbp->sb_rblocks; + statp->f_bavail = statp->f_bfree = + sbp->sb_frextents * sbp->sb_rextsize; + } + return 0; } @@ -1312,7 +1320,7 @@ xfs_fs_remount( } /* ro -> rw */ - if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) { if (mp->m_flags & XFS_MOUNT_NORECOVERY) { xfs_warn(mp, "ro->rw transition prohibited on norecovery mount"); @@ -1360,6 +1368,7 @@ xfs_fs_remount( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } + xfs_queue_cowblocks(mp); /* Create the per-AG metadata reservation pool .*/ error = xfs_fs_reserve_ag_blocks(mp); @@ -1368,7 +1377,15 @@ xfs_fs_remount( } /* rw -> ro */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) { + /* Get rid of any leftover CoW reservations... */ + cancel_delayed_work_sync(&mp->m_cowblocks_work); + error = xfs_icache_free_cowblocks(mp, NULL); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + /* Free the per-AG metadata reservation pool. */ error = xfs_fs_unreserve_ag_blocks(mp); if (error) { @@ -1651,7 +1668,7 @@ xfs_fs_fill_super( } if (xfs_sb_version_hasreflink(&mp->m_sb)) xfs_alert(mp, - "DAX and reflink have not been tested together!"); + "DAX and reflink cannot be used together!"); } if (mp->m_flags & XFS_MOUNT_DISCARD) { @@ -1675,10 +1692,6 @@ xfs_fs_fill_super( "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!"); } - if (xfs_sb_version_hasreflink(&mp->m_sb)) - xfs_alert(mp, - "EXPERIMENTAL reflink feature enabled. Use at your own risk!"); - error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 5f2f32408011..fcc5dfc70aa0 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -30,7 +30,7 @@ extern void xfs_qm_exit(void); #ifdef CONFIG_XFS_POSIX_ACL # define XFS_ACL_STRING "ACLs, " -# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL) +# define set_posix_acl_flag(sb) ((sb)->s_flags |= SB_POSIXACL) #else # define XFS_ACL_STRING # define set_posix_acl_flag(sb) do { } while (0) diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 68d3ca2c4968..2e9e793a8f9d 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -232,11 +232,6 @@ xfs_symlink( resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp); - if (error == -ENOSPC && fs_blocks == 0) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0, - &tp); - } if (error) goto out_release_inode; @@ -260,14 +255,6 @@ xfs_symlink( goto out_trans_cancel; /* - * Check for ability to enter directory entry, if no space reserved. - */ - if (!resblks) { - error = xfs_dir_canenter(tp, dp, link_name); - if (error) - goto out_trans_cancel; - } - /* * Initialize the bmap freelist prior to calling either * bmapi or the directory create code. */ @@ -277,7 +264,7 @@ xfs_symlink( * Allocate an inode for the symlink. */ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, resblks > 0, &ip, NULL); + prid, &ip, NULL); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 5d95fe348294..35f3546b6af5 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -24,7 +24,6 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_da_format.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_da_btree.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d718a10c2271..945de08af7ba 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -72,7 +72,7 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class, __entry->flags = ctx->flags; ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " - "alist 0x%p size %u count %u firstu %u flags %d %s", + "alist %p size %u count %u firstu %u flags %d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->hashval, @@ -119,7 +119,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class, __entry->refcount = refcount; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno %u refcount %d caller %ps", + TP_printk("dev %d:%d agno %u refcount %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->refcount, @@ -200,7 +200,7 @@ TRACE_EVENT(xfs_attr_list_node_descend, __entry->bt_before = be32_to_cpu(btree->before); ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " - "alist 0x%p size %u count %u firstu %u flags %d %s " + "alist %p size %u count %u firstu %u flags %d %s " "node hashval %u, node before %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -251,8 +251,8 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __entry->bmap_state = state; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx state %s cur 0x%p/%d " - "offset %lld block %lld count %lld flag %d caller %ps", + TP_printk("dev %d:%d ino 0x%llx state %s cur %p/%d " + "offset %lld block %lld count %lld flag %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), @@ -301,7 +301,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " - "lock %d flags %s caller %ps", + "lock %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->nblks, @@ -370,7 +370,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " - "lock %d flags %s caller %ps", + "lock %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->buffer_length, @@ -390,7 +390,7 @@ DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); TRACE_EVENT(xfs_buf_ioerror, - TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip), + TP_PROTO(struct xfs_buf *bp, int error, xfs_failaddr_t caller_ip), TP_ARGS(bp, error, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) @@ -401,7 +401,7 @@ TRACE_EVENT(xfs_buf_ioerror, __field(int, pincount) __field(unsigned, lockval) __field(int, error) - __field(unsigned long, caller_ip) + __field(xfs_failaddr_t, caller_ip) ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; @@ -415,7 +415,7 @@ TRACE_EVENT(xfs_buf_ioerror, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " - "lock %d error %d flags %s caller %ps", + "lock %d error %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->buffer_length, @@ -460,7 +460,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " "lock %d flags %s recur %d refcount %d bliflags %s " - "lidesc 0x%p liflags %s", + "lidesc %p liflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->buf_bno, __entry->buf_len, @@ -579,7 +579,7 @@ DECLARE_EVENT_CLASS(xfs_lock_class, __entry->lock_flags = lock_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps", + TP_printk("dev %d:%d ino 0x%llx flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), @@ -697,7 +697,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, __entry->pincount = atomic_read(&ip->i_pincount); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps", + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, @@ -1028,7 +1028,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, __entry->flags = lip->li_flags; __entry->lsn = lip->li_lsn; ), - TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s", + TP_printk("dev %d:%d lip %p lsn %d/%d type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lip, CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), @@ -1049,7 +1049,7 @@ TRACE_EVENT(xfs_log_force, __entry->lsn = lsn; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d lsn 0x%llx caller %ps", + TP_printk("dev %d:%d lsn 0x%llx caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lsn, (void *)__entry->caller_ip) ) @@ -1082,7 +1082,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class, __entry->old_lsn = old_lsn; __entry->new_lsn = new_lsn; ), - TP_printk("dev %d:%d lip 0x%p old lsn %d/%d new lsn %d/%d type %s flags %s", + TP_printk("dev %d:%d lip %p old lsn %d/%d new lsn %d/%d type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lip, CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), @@ -1403,7 +1403,7 @@ TRACE_EVENT(xfs_bunmap, __entry->flags = flags; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" - "flags %s caller %ps", + "flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -1517,7 +1517,7 @@ TRACE_EVENT(xfs_agf, ), TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " "levels b %u c %u flfirst %u fllast %u flcount %u " - "freeblks %u longest %u caller %ps", + "freeblks %u longest %u caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __print_flags(__entry->flags, "|", XFS_AGF_FLAGS), @@ -2014,7 +2014,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class, __entry->count = item->ri_cnt; __entry->total = item->ri_total; ), - TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item 0x%p, " + TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item %p, " "item type %s item region count/total %d/%d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, @@ -2486,7 +2486,7 @@ DECLARE_EVENT_CLASS(xfs_ag_error_class, __entry->error = error; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno %u error %d caller %ps", + TP_printk("dev %d:%d agno %u error %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->error, @@ -2977,7 +2977,7 @@ DECLARE_EVENT_CLASS(xfs_inode_error_class, __entry->error = error; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino %llx error %d caller %ps", + TP_printk("dev %d:%d ino %llx error %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->error, @@ -3313,6 +3313,32 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping); +TRACE_EVENT(xfs_trans_resv_calc, + TP_PROTO(struct xfs_mount *mp, unsigned int type, + struct xfs_trans_res *res), + TP_ARGS(mp, type, res), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(uint, logres) + __field(int, logcount) + __field(int, logflags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->type = type; + __entry->logres = res->tr_logres; + __entry->logcount = res->tr_logcount; + __entry->logflags = res->tr_logflags; + ), + TP_printk("dev %d:%d type %d logres %u logcount %d flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->logres, + __entry->logcount, + __entry->logflags) +); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index a87f657f59c9..86f92df32c42 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -35,6 +35,27 @@ kmem_zone_t *xfs_trans_zone; kmem_zone_t *xfs_log_item_desc_zone; +#if defined(CONFIG_TRACEPOINTS) +static void +xfs_trans_trace_reservations( + struct xfs_mount *mp) +{ + struct xfs_trans_res resv; + struct xfs_trans_res *res; + struct xfs_trans_res *end_res; + int i; + + res = (struct xfs_trans_res *)M_RES(mp); + end_res = (struct xfs_trans_res *)(M_RES(mp) + 1); + for (i = 0; res < end_res; i++, res++) + trace_xfs_trans_resv_calc(mp, i, res); + xfs_log_get_max_trans_res(mp, &resv); + trace_xfs_trans_resv_calc(mp, -1, &resv); +} +#else +# define xfs_trans_trace_reservations(mp) +#endif + /* * Initialize the precomputed transaction reservation values * in the mount structure. @@ -44,6 +65,7 @@ xfs_trans_init( struct xfs_mount *mp) { xfs_trans_resv_calc(mp, M_RES(mp)); + xfs_trans_trace_reservations(mp); } /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 815b53d20e26..9d542dfe0052 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -50,7 +50,7 @@ typedef struct xfs_log_item { uint li_type; /* item type */ uint li_flags; /* misc flags */ struct xfs_buf *li_buf; /* real buffer pointer */ - struct xfs_log_item *li_bio_list; /* buffer item list */ + struct list_head li_bio_list; /* buffer item list */ void (*li_cb)(struct xfs_buf *, struct xfs_log_item *); /* buffer item iodone */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 3ba7a96a8abd..653ce379d36b 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -82,12 +82,12 @@ _xfs_trans_bjoin( ASSERT(bp->b_transp == NULL); /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If + * The xfs_buf_log_item pointer is stored in b_log_item. If * it doesn't have one yet, then allocate one and initialize it. * The checks to see if one is there are in xfs_buf_item_init(). */ xfs_buf_item_init(bp, tp->t_mountp); - bip = bp->b_fspriv; + bip = bp->b_log_item; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); @@ -118,7 +118,7 @@ xfs_trans_bjoin( struct xfs_buf *bp) { _xfs_trans_bjoin(tp, bp, 0); - trace_xfs_trans_bjoin(bp->b_fspriv); + trace_xfs_trans_bjoin(bp->b_log_item); } /* @@ -139,7 +139,7 @@ xfs_trans_get_buf_map( xfs_buf_flags_t flags) { xfs_buf_t *bp; - xfs_buf_log_item_t *bip; + struct xfs_buf_log_item *bip; if (!tp) return xfs_buf_get_map(target, map, nmaps, flags); @@ -159,7 +159,7 @@ xfs_trans_get_buf_map( } ASSERT(bp->b_transp == tp); - bip = bp->b_fspriv; + bip = bp->b_log_item; ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; @@ -175,7 +175,7 @@ xfs_trans_get_buf_map( ASSERT(!bp->b_error); _xfs_trans_bjoin(tp, bp, 1); - trace_xfs_trans_get_buf(bp->b_fspriv); + trace_xfs_trans_get_buf(bp->b_log_item); return bp; } @@ -188,12 +188,13 @@ xfs_trans_get_buf_map( * mount structure. */ xfs_buf_t * -xfs_trans_getsb(xfs_trans_t *tp, - struct xfs_mount *mp, - int flags) +xfs_trans_getsb( + xfs_trans_t *tp, + struct xfs_mount *mp, + int flags) { xfs_buf_t *bp; - xfs_buf_log_item_t *bip; + struct xfs_buf_log_item *bip; /* * Default to just trying to lock the superblock buffer @@ -210,7 +211,7 @@ xfs_trans_getsb(xfs_trans_t *tp, */ bp = mp->m_sb_bp; if (bp->b_transp == tp) { - bip = bp->b_fspriv; + bip = bp->b_log_item; ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; @@ -223,7 +224,7 @@ xfs_trans_getsb(xfs_trans_t *tp, return NULL; _xfs_trans_bjoin(tp, bp, 1); - trace_xfs_trans_getsb(bp->b_fspriv); + trace_xfs_trans_getsb(bp->b_log_item); return bp; } @@ -266,7 +267,7 @@ xfs_trans_read_buf_map( if (bp) { ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_transp == tp); - ASSERT(bp->b_fspriv != NULL); + ASSERT(bp->b_log_item != NULL); ASSERT(!bp->b_error); ASSERT(bp->b_flags & XBF_DONE); @@ -279,7 +280,7 @@ xfs_trans_read_buf_map( return -EIO; } - bip = bp->b_fspriv; + bip = bp->b_log_item; bip->bli_recur++; ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -329,7 +330,7 @@ xfs_trans_read_buf_map( if (tp) { _xfs_trans_bjoin(tp, bp, 1); - trace_xfs_trans_read_buf(bp->b_fspriv); + trace_xfs_trans_read_buf(bp->b_log_item); } *bpp = bp; return 0; @@ -352,10 +353,11 @@ xfs_trans_read_buf_map( * brelse() call. */ void -xfs_trans_brelse(xfs_trans_t *tp, - xfs_buf_t *bp) +xfs_trans_brelse( + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + struct xfs_buf_log_item *bip; int freed; /* @@ -368,7 +370,7 @@ xfs_trans_brelse(xfs_trans_t *tp, } ASSERT(bp->b_transp == tp); - bip = bp->b_fspriv; + bip = bp->b_log_item; ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); @@ -456,10 +458,11 @@ xfs_trans_brelse(xfs_trans_t *tp, */ /* ARGSUSED */ void -xfs_trans_bhold(xfs_trans_t *tp, - xfs_buf_t *bp) +xfs_trans_bhold( + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -476,10 +479,11 @@ xfs_trans_bhold(xfs_trans_t *tp, * for this transaction. */ void -xfs_trans_bhold_release(xfs_trans_t *tp, - xfs_buf_t *bp) +xfs_trans_bhold_release( + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -500,7 +504,7 @@ xfs_trans_dirty_buf( struct xfs_trans *tp, struct xfs_buf *bp) { - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -557,7 +561,7 @@ xfs_trans_log_buf( uint first, uint last) { - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); @@ -600,10 +604,10 @@ xfs_trans_log_buf( */ void xfs_trans_binval( - xfs_trans_t *tp, - xfs_buf_t *bp) + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; int i; ASSERT(bp->b_transp == tp); @@ -655,10 +659,10 @@ xfs_trans_binval( */ void xfs_trans_inode_buf( - xfs_trans_t *tp, - xfs_buf_t *bp) + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -679,10 +683,10 @@ xfs_trans_inode_buf( */ void xfs_trans_stale_inode_buf( - xfs_trans_t *tp, - xfs_buf_t *bp) + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -704,10 +708,10 @@ xfs_trans_stale_inode_buf( /* ARGSUSED */ void xfs_trans_inode_alloc_buf( - xfs_trans_t *tp, - xfs_buf_t *bp) + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -729,7 +733,7 @@ xfs_trans_ordered_buf( struct xfs_trans *tp, struct xfs_buf *bp) { - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -759,7 +763,7 @@ xfs_trans_buf_set_type( struct xfs_buf *bp, enum xfs_blft type) { - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; if (!tp) return; @@ -776,8 +780,8 @@ xfs_trans_buf_copy_type( struct xfs_buf *dst_bp, struct xfs_buf *src_bp) { - struct xfs_buf_log_item *sbip = src_bp->b_fspriv; - struct xfs_buf_log_item *dbip = dst_bp->b_fspriv; + struct xfs_buf_log_item *sbip = src_bp->b_log_item; + struct xfs_buf_log_item *dbip = dst_bp->b_log_item; enum xfs_blft type; type = xfs_blft_from_flags(&sbip->__bli_format); @@ -797,11 +801,11 @@ xfs_trans_buf_copy_type( /* ARGSUSED */ void xfs_trans_dquot_buf( - xfs_trans_t *tp, - xfs_buf_t *bp, - uint type) + xfs_trans_t *tp, + xfs_buf_t *bp, + uint type) { - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(type == XFS_BLF_UDQUOT_BUF || type == XFS_BLF_PDQUOT_BUF || diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index daa7615497f9..4a89da4b6fe7 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -28,6 +28,8 @@ #include "xfs_inode_item.h" #include "xfs_trace.h" +#include <linux/iversion.h> + /* * Add a locked inode to the transaction. * @@ -110,15 +112,17 @@ xfs_trans_log_inode( /* * First time we log the inode in a transaction, bump the inode change - * counter if it is configured for this to occur. We don't use - * inode_inc_version() because there is no need for extra locking around - * i_version as we already hold the inode locked exclusively for - * metadata modification. + * counter if it is configured for this to occur. While we have the + * inode locked exclusively for metadata modification, we can usually + * avoid setting XFS_ILOG_CORE if no one has queried the value since + * the last time it was incremented. If we have XFS_ILOG_CORE already + * set however, then go ahead and bump the i_version counter + * unconditionally. */ if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && IS_I_VERSION(VFS_I(ip))) { - VFS_I(ip)->i_version++; - flags |= XFS_ILOG_CORE; + if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE)) + flags |= XFS_ILOG_CORE; } tp->t_flags |= XFS_TRANS_DIRTY; |