Diffstat (limited to 'fs/xfs')
43 files changed, 1081 insertions, 576 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 9b373a0c7aaf..e9cc481b4ddf 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -984,7 +984,10 @@ xfs_ag_shrink_space( if (err2 != -ENOSPC) goto resv_err; - __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true); + err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, + XFS_AG_RESV_NONE, true); + if (err2) + goto resv_err; /* * Roll the transaction before trying to re-init the per-ag diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index fdfa08cbf4db..3069194527dd 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -628,6 +628,25 @@ xfs_alloc_fixup_trees( return 0; } +/* + * We do not verify the AGFL contents against AGF-based index counters here, + * even though we may have access to the perag that contains shadow copies. We + * don't know if the AGF-based counters have been checked, and if they have, + * they still may be inconsistent because they haven't yet been reset on the + * first allocation after the AGF has been read in. + * + * This means we can only check that all agfl entries contain valid or null + * values because we can't reliably determine the active range to exclude + * NULLAGBLOCK as a valid value. + * + * However, we can't even do that for v4 format filesystems because there are + * old versions of mkfs out there that do not initialise the AGFL to known, + * verifiable values. Hence we can't tell the difference between an AGFL block + * allocated by mkfs and a corrupted AGFL block here on v4 filesystems. + * + * As a result, we can only fully validate AGFL block numbers when we pull them + * from the freelist in xfs_alloc_get_freelist(). + */ static xfs_failaddr_t xfs_agfl_verify( struct xfs_buf *bp) @@ -637,12 +656,6 @@ xfs_agfl_verify( __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp); int i; - /* - * There is no verification of non-crc AGFLs because mkfs does not - * initialise the AGFL to zero or NULL. Hence the only valid part of the - * AGFL is what the AGF says is active. We can't get to the AGF, so we - * can't verify just those entries are valid. - */ if (!xfs_has_crc(mp)) return NULL; @@ -1523,7 +1536,8 @@ xfs_alloc_ag_vextent_lastblock( */ STATIC int xfs_alloc_ag_vextent_near( - struct xfs_alloc_arg *args) + struct xfs_alloc_arg *args, + uint32_t alloc_flags) { struct xfs_alloc_cur acur = {}; int error; /* error code */ @@ -1542,6 +1556,8 @@ xfs_alloc_ag_vextent_near( if (args->agbno > args->max_agbno) args->agbno = args->max_agbno; + /* Retry once quickly if we find busy extents before blocking. */ + alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH; restart: len = 0; @@ -1597,9 +1613,20 @@ restart: */ if (!acur.len) { if (acur.busy) { + /* + * Our only valid extents must have been busy. Flush and + * retry the allocation again. If we get an -EAGAIN + * error, we're being told that a deadlock was avoided + * and the current transaction needs committing before + * the allocation can be retried. + */ trace_xfs_alloc_near_busy(args); - xfs_extent_busy_flush(args->mp, args->pag, - acur.busy_gen); + error = xfs_extent_busy_flush(args->tp, args->pag, + acur.busy_gen, alloc_flags); + if (error) + goto out; + + alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; goto restart; } trace_xfs_alloc_size_neither(args); @@ -1622,22 +1649,25 @@ out: * and of the form k * prod + mod unless there's nothing that large. * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
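The near/size allocation hunks in this patch all follow one retry protocol. The sketch below restates it outside the kernel tree for clarity; try_ag_allocation() is a hypothetical stand-in for the real cursor-based search, not a kernel function:

static int
alloc_with_busy_retry(
	struct xfs_alloc_arg	*args,
	uint32_t		alloc_flags)
{
	unsigned		busy_gen;
	bool			busy;
	int			error;

	/* The first flush attempt must not block on busy extents. */
	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
restart:
	error = try_ag_allocation(args, &busy, &busy_gen);	/* hypothetical */
	if (error != -ENOSPC || !busy)
		return error;

	/*
	 * All candidate extents were busy. -EAGAIN from the flush means
	 * the caller must commit the current transaction before retrying.
	 */
	error = xfs_extent_busy_flush(args->tp, args->pag, busy_gen,
			alloc_flags);
	if (error)
		return error;

	/* Second and later flushes are allowed to block. */
	alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
	goto restart;
}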
*/ -STATIC int /* error */ +static int xfs_alloc_ag_vextent_size( - xfs_alloc_arg_t *args) /* allocation argument structure */ + struct xfs_alloc_arg *args, + uint32_t alloc_flags) { - struct xfs_agf *agf = args->agbp->b_addr; - struct xfs_btree_cur *bno_cur; /* cursor for bno btree */ - struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */ - int error; /* error result */ - xfs_agblock_t fbno; /* start of found freespace */ - xfs_extlen_t flen; /* length of found freespace */ - int i; /* temp status variable */ - xfs_agblock_t rbno; /* returned block number */ - xfs_extlen_t rlen; /* length of returned extent */ - bool busy; - unsigned busy_gen; + struct xfs_agf *agf = args->agbp->b_addr; + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + xfs_agblock_t fbno; /* start of found freespace */ + xfs_extlen_t flen; /* length of found freespace */ + xfs_agblock_t rbno; /* returned block number */ + xfs_extlen_t rlen; /* length of returned extent */ + bool busy; + unsigned busy_gen; + int error; + int i; + /* Retry once quickly if we find busy extents before blocking. */ + alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH; restart: /* * Allocate and initialize a cursor for the by-size btree. @@ -1695,19 +1725,25 @@ restart: error = xfs_btree_increment(cnt_cur, 0, &i); if (error) goto error0; - if (i == 0) { - /* - * Our only valid extents must have been busy. - * Make it unbusy by forcing the log out and - * retrying. - */ - xfs_btree_del_cursor(cnt_cur, - XFS_BTREE_NOERROR); - trace_xfs_alloc_size_busy(args); - xfs_extent_busy_flush(args->mp, - args->pag, busy_gen); - goto restart; - } + if (i) + continue; + + /* + * Our only valid extents must have been busy. Flush and + * retry the allocation again. If we get an -EAGAIN + * error, we're being told that a deadlock was avoided + * and the current transaction needs committing before + * the allocation can be retried. + */ + trace_xfs_alloc_size_busy(args); + error = xfs_extent_busy_flush(args->tp, args->pag, + busy_gen, alloc_flags); + if (error) + goto error0; + + alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + goto restart; } } @@ -1787,9 +1823,21 @@ restart: args->len = rlen; if (rlen < args->minlen) { if (busy) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + /* + * Our only valid extents must have been busy. Flush and + * retry the allocation again. If we get an -EAGAIN + * error, we're being told that a deadlock was avoided + * and the current transaction needs committing before + * the allocation can be retried. + */ trace_xfs_alloc_size_busy(args); - xfs_extent_busy_flush(args->mp, args->pag, busy_gen); + error = xfs_extent_busy_flush(args->tp, args->pag, + busy_gen, alloc_flags); + if (error) + goto error0; + + alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); goto restart; } goto out_nominleft; @@ -2321,12 +2369,16 @@ xfs_free_agfl_block( } /* - * Check the agfl fields of the agf for inconsistency or corruption. The purpose - * is to detect an agfl header padding mismatch between current and early v5 - * kernels. This problem manifests as a 1-slot size difference between the - * on-disk flcount and the active [first, last] range of a wrapped agfl. This - * may also catch variants of agfl count corruption unrelated to padding. Either - * way, we'll reset the agfl and warn the user. + * Check the agfl fields of the agf for inconsistency or corruption. 
+ * + * The original purpose was to detect an agfl header padding mismatch between + * current and early v5 kernels. This problem manifests as a 1-slot size + * difference between the on-disk flcount and the active [first, last] range of + * a wrapped agfl. + * + * However, we need to use these same checks to catch agfl count corruptions + * unrelated to padding. This could occur on any v4 or v5 filesystem, so either + * way, we need to reset the agfl and warn the user. * * Return true if a reset is required before the agfl can be used, false * otherwise. @@ -2342,10 +2394,6 @@ xfs_agfl_needs_reset( int agfl_size = xfs_agfl_size(mp); int active; - /* no agfl header on v4 supers */ - if (!xfs_has_crc(mp)) - return false; - /* * The agf read verifier catches severe corruption of these fields. * Repeat some sanity checks to cover a packed -> unpacked mismatch if @@ -2418,41 +2466,48 @@ xfs_agfl_reset( * the real allocation can proceed. Deferring the free disconnects freeing up * the AGFL slot from freeing the block. */ -STATIC void +static int xfs_defer_agfl_block( struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_fsblock_t agbno, + xfs_agblock_t agbno, struct xfs_owner_info *oinfo) { struct xfs_mount *mp = tp->t_mountp; struct xfs_extent_free_item *xefi; + xfs_fsblock_t fsbno = XFS_AGB_TO_FSB(mp, agno, agbno); ASSERT(xfs_extfree_item_cache != NULL); ASSERT(oinfo != NULL); + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno))) + return -EFSCORRUPTED; + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); - xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); + xefi->xefi_startblock = fsbno; xefi->xefi_blockcount = 1; xefi->xefi_owner = oinfo->oi_owner; + xefi->xefi_agresv = XFS_AG_RESV_AGFL; trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); xfs_extent_free_get_group(mp, xefi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); + return 0; } /* * Add the extent to the list of extents to be freed at transaction end. * The list is maintained sorted (by block number). */ -void +int __xfs_free_extent_later( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type, bool skip_discard) { struct xfs_extent_free_item *xefi; @@ -2473,11 +2528,16 @@ __xfs_free_extent_later( ASSERT(agbno + len <= mp->m_sb.sb_agblocks); #endif ASSERT(xfs_extfree_item_cache != NULL); + ASSERT(type != XFS_AG_RESV_AGFL); + + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) + return -EFSCORRUPTED; xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); xefi->xefi_startblock = bno; xefi->xefi_blockcount = (xfs_extlen_t)len; + xefi->xefi_agresv = type; if (skip_discard) xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; if (oinfo) { @@ -2497,6 +2557,7 @@ __xfs_free_extent_later( xfs_extent_free_get_group(mp, xefi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); + return 0; } #ifdef DEBUG @@ -2547,7 +2608,7 @@ out: int /* error */ xfs_alloc_fix_freelist( struct xfs_alloc_arg *args, /* allocation argument structure */ - int flags) /* XFS_ALLOC_FLAG_... */ + uint32_t alloc_flags) { struct xfs_mount *mp = args->mp; struct xfs_perag *pag = args->pag; @@ -2563,7 +2624,7 @@ xfs_alloc_fix_freelist( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); if (!xfs_perag_initialised_agf(pag)) { - error = xfs_alloc_read_agf(pag, tp, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG.
*/ if (error == -EAGAIN) @@ -2579,13 +2640,13 @@ xfs_alloc_fix_freelist( */ if (xfs_perag_prefers_metadata(pag) && (args->datatype & XFS_ALLOC_USERDATA) && - (flags & XFS_ALLOC_FLAG_TRYLOCK)) { - ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) { + ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING)); goto out_agbp_relse; } need = xfs_alloc_min_freelist(mp, pag); - if (!xfs_alloc_space_available(args, need, flags | + if (!xfs_alloc_space_available(args, need, alloc_flags | XFS_ALLOC_FLAG_CHECK)) goto out_agbp_relse; @@ -2594,7 +2655,7 @@ xfs_alloc_fix_freelist( * Can fail if we're not blocking on locks, and it's held. */ if (!agbp) { - error = xfs_alloc_read_agf(pag, tp, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ if (error == -EAGAIN) @@ -2609,7 +2670,7 @@ xfs_alloc_fix_freelist( /* If there isn't enough total space or single-extent, reject it. */ need = xfs_alloc_min_freelist(mp, pag); - if (!xfs_alloc_space_available(args, need, flags)) + if (!xfs_alloc_space_available(args, need, alloc_flags)) goto out_agbp_relse; #ifdef DEBUG @@ -2647,17 +2708,20 @@ xfs_alloc_fix_freelist( */ memset(&targs, 0, sizeof(targs)); /* struct copy below */ - if (flags & XFS_ALLOC_FLAG_NORMAP) + if (alloc_flags & XFS_ALLOC_FLAG_NORMAP) targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; else targs.oinfo = XFS_RMAP_OINFO_AG; - while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { + while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) && + pag->pagf_flcount > need) { error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); if (error) goto out_agbp_relse; /* defer agfl frees */ - xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); + error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); + if (error) + goto out_agbp_relse; } targs.tp = tp; @@ -2677,7 +2741,7 @@ xfs_alloc_fix_freelist( targs.resv = XFS_AG_RESV_AGFL; /* Allocate as many blocks as possible at once. */ - error = xfs_alloc_ag_vextent_size(&targs); + error = xfs_alloc_ag_vextent_size(&targs, alloc_flags); if (error) goto out_agflbp_relse; @@ -2687,7 +2751,7 @@ xfs_alloc_fix_freelist( * on a completely full ag. */ if (targs.agbno == NULLAGBLOCK) { - if (flags & XFS_ALLOC_FLAG_FREEING) + if (alloc_flags & XFS_ALLOC_FLAG_FREEING) break; goto out_agflbp_relse; } @@ -2767,6 +2831,9 @@ xfs_alloc_get_freelist( */ agfl_bno = xfs_buf_to_agfl_bno(agflbp); bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno))) + return -EFSCORRUPTED; + be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) @@ -2889,12 +2956,69 @@ xfs_alloc_put_freelist( return 0; } +/* + * Check that this AGF/AGI header's sequence number and length matches the AG + * number and size in fsblocks. + */ +xfs_failaddr_t +xfs_validate_ag_length( + struct xfs_buf *bp, + uint32_t seqno, + uint32_t length) +{ + struct xfs_mount *mp = bp->b_mount; + /* + * During growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag && seqno != bp->b_pag->pag_agno) + return __this_address; + + /* + * Only the last AG in the filesystem is allowed to be shorter + * than the AG size recorded in the superblock. 
*/ + if (length != mp->m_sb.sb_agblocks) { + /* + * During growfs, the new last AG can get here before we + * have updated the superblock. Give it a pass on the seqno + * check. + */ + if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1) + return __this_address; + if (length < XFS_MIN_AG_BLOCKS) + return __this_address; + if (length > mp->m_sb.sb_agblocks) + return __this_address; + } + + return NULL; +} + +/* + * Verify the AGF is consistent. + * + * We do not verify the AGFL indexes in the AGF are fully consistent here + * because of issues with variable on-disk structure sizes. Instead, we check + * the agfl indexes for consistency when we initialise the perag from the AGF + * information after a read completes. + * + * If the index is inconsistent, then we mark the perag as needing an AGFL + * reset. The first AGFL update performed then resets the AGFL indexes and + * refills the AGFL with known good free blocks, allowing the filesystem to + * continue operating normally at the cost of a few leaked free space blocks. + */ static xfs_failaddr_t xfs_agf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_agf *agf = bp->b_addr; + xfs_failaddr_t fa; + uint32_t agf_seqno = be32_to_cpu(agf->agf_seqno); + uint32_t agf_length = be32_to_cpu(agf->agf_length); if (xfs_has_crc(mp)) { if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) @@ -2906,18 +3030,26 @@ xfs_agf_verify( if (!xfs_verify_magic(bp, agf->agf_magicnum)) return __this_address; - if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && - be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && - be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp))) + if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum))) return __this_address; - if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks) + /* + * Both agf_seqno and agf_length need to be validated before anything + * else block-number related in the AGF or AGFL can be checked. + */ + fa = xfs_validate_ag_length(bp, agf_seqno, agf_length); + if (fa) + return fa; + + if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp)) + return __this_address; + if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp)) + return __this_address; + if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp)) return __this_address; if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || - be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length)) + be32_to_cpu(agf->agf_freeblks) > agf_length) return __this_address; if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || @@ -2928,41 +3060,30 @@ xfs_agf_verify( mp->m_alloc_maxlevels) return __this_address; - if (xfs_has_rmapbt(mp) && - (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > - mp->m_rmap_maxlevels)) - return __this_address; - - if (xfs_has_rmapbt(mp) && - be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length)) + if (xfs_has_lazysbcount(mp) && + be32_to_cpu(agf->agf_btreeblks) > agf_length) return __this_address; - /* - * during growfs operations, the perag is not fully initialised, - * so we can't use it for any useful checking. growfs ensures we can't - * use it by using uncached buffers that don't have the perag attached - * so we can detect and avoid this problem.
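To make the shared helper's rules concrete: every AG except the last must be exactly sb_agblocks long, and the last AG may be shorter but never below XFS_MIN_AG_BLOCKS. A standalone restatement of that invariant, with the perag/growfs special casing from the patch deliberately omitted:

/*
 * Illustrative restatement of xfs_validate_ag_length()'s core invariant
 * (the growfs and uncached-buffer caveats above are ignored here).
 */
static bool
ag_length_plausible(uint32_t seqno, uint32_t length,
		uint32_t sb_agblocks, uint32_t sb_agcount,
		uint32_t min_ag_blocks)
{
	if (length == sb_agblocks)
		return true;		/* a full-size AG is always valid */

	/* Only the last AG may be short, and not arbitrarily short. */
	return seqno == sb_agcount - 1 &&
	       length >= min_ag_blocks && length < sb_agblocks;
}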
- */ - if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) - return __this_address; + if (xfs_has_rmapbt(mp)) { + if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length) + return __this_address; - if (xfs_has_lazysbcount(mp) && - be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) - return __this_address; + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > + mp->m_rmap_maxlevels) + return __this_address; + } - if (xfs_has_reflink(mp) && - be32_to_cpu(agf->agf_refcount_blocks) > - be32_to_cpu(agf->agf_length)) - return __this_address; + if (xfs_has_reflink(mp)) { + if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length) + return __this_address; - if (xfs_has_reflink(mp) && - (be32_to_cpu(agf->agf_refcount_level) < 1 || - be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)) - return __this_address; + if (be32_to_cpu(agf->agf_refcount_level) < 1 || + be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels) + return __this_address; + } return NULL; - } static void @@ -3187,7 +3308,8 @@ xfs_alloc_vextent_check_args( */ static int xfs_alloc_vextent_prepare_ag( - struct xfs_alloc_arg *args) + struct xfs_alloc_arg *args, + uint32_t alloc_flags) { bool need_pag = !args->pag; int error; @@ -3196,7 +3318,7 @@ xfs_alloc_vextent_prepare_ag( args->pag = xfs_perag_get(args->mp, args->agno); args->agbp = NULL; - error = xfs_alloc_fix_freelist(args, 0); + error = xfs_alloc_fix_freelist(args, alloc_flags); if (error) { trace_xfs_alloc_vextent_nofix(args); if (need_pag) @@ -3318,6 +3440,7 @@ xfs_alloc_vextent_this_ag( { struct xfs_mount *mp = args->mp; xfs_agnumber_t minimum_agno; + uint32_t alloc_flags = 0; int error; ASSERT(args->pag != NULL); @@ -3336,9 +3459,9 @@ xfs_alloc_vextent_this_ag( return error; } - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); if (!error && args->agbp) - error = xfs_alloc_ag_vextent_size(args); + error = xfs_alloc_ag_vextent_size(args, alloc_flags); return xfs_alloc_vextent_finish(args, minimum_agno, error, false); } @@ -3367,20 +3490,20 @@ xfs_alloc_vextent_iterate_ags( xfs_agnumber_t minimum_agno, xfs_agnumber_t start_agno, xfs_agblock_t target_agbno, - uint32_t flags) + uint32_t alloc_flags) { struct xfs_mount *mp = args->mp; xfs_agnumber_t restart_agno = minimum_agno; xfs_agnumber_t agno; int error = 0; - if (flags & XFS_ALLOC_FLAG_TRYLOCK) + if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) restart_agno = 0; restart: for_each_perag_wrap_range(mp, start_agno, restart_agno, mp->m_sb.sb_agcount, agno, args->pag) { args->agno = agno; - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); if (error) break; if (!args->agbp) { @@ -3394,10 +3517,10 @@ restart: */ if (args->agno == start_agno && target_agbno) { args->agbno = target_agbno; - error = xfs_alloc_ag_vextent_near(args); + error = xfs_alloc_ag_vextent_near(args, alloc_flags); } else { args->agbno = 0; - error = xfs_alloc_ag_vextent_size(args); + error = xfs_alloc_ag_vextent_size(args, alloc_flags); } break; } @@ -3414,8 +3537,8 @@ restart: * constraining flags by the caller, drop them and retry the allocation * without any constraints being set. 
*/ - if (flags) { - flags = 0; + if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) { + alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK; restart_agno = minimum_agno; goto restart; } @@ -3443,6 +3566,7 @@ xfs_alloc_vextent_start_ag( xfs_agnumber_t start_agno; xfs_agnumber_t rotorstep = xfs_rotorstep; bool bump_rotor = false; + uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; int error; ASSERT(args->pag == NULL); @@ -3469,7 +3593,7 @@ xfs_alloc_vextent_start_ag( start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, - XFS_FSB_TO_AGBNO(mp, target), XFS_ALLOC_FLAG_TRYLOCK); + XFS_FSB_TO_AGBNO(mp, target), alloc_flags); if (bump_rotor) { if (args->agno == start_agno) @@ -3496,6 +3620,7 @@ xfs_alloc_vextent_first_ag( struct xfs_mount *mp = args->mp; xfs_agnumber_t minimum_agno; xfs_agnumber_t start_agno; + uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; int error; ASSERT(args->pag == NULL); @@ -3514,7 +3639,7 @@ xfs_alloc_vextent_first_ag( start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, - XFS_FSB_TO_AGBNO(mp, target), 0); + XFS_FSB_TO_AGBNO(mp, target), alloc_flags); return xfs_alloc_vextent_finish(args, minimum_agno, error, true); } @@ -3546,7 +3671,7 @@ xfs_alloc_vextent_exact_bno( return error; } - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, 0); if (!error && args->agbp) error = xfs_alloc_ag_vextent_exact(args); @@ -3567,6 +3692,7 @@ xfs_alloc_vextent_near_bno( struct xfs_mount *mp = args->mp; xfs_agnumber_t minimum_agno; bool needs_perag = args->pag == NULL; + uint32_t alloc_flags = 0; int error; if (!needs_perag) @@ -3587,9 +3713,9 @@ xfs_alloc_vextent_near_bno( if (needs_perag) args->pag = xfs_perag_grab(mp, args->agno); - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); if (!error && args->agbp) - error = xfs_alloc_ag_vextent_near(args); + error = xfs_alloc_ag_vextent_near(args, alloc_flags); return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag); } @@ -3717,15 +3843,11 @@ xfs_alloc_query_range( xfs_alloc_query_range_fn fn, void *priv) { - union xfs_btree_irec low_brec; - union xfs_btree_irec high_brec; - struct xfs_alloc_query_range_info query; + union xfs_btree_irec low_brec = { .a = *low_rec }; + union xfs_btree_irec high_brec = { .a = *high_rec }; + struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn }; ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); - low_brec.a = *low_rec; - high_brec.a = *high_rec; - query.priv = priv; - query.fn = fn; return xfs_btree_query_range(cur, &low_brec, &high_brec, xfs_alloc_query_range_helper, &query); } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 5dbb25546d0b..6bb8d295c321 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -19,11 +19,12 @@ unsigned int xfs_agfl_size(struct xfs_mount *mp); /* * Flags for xfs_alloc_fix_freelist. 
*/ -#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ -#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ -#define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */ -#define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */ -#define XFS_ALLOC_FLAG_CHECK 0x00000010 /* test only, don't modify args */ +#define XFS_ALLOC_FLAG_TRYLOCK (1U << 0) /* use trylock for buffer locking */ +#define XFS_ALLOC_FLAG_FREEING (1U << 1) /* indicate caller is freeing extents*/ +#define XFS_ALLOC_FLAG_NORMAP (1U << 2) /* don't modify the rmapbt */ +#define XFS_ALLOC_FLAG_NOSHRINK (1U << 3) /* don't shrink the freelist */ +#define XFS_ALLOC_FLAG_CHECK (1U << 4) /* test only, don't modify args */ +#define XFS_ALLOC_FLAG_TRYFLUSH (1U << 5) /* don't wait in busy extent flush */ /* * Argument structure for xfs_alloc routines. @@ -195,7 +196,7 @@ int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf **bpp); int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, struct xfs_buf *, struct xfs_owner_info *); -int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); +int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, uint32_t alloc_flags); int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_buf **agbp); @@ -230,9 +231,9 @@ xfs_buf_to_agfl_bno( return bp->b_addr; } -void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, +int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, - bool skip_discard); + enum xfs_ag_resv_type type, bool skip_discard); /* * List of extents to be free "later". @@ -245,6 +246,7 @@ struct xfs_extent_free_item { xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ struct xfs_perag *xefi_pag; unsigned int xefi_flags; + enum xfs_ag_resv_type xefi_agresv; }; void xfs_extent_free_get_group(struct xfs_mount *mp, @@ -254,14 +256,15 @@ void xfs_extent_free_get_group(struct xfs_mount *mp, #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ -static inline void +static inline int xfs_free_extent_later( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, - const struct xfs_owner_info *oinfo) + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) { - __xfs_free_extent_later(tp, bno, len, oinfo, false); + return __xfs_free_extent_later(tp, bno, len, oinfo, type, false); } @@ -270,4 +273,7 @@ extern struct kmem_cache *xfs_extfree_item_cache; int __init xfs_extfree_intent_init_cache(void); void xfs_extfree_intent_destroy_cache(void); +xfs_failaddr_t xfs_validate_ag_length(struct xfs_buf *bp, uint32_t seqno, + uint32_t length); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index beee51ad75ce..2580ae47209a 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2293,8 +2293,6 @@ xfs_attr3_leaf_unbalance( trace_xfs_attr_leaf_unbalance(state->args); - drop_leaf = drop_blk->bp->b_addr; - save_leaf = save_blk->bp->b_addr; xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf); xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf); entry = xfs_attr3_leaf_entryp(drop_leaf); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index cd8870a16fd1..30c931b38853 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ 
b/fs/xfs/libxfs/xfs_bmap.c @@ -572,8 +572,13 @@ xfs_bmap_btree_to_extents( cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) return error; + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); - xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo); + error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, + XFS_AG_RESV_NONE); + if (error) + return error; + ip->i_nblocks--; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(tp, cbp); @@ -5230,10 +5235,13 @@ xfs_bmap_del_extent_real( if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); } else { - __xfs_free_extent_later(tp, del->br_startblock, + error = __xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, - (bflags & XFS_BMAPI_NODISCARD) || - del->br_state == XFS_EXT_UNWRITTEN); + XFS_AG_RESV_NONE, + ((bflags & XFS_BMAPI_NODISCARD) || + del->br_state == XFS_EXT_UNWRITTEN)); + if (error) + goto done; } } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 1b40e5f8b1ec..bf3f1b36fdd2 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -268,11 +268,15 @@ xfs_bmbt_free_block( struct xfs_trans *tp = cur->bc_tp; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); struct xfs_owner_info oinfo; + int error; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); - xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo); - ip->i_nblocks--; + error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo, + XFS_AG_RESV_NONE); + if (error) + return error; + ip->i_nblocks--; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); return 0; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index a2aa36b23e25..4d68a58be160 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -301,7 +301,7 @@ struct xfs_btree_cur static inline size_t xfs_btree_cur_sizeof(unsigned int nlevels) { - return struct_size((struct xfs_btree_cur *)NULL, bc_levels, nlevels); + return struct_size_t(struct xfs_btree_cur, bc_levels, nlevels); } /* cursor flags */ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 1cfd5bc6520a..9c60ebb328b4 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -257,6 +257,8 @@ typedef struct xfs_fsop_resblks { #define XFS_MAX_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_BLOCKSIZE) #define XFS_MAX_CRC_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_CRC_BLOCKSIZE) +#define XFS_MAX_AGNUMBER ((xfs_agnumber_t)(NULLAGNUMBER - 1)) + /* keep the maximum size under 2^31 by a small amount */ #define XFS_MAX_LOG_BYTES \ ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index a16d5de16933..b83e54c70906 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1834,7 +1834,7 @@ retry: * might be sparse and only free the regions that are allocated as part of the * chunk. 
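Because xfs_free_extent_later() and __xfs_free_extent_later() now return an error when xfs_verify_fsbext() rejects the range, every conversion in this patch takes the same shape. A minimal sketch of the new calling convention (tp, bno, len and oinfo are assumed to be set up by the surrounding transaction code):

static int
free_one_extent(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len,
		const struct xfs_owner_info *oinfo)
{
	int	error;

	/* The deferred free can now fail instead of queueing a bad range. */
	error = xfs_free_extent_later(tp, bno, len, oinfo, XFS_AG_RESV_NONE);
	if (error)
		return error;	/* -EFSCORRUPTED for an invalid extent */
	return 0;
}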
*/ -STATIC void +static int xfs_difree_inode_chunk( struct xfs_trans *tp, xfs_agnumber_t agno, @@ -1851,10 +1851,10 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), - M_IGEO(mp)->ialloc_blks, - &XFS_RMAP_OINFO_INODES); - return; + return xfs_free_extent_later(tp, + XFS_AGB_TO_FSB(mp, agno, sagbno), + M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, + XFS_AG_RESV_NONE); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1871,6 +1871,8 @@ xfs_difree_inode_chunk( XFS_INOBT_HOLEMASK_BITS); nextbit = startidx + 1; while (startidx < XFS_INOBT_HOLEMASK_BITS) { + int error; + nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS, nextbit); /* @@ -1896,8 +1898,11 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), - contigblk, &XFS_RMAP_OINFO_INODES); + error = xfs_free_extent_later(tp, + XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, + &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE); + if (error) + return error; /* reset range to current bit and carry on... */ startidx = endidx = nextbit; @@ -1905,6 +1910,7 @@ xfs_difree_inode_chunk( next: nextbit++; } + return 0; } STATIC int @@ -2003,7 +2009,9 @@ xfs_difree_inobt( goto error0; } - xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); + error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); + if (error) + goto error0; } else { xic->deleted = false; @@ -2478,11 +2486,14 @@ xfs_ialloc_log_agi( static xfs_failaddr_t xfs_agi_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_mount; - struct xfs_agi *agi = bp->b_addr; - int i; + struct xfs_mount *mp = bp->b_mount; + struct xfs_agi *agi = bp->b_addr; + xfs_failaddr_t fa; + uint32_t agi_seqno = be32_to_cpu(agi->agi_seqno); + uint32_t agi_length = be32_to_cpu(agi->agi_length); + int i; if (xfs_has_crc(mp)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) @@ -2499,6 +2510,10 @@ xfs_agi_verify( if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return __this_address; + fa = xfs_validate_ag_length(bp, agi_seqno, agi_length); + if (fa) + return fa; + if (be32_to_cpu(agi->agi_level) < 1 || be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels) return __this_address; @@ -2508,15 +2523,6 @@ xfs_agi_verify( be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels)) return __this_address; - /* - * during growfs operations, the perag is not fully initialised, - * so we can't use it for any useful checking. growfs ensures we can't - * use it by using uncached buffers that don't have the perag attached - * so we can detect and avoid this problem. 
- */ - if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) - return __this_address; - for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO)) continue; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 5a945ae21b5d..9258f01c0015 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -160,8 +160,7 @@ __xfs_inobt_free_block( xfs_inobt_mod_blockcount(cur, -1); fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - return xfs_free_extent(cur->bc_tp, cur->bc_ag.pag, - XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1, + return xfs_free_extent_later(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_INOBT, resv); } diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index f13e0809dc63..269573c82808 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -324,7 +324,6 @@ struct xfs_inode_log_format_32 { #define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */ #define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */ - /* * The timestamps are dirty, but not necessarily anything else in the inode * core. Unlike the other fields above this one must never make it to disk @@ -333,6 +332,14 @@ struct xfs_inode_log_format_32 { */ #define XFS_ILOG_TIMESTAMP 0x4000 +/* + * The version field has been changed, but not necessarily anything else of + * interest. This must never make it to disk - it is used purely to ensure that + * the inode item ->precommit operation can update the fsync flag triggers + * in the inode item correctly. + */ +#define XFS_ILOG_IVERSION 0x8000 + #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index c1c65774dcc2..646b3fa362ad 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1151,8 +1151,11 @@ xfs_refcount_adjust_extents( fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, tmp.rc_startblock); - xfs_free_extent_later(cur->bc_tp, fsbno, - tmp.rc_blockcount, NULL); + error = xfs_free_extent_later(cur->bc_tp, fsbno, + tmp.rc_blockcount, NULL, + XFS_AG_RESV_NONE); + if (error) + goto out_error; } (*agbno) += tmp.rc_blockcount; @@ -1210,8 +1213,11 @@ xfs_refcount_adjust_extents( fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, ext.rc_startblock); - xfs_free_extent_later(cur->bc_tp, fsbno, - ext.rc_blockcount, NULL); + error = xfs_free_extent_later(cur->bc_tp, fsbno, + ext.rc_blockcount, NULL, + XFS_AG_RESV_NONE); + if (error) + goto out_error; } skip: @@ -1915,8 +1921,13 @@ xfs_refcount_recover_cow_leftovers( struct xfs_buf *agbp; struct xfs_refcount_recovery *rr, *n; struct list_head debris; - union xfs_btree_irec low; - union xfs_btree_irec high; + union xfs_btree_irec low = { + .rc.rc_domain = XFS_REFC_DOMAIN_COW, + }; + union xfs_btree_irec high = { + .rc.rc_domain = XFS_REFC_DOMAIN_COW, + .rc.rc_startblock = -1U, + }; xfs_fsblock_t fsb; int error; @@ -1947,10 +1958,6 @@ xfs_refcount_recover_cow_leftovers( cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); /* Find all the leftover CoW staging extents. 
*/ - memset(&low, 0, sizeof(low)); - memset(&high, 0, sizeof(high)); - low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW; - high.rc.rc_startblock = -1U; error = xfs_btree_query_range(cur, &low, &high, xfs_refcount_recover_extent, &debris); xfs_btree_del_cursor(cur, error); @@ -1976,7 +1983,11 @@ xfs_refcount_recover_cow_leftovers( rr->rr_rrec.rc_blockcount); /* Free the block. */ - xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); + error = xfs_free_extent_later(tp, fsb, + rr->rr_rrec.rc_blockcount, NULL, + XFS_AG_RESV_NONE); + if (error) + goto out_trans; error = xfs_trans_commit(tp); if (error) diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index d4afc5f4e6a5..5c3987d8dc24 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -106,19 +106,13 @@ xfs_refcountbt_free_block( struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); - int error; trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); - error = xfs_free_extent(cur->bc_tp, cur->bc_ag.pag, - XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1, + return xfs_free_extent_later(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); - if (error) - return error; - - return error; } STATIC int diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index f4dc23b3b837..fbb0b2637463 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2389,14 +2389,10 @@ xfs_rmap_query_range( xfs_rmap_query_range_fn fn, void *priv) { - union xfs_btree_irec low_brec; - union xfs_btree_irec high_brec; - struct xfs_rmap_query_range_info query; + union xfs_btree_irec low_brec = { .r = *low_rec }; + union xfs_btree_irec high_brec = { .r = *high_rec }; + struct xfs_rmap_query_range_info query = { .priv = priv, .fn = fn }; - low_brec.r = *low_rec; - high_brec.r = *high_rec; - query.priv = priv; - query.fn = fn; return xfs_btree_query_range(cur, &low_brec, &high_brec, xfs_rmap_query_range_helper, &query); } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index ba0f17bc1dc0..5e174685a77c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -412,7 +412,6 @@ xfs_validate_sb_common( sbp->sb_inodelog < XFS_DINODE_MIN_LOG || sbp->sb_inodelog > XFS_DINODE_MAX_LOG || sbp->sb_inodesize != (1 << sbp->sb_inodelog) || - sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE || sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || XFS_FSB_TO_B(mp, sbp->sb_agblocks) < XFS_MIN_AG_BYTES || XFS_FSB_TO_B(mp, sbp->sb_agblocks) > XFS_MAX_AG_BYTES || @@ -430,6 +429,61 @@ xfs_validate_sb_common( return -EFSCORRUPTED; } + /* + * Logs that are too large are not supported at all. Reject them + * outright. Logs that are too small are tolerated on v4 filesystems, + * but we can only check that when mounting the log. Hence we skip + * those checks here. 
+ */ + if (sbp->sb_logblocks > XFS_MAX_LOG_BLOCKS) { + xfs_notice(mp, + "Log size 0x%x blocks too large, maximum size is 0x%llx blocks", + sbp->sb_logblocks, XFS_MAX_LOG_BLOCKS); + return -EFSCORRUPTED; + } + + if (XFS_FSB_TO_B(mp, sbp->sb_logblocks) > XFS_MAX_LOG_BYTES) { + xfs_warn(mp, + "log size 0x%llx bytes too large, maximum size is 0x%llx bytes", + XFS_FSB_TO_B(mp, sbp->sb_logblocks), + XFS_MAX_LOG_BYTES); + return -EFSCORRUPTED; + } + + /* + * Do not allow filesystems with corrupted log sector or stripe units to + * be mounted. We cannot safely size the iclogs or write to the log if + * the log stripe unit is not valid. + */ + if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) { + if (sbp->sb_logsectsize != (1U << sbp->sb_logsectlog)) { + xfs_notice(mp, + "log sector size in bytes/log2 (0x%x/0x%x) must match", + sbp->sb_logsectsize, 1U << sbp->sb_logsectlog); + return -EFSCORRUPTED; + } + } else if (sbp->sb_logsectsize || sbp->sb_logsectlog) { + xfs_notice(mp, + "log sector size in bytes/log2 (0x%x/0x%x) are not zero", + sbp->sb_logsectsize, sbp->sb_logsectlog); + return -EFSCORRUPTED; + } + + if (sbp->sb_logsunit > 1) { + if (sbp->sb_logsunit % sbp->sb_blocksize) { + xfs_notice(mp, + "log stripe unit 0x%x bytes must be a multiple of block size", + sbp->sb_logsunit); + return -EFSCORRUPTED; + } + if (sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE) { + xfs_notice(mp, + "log stripe unit 0x%x bytes over maximum size (0x%x bytes)", + sbp->sb_logsunit, XLOG_MAX_RECORD_BSIZE); + return -EFSCORRUPTED; + } + } + /* Validate the realtime geometry; stolen from xfs_repair */ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 8b5547073379..cb4796b6e693 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -40,9 +40,8 @@ xfs_trans_ijoin( iip->ili_lock_flags = lock_flags; ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); - /* - * Get a log_item_desc to point at the new item. - */ + /* Reset the per-tx dirty context and add the item to the tx. */ + iip->ili_dirty_flags = 0; xfs_trans_add_item(tp, &iip->ili_item); } @@ -76,17 +75,10 @@ xfs_trans_ichgtime( /* * This is called to mark the fields indicated in fieldmask as needing to be * logged when the transaction is committed. The inode must already be - * associated with the given transaction. - * - * The values for fieldmask are defined in xfs_inode_item.h. We always log all - * of the core inode if any of it has changed, and we always log all of the - * inline data/extents/b-tree root if any of them has changed. - * - * Grab and pin the cluster buffer associated with this inode to avoid RMW - * cycles at inode writeback time. Avoid the need to add error handling to every - * xfs_trans_log_inode() call by shutting down on read error. This will cause - * transactions to fail and everything to error out, just like if we return a - * read error in a dirty transaction and cancel it. + * associated with the given transaction. All we do here is record where the + * inode was dirtied and mark the transaction and inode log item dirty; + * everything else is done in the ->precommit log item operation after the + * changes in the transaction have been completed. 
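A worked example of the stripe unit checks added above: with 4096-byte blocks, sb_logsunit = 65536 passes (a multiple of the block size and no larger than one log record), while 65536 + 512 fails the multiple-of-blocksize check. The same validation can be restated outside the kernel; XLOG_MAX_RECORD_BSIZE is assumed here to be 256k, its current value:

#include <stdbool.h>
#include <stdint.h>

#define MAX_RECORD_BSIZE	(256U * 1024)	/* assumed XLOG_MAX_RECORD_BSIZE */

/* Mirrors the sb_logsunit checks added in the hunk above. */
static bool
log_stripe_unit_valid(uint32_t logsunit, uint32_t blocksize)
{
	if (logsunit <= 1)
		return true;			/* no stripe unit in use */
	if (logsunit % blocksize)
		return false;			/* must be whole blocks */
	if (logsunit > MAX_RECORD_BSIZE)
		return false;			/* must fit in one log record */
	return true;
}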
*/ void xfs_trans_log_inode( @@ -96,7 +88,6 @@ xfs_trans_log_inode( { struct xfs_inode_log_item *iip = ip->i_itemp; struct inode *inode = VFS_I(ip); - uint iversion_flags = 0; ASSERT(iip); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -105,18 +96,6 @@ xfs_trans_log_inode( tp->t_flags |= XFS_TRANS_DIRTY; /* - * Don't bother with i_lock for the I_DIRTY_TIME check here, as races - * don't matter - we either will need an extra transaction in 24 hours - * to log the timestamps, or will clear already cleared fields in the - * worst case. - */ - if (inode->i_state & I_DIRTY_TIME) { - spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; - spin_unlock(&inode->i_lock); - } - - /* * First time we log the inode in a transaction, bump the inode change * counter if it is configured for this to occur. While we have the * inode locked exclusively for metadata modification, we can usually @@ -128,86 +107,10 @@ xfs_trans_log_inode( if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) { if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE)) - iversion_flags = XFS_ILOG_CORE; - } - - /* - * If we're updating the inode core or the timestamps and it's possible - * to upgrade this inode to bigtime format, do so now. - */ - if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && - xfs_has_bigtime(ip->i_mount) && - !xfs_inode_has_bigtime(ip)) { - ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; - flags |= XFS_ILOG_CORE; - } - - /* - * Inode verifiers do not check that the extent size hint is an integer - * multiple of the rt extent size on a directory with both rtinherit - * and extszinherit flags set. If we're logging a directory that is - * misconfigured in this way, clear the hint. - */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { - ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | - XFS_DIFLAG_EXTSZINHERIT); - ip->i_extsize = 0; - flags |= XFS_ILOG_CORE; + flags |= XFS_ILOG_IVERSION; } - /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. - */ - spin_lock(&iip->ili_lock); - iip->ili_fsync_fields |= flags; - - if (!iip->ili_item.li_buf) { - struct xfs_buf *bp; - int error; - - /* - * We hold the ILOCK here, so this inode is not going to be - * flushed while we are here. Further, because there is no - * buffer attached to the item, we know that there is no IO in - * progress, so nothing will clear the ili_fields while we read - * in the buffer. Hence we can safely drop the spin lock and - * read the buffer knowing that the state will not change from - * here. - */ - spin_unlock(&iip->ili_lock); - error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp); - if (error) { - xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR); - return; - } - - /* - * We need an explicit buffer reference for the log item but - * don't want the buffer to remain attached to the transaction. - * Hold the buffer but release the transaction reference once - * we've attached the inode log item to the buffer log item - * list. - */ - xfs_buf_hold(bp); - spin_lock(&iip->ili_lock); - iip->ili_item.li_buf = bp; - bp->b_flags |= _XBF_INODES; - list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); - xfs_trans_brelse(tp, bp); - } - - /* - * Always OR in the bits from the ili_last_fields field. 
This is to - * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines - * in the eventual clearing of the ili_fields bits. See the big comment - * in xfs_iflush() for an explanation of this coordination mechanism. - */ - iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); - spin_unlock(&iip->ili_lock); + iip->ili_dirty_flags |= flags; } int diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 69bc89d0fc68..5bf4326e9783 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -769,14 +769,14 @@ xchk_are_bmaps_contiguous( * mapping or false if there are no more mappings. Caller must ensure that * @info.icur is zeroed before the first call. */ -static int +static bool xchk_bmap_iext_iter( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_bmbt_irec got; struct xfs_ifork *ifp; - xfs_filblks_t prev_len; + unsigned int nr = 0; ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); @@ -790,12 +790,12 @@ xchk_bmap_iext_iter( irec->br_startoff); return false; } + nr++; /* * Iterate subsequent iextent records and merge them with the one * that we just read, if possible. */ - prev_len = irec->br_blockcount; while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) { if (!xchk_are_bmaps_contiguous(irec, &got)) break; @@ -805,20 +805,21 @@ xchk_bmap_iext_iter( got.br_startoff); return false; } - - /* - * Notify the user of mergeable records in the data or attr - * forks. CoW forks only exist in memory so we ignore them. - */ - if (info->whichfork != XFS_COW_FORK && - prev_len + got.br_blockcount > BMBT_BLOCKCOUNT_MASK) - xchk_ino_set_preen(info->sc, info->sc->ip->i_ino); + nr++; irec->br_blockcount += got.br_blockcount; - prev_len = got.br_blockcount; xfs_iext_next(ifp, &info->icur); } + /* + * If the merged mapping could be expressed with fewer bmbt records + * than we actually found, notify the user that this fork could be + * optimized. CoW forks only exist in memory so we ignore them. 
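A worked example of the replacement heuristic that follows: if the iterator merged nr = 3 records into a single 2,000,000-block mapping, then with the 21-bit blockcount limit (XFS_MAX_BMBT_EXTLEN, 2097151 blocks) one record would have sufficed, so 1 < 3 and the fork is flagged for preening. Restated as a self-contained check, with the limit hard-coded as an assumption:

#include <stdbool.h>
#include <stdint.h>

/* Assumed value of XFS_MAX_BMBT_EXTLEN: the 21-bit on-disk blockcount. */
#define MAX_BMBT_EXTLEN		((1ULL << 21) - 1)

/* Could 'nr' merged-together records have been stored in fewer records? */
static bool
fork_needs_preen(uint64_t merged_blockcount, unsigned int nr)
{
	uint64_t min_recs =
		(merged_blockcount + MAX_BMBT_EXTLEN - 1) / MAX_BMBT_EXTLEN;

	return nr > 1 && min_recs < nr;
}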
+ */ + if (nr > 1 && info->whichfork != XFS_COW_FORK && + howmany_64(irec->br_blockcount, XFS_MAX_BMBT_EXTLEN) < nr) + xchk_ino_set_preen(info->sc, info->sc->ip->i_ino); + return true; } diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h index 9d7b9ee8bef4..c32b5fad6174 100644 --- a/fs/xfs/scrub/btree.h +++ b/fs/xfs/scrub/btree.h @@ -60,7 +60,7 @@ struct xchk_btree { static inline size_t xchk_btree_sizeof(unsigned int nlevels) { - return struct_size((struct xchk_btree *)NULL, lastkey, nlevels - 1); + return struct_size_t(struct xchk_btree, lastkey, nlevels - 1); } int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur, diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index b38e93830dde..e113f2f5c254 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -105,10 +105,10 @@ struct xfs_scrub { }; /* XCHK state flags grow up from zero, XREP state flags grow down from 2^31 */ -#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ -#define XCHK_FSGATES_DRAIN (1 << 2) /* defer ops draining enabled */ -#define XCHK_NEED_DRAIN (1 << 3) /* scrub needs to drain defer ops */ -#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ +#define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */ +#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ +#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ +#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ /* * The XCHK_FSGATES* flags reflect functionality in the main filesystem that @@ -105,10 +105,10 @@ ... diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2ef78aa1d3f6..451942fb38ec 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -582,7 +582,6 @@ const struct address_space_operations xfs_address_space_operations = { .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .bmap = xfs_vm_bmap, - .direct_IO = noop_direct_IO, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -591,7 +590,6 @@ const struct address_space_operations xfs_address_space_operations = { const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, - .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index df7322ed73fa..023d4e0385dd 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -452,10 +452,18 @@ xfs_buf_item_format( * This is called to pin the buffer associated with the buf log item in memory * so it cannot be written out. * - * We also always take a reference to the buffer log item here so that the bli - * is held while the item is pinned in memory. This means that we can - * unconditionally drop the reference count a transaction holds when the - * transaction is completed. + * We take a reference to the buffer log item here so that the BLI life cycle + * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and + * inserted into the AIL. + * + * We also need to take a reference to the buffer itself as the BLI unpin + * processing requires accessing the buffer after the BLI has dropped the final + * BLI reference. See xfs_buf_item_unpin() for an explanation. + * If unpins race to drop the final BLI reference and only the + * BLI owns a reference to the buffer, then the loser of the race can have the + * buffer freed from under it (e.g. on shutdown).
Taking a buffer reference per + pin count ensures the life cycle of the buffer extends for as + long as we hold the buffer pin reference in xfs_buf_item_unpin(). */ STATIC void xfs_buf_item_pin( @@ -470,13 +478,30 @@ xfs_buf_item_pin( trace_xfs_buf_item_pin(bip); + xfs_buf_hold(bip->bli_buf); atomic_inc(&bip->bli_refcount); atomic_inc(&bip->bli_buf->b_pin_count); } /* - * This is called to unpin the buffer associated with the buf log item which - * was previously pinned with a call to xfs_buf_item_pin(). + * This is called to unpin the buffer associated with the buf log item which was + * previously pinned with a call to xfs_buf_item_pin(). We enter this function + * with a buffer pin count, a buffer reference and a BLI reference. + * + * We must drop the BLI reference before we unpin the buffer because the AIL + * doesn't acquire a BLI reference whenever it accesses it. Therefore if the + * refcount drops to zero, the bli could still be AIL resident and the buffer + * submitted for I/O at any point before we return. This can result in IO + * completion freeing the buffer while we are still trying to access it here. + * This race condition can also occur in shutdown situations where we abort and + * unpin buffers from contexts other than journal IO completion. + * + * Hence we have to hold a buffer reference per pin count to ensure that the + * buffer cannot be freed until we have finished processing the unpin operation. + * The reference is taken in xfs_buf_item_pin(), and we must hold it until we + * are done processing the buffer state. In the case of an abort (remove = + * true), we re-use the current pin reference as the IO reference we hand + * off to IO failure handling. */ STATIC void xfs_buf_item_unpin( @@ -493,24 +518,18 @@ xfs_buf_item_unpin( trace_xfs_buf_item_unpin(bip); - /* - * Drop the bli ref associated with the pin and grab the hold required - * for the I/O simulation failure in the abort case. We have to do this - * before the pin count drops because the AIL doesn't acquire a bli - * reference. Therefore if the refcount drops to zero, the bli could - * still be AIL resident and the buffer submitted for I/O (and freed on - * completion) at any point before we return. This can be removed once - * the AIL properly holds a reference on the bli. - */ freed = atomic_dec_and_test(&bip->bli_refcount); - if (freed && !stale && remove) - xfs_buf_hold(bp); if (atomic_dec_and_test(&bp->b_pin_count)) wake_up_all(&bp->b_waiters); - /* nothing to do but drop the pin count if the bli is active */ - if (!freed) + /* + * Nothing to do but drop the buffer pin reference if the BLI is + * still active. + */ + if (!freed) { + xfs_buf_rele(bp); return; + } if (stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); @@ -523,6 +542,15 @@ xfs_buf_item_unpin( trace_xfs_buf_item_unpin_stale(bip); /* + * The buffer has been locked and referenced since it was marked + * stale so we own both lock and reference exclusively here. We + * do not need the pin reference any more, so drop it now so + * that we only have one reference to drop once item completion + * processing is complete. + */ + xfs_buf_rele(bp); + + /* + * If we get called here because of an IO error, we may or may + * not have the item on the AIL. xfs_trans_ail_delete() will + * take care of that situation.
xfs_trans_ail_delete() drops @@ -538,16 +566,30 @@ ASSERT(bp->b_log_item == NULL); } xfs_buf_relse(bp); - } else if (remove) { + return; + } + + if (remove) { /* - * The buffer must be locked and held by the caller to simulate - * an async I/O failure. We acquired the hold for this case - * before the buffer was unpinned. + * We need to simulate an async IO failure here to ensure that + * the correct error completion is run on this buffer. This + * requires a reference to the buffer and for the buffer to be + * locked. We can safely pass ownership of the pin reference to + * the IO to ensure that nothing can free the buffer while we + * wait for the lock and then run the IO failure completion. */ xfs_buf_lock(bp); bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); + return; } + + /* + * BLI has no more active references - it will be moved to the AIL to + * manage the remaining BLI/buffer life cycle. There is nothing left for + * us to do here so drop the pin reference to the buffer. + */ + xfs_buf_rele(bp); } STATIC uint diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index f3d328e4a440..7c2fdc71e42d 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -566,20 +566,45 @@ xfs_extent_busy_clear( /* * Flush out all busy extents for this AG. + * + * If the current transaction is holding busy extents, the caller may not want + * to wait for committed busy extents to resolve. If we are being told just to + * try a flush or progress has been made since we last skipped a busy extent, + * return immediately to allow the caller to try again. + * + * If we are freeing extents, we might actually be holding the only free extents + * in the transaction busy list and the log force won't resolve that situation. + * In this case, we must return -EAGAIN to avoid a deadlock by informing the + * caller it needs to commit the busy extents it holds before retrying the + * extent free operation. */ -void +int xfs_extent_busy_flush( - struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_perag *pag, - unsigned busy_gen) + unsigned busy_gen, + uint32_t alloc_flags) { DEFINE_WAIT (wait); int error; - error = xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); if (error) - return; + return error; + + /* Avoid deadlocks on uncommitted busy extents. */ + if (!list_empty(&tp->t_busy)) { + if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH) + return 0; + + if (busy_gen != READ_ONCE(pag->pagb_gen)) + return 0; + + if (alloc_flags & XFS_ALLOC_FLAG_FREEING) + return -EAGAIN; + } + /* Wait for committed busy extents to resolve.
*/ do { prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); if (busy_gen != READ_ONCE(pag->pagb_gen)) @@ -588,6 +613,7 @@ xfs_extent_busy_flush( } while (1); finish_wait(&pag->pagb_wait, &wait); + return 0; } void diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 4a118131059f..c37bf87e6781 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -51,9 +51,9 @@ bool xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno, xfs_extlen_t *len, unsigned *busy_gen); -void -xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag, - unsigned busy_gen); +int +xfs_extent_busy_flush(struct xfs_trans *tp, struct xfs_perag *pag, + unsigned busy_gen, uint32_t alloc_flags); void xfs_extent_busy_wait_all(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index f9e36b810663..f1a5ecf099aa 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -337,6 +337,34 @@ xfs_trans_get_efd( } /* + * Fill the EFD with all extents from the EFI when we need to roll the + * transaction and continue with a new EFI. + * + * This simply copies all the extents in the EFI to the EFD rather than make + * assumptions about which extents in the EFI have already been processed. We + * currently keep the xefi list in the same order as the EFI extent list, but + * that may not always be the case. Copying everything avoids leaving a landmine + * where we fail to cancel all the extents in an EFI if the xefi list is + * processed in a different order to the extents in the EFI. + */ +static void +xfs_efd_from_efi( + struct xfs_efd_log_item *efdp) +{ + struct xfs_efi_log_item *efip = efdp->efd_efip; + uint i; + + ASSERT(efip->efi_format.efi_nextents > 0); + ASSERT(efdp->efd_next_extent < efip->efi_format.efi_nextents); + + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + efdp->efd_format.efd_extents[i] = + efip->efi_format.efi_extents[i]; + } + efdp->efd_next_extent = efip->efi_format.efi_nextents; +} + +/* * Free an extent and log it to the EFD. Note that the transaction is marked * dirty regardless of whether the extent free succeeds or fails to support the * EFI/EFD lifecycle rules. @@ -365,7 +393,7 @@ xfs_trans_free_extent( agbno, xefi->xefi_blockcount); error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, - xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, + xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); /* @@ -378,6 +406,17 @@ xfs_trans_free_extent( tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); + /* + * If we need a new transaction to make progress, the caller will log a + * new EFI with the current contents. It will also log an EFD to cancel + * the existing EFI, and so we need to copy all the unprocessed extents + * in this EFI to the EFD so this works correctly. + */ + if (error == -EAGAIN) { + xfs_efd_from_efi(efdp); + return error; + } + next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); @@ -495,6 +534,13 @@ xfs_extent_free_finish_item( error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + /* + * Don't free the XEFI if we need a new transaction to complete + * processing of it.
+ */ + if (error == -EAGAIN) + return error; + xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); return error; @@ -620,6 +666,7 @@ xfs_efi_item_recover( struct xfs_trans *tp; int i; int error = 0; + bool requeue_only = false; /* * First check the validity of the extents described by the @@ -644,6 +691,7 @@ xfs_efi_item_recover( for (i = 0; i < efip->efi_format.efi_nextents; i++) { struct xfs_extent_free_item fake = { .xefi_owner = XFS_RMAP_OWN_UNKNOWN, + .xefi_agresv = XFS_AG_RESV_NONE, }; struct xfs_extent *extp; @@ -652,9 +700,28 @@ xfs_efi_item_recover( fake.xefi_startblock = extp->ext_start; fake.xefi_blockcount = extp->ext_len; - xfs_extent_free_get_group(mp, &fake); - error = xfs_trans_free_extent(tp, efdp, &fake); - xfs_extent_free_put_group(&fake); + if (!requeue_only) { + xfs_extent_free_get_group(mp, &fake); + error = xfs_trans_free_extent(tp, efdp, &fake); + xfs_extent_free_put_group(&fake); + } + + /* + * If we can't free the extent without potentially deadlocking, + * requeue the rest of the extents to a new transaction so that + * they get run again later with a new transaction context. + */ + if (error == -EAGAIN || requeue_only) { + error = xfs_free_extent_later(tp, fake.xefi_startblock, + fake.xefi_blockcount, + &XFS_RMAP_OINFO_ANY_OWNER, + fake.xefi_agresv); + if (!error) { + requeue_only = true; + continue; + } + } + if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, extp, sizeof(*extp)); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aede746541f8..4f502219ae4f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -306,6 +306,34 @@ xfs_file_read_iter( return ret; } +STATIC ssize_t +xfs_file_splice_read( + struct file *in, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + struct inode *inode = file_inode(in); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + + XFS_STATS_INC(mp, xs_read_calls); + + if (xfs_is_shutdown(mp)) + return -EIO; + + trace_xfs_file_splice_read(ip, *ppos, len); + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = filemap_splice_read(in, ppos, pipe, len, flags); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + if (ret > 0) + XFS_STATS_ADD(mp, xs_read_bytes, ret); + return ret; +} + /* * Common pre-write limit and setup checks.
* @@ -717,14 +745,9 @@ write_retry: if (ret) goto out; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - trace_xfs_file_buffered_write(iocb, from); ret = iomap_file_buffered_write(iocb, from, &xfs_buffered_write_iomap_ops); - if (likely(ret >= 0)) - iocb->ki_pos += ret; /* * If we hit a space limit, try to free up some lingering preallocated @@ -753,7 +776,6 @@ write_retry: goto write_retry; } - current->backing_dev_info = NULL; out: if (iolock) xfs_iunlock(ip, iolock); @@ -1172,7 +1194,7 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | - FMODE_DIO_PARALLEL_WRITE; + FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT; return generic_file_open(inode, file); } @@ -1423,7 +1445,7 @@ const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read_iter = xfs_file_read_iter, .write_iter = xfs_file_write_iter, - .splice_read = generic_file_splice_read, + .splice_read = xfs_file_splice_read, .splice_write = iter_file_splice_write, .iopoll = iocb_bio_iopoll, .unlocked_ioctl = xfs_file_ioctl, diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 22c13933c8f8..2fc98d313708 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -78,7 +78,6 @@ restart: *longest = 0; err = xfs_bmap_longest_free_extent(pag, NULL, longest); if (err) { - xfs_perag_rele(pag); if (err != -EAGAIN) break; /* Couldn't lock the AGF, skip this AG. */ diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 59e7d1a14b67..10403ba9b58f 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -160,9 +160,18 @@ struct xfs_getfsmap_info { struct xfs_buf *agf_bp; /* AGF, for refcount queries */ struct xfs_perag *pag; /* AG info, if applicable */ xfs_daddr_t next_daddr; /* next daddr we expect */ + /* daddr of low fsmap key when we're using the rtbitmap */ + xfs_daddr_t low_daddr; u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ - struct xfs_rmap_irec low; /* low rmap key */ + /* + * Low rmap key for the query. If low.rm_blockcount is nonzero, this + * is the second (or later) call to retrieve the recordset in pieces. + * xfs_getfsmap_rec_before_start will compare all records retrieved + * by the rmapbt query to filter out any records that start before + * the last record. + */ + struct xfs_rmap_irec low; struct xfs_rmap_irec high; /* high rmap key */ bool last; /* last extent? */ }; @@ -237,16 +246,31 @@ xfs_getfsmap_format( xfs_fsmap_from_internal(rec, xfm); } +static inline bool +xfs_getfsmap_rec_before_start( + struct xfs_getfsmap_info *info, + const struct xfs_rmap_irec *rec, + xfs_daddr_t rec_daddr) +{ + if (info->low_daddr != -1ULL) + return rec_daddr < info->low_daddr; + if (info->low.rm_blockcount) + return xfs_rmap_compare(rec, &info->low) < 0; + return false; +} + /* * Format a reverse mapping for getfsmap, having translated rm_startblock - * into the appropriate daddr units. + * into the appropriate daddr units. Pass in a nonzero @len_daddr if the + * length could be larger than rm_blockcount in struct xfs_rmap_irec. 
*/ STATIC int xfs_getfsmap_helper( struct xfs_trans *tp, struct xfs_getfsmap_info *info, const struct xfs_rmap_irec *rec, - xfs_daddr_t rec_daddr) + xfs_daddr_t rec_daddr, + xfs_daddr_t len_daddr) { struct xfs_fsmap fmr; struct xfs_mount *mp = tp->t_mountp; @@ -256,12 +280,15 @@ xfs_getfsmap_helper( if (fatal_signal_pending(current)) return -EINTR; + if (len_daddr == 0) + len_daddr = XFS_FSB_TO_BB(mp, rec->rm_blockcount); + /* * Filter out records that start before our startpoint, if the * caller requested that. */ - if (xfs_rmap_compare(rec, &info->low) < 0) { - rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (xfs_getfsmap_rec_before_start(info, rec, rec_daddr)) { + rec_daddr += len_daddr; if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; return 0; @@ -280,7 +307,7 @@ xfs_getfsmap_helper( info->head->fmh_entries++; - rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + rec_daddr += len_daddr; if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; return 0; @@ -320,7 +347,7 @@ xfs_getfsmap_helper( if (error) return error; fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset); - fmr.fmr_length = XFS_FSB_TO_BB(mp, rec->rm_blockcount); + fmr.fmr_length = len_daddr; if (rec->rm_flags & XFS_RMAP_UNWRITTEN) fmr.fmr_flags |= FMR_OF_PREALLOC; if (rec->rm_flags & XFS_RMAP_ATTR_FORK) @@ -337,7 +364,7 @@ xfs_getfsmap_helper( xfs_getfsmap_format(mp, &fmr, info); out: - rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + rec_daddr += len_daddr; if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; return 0; @@ -358,7 +385,7 @@ xfs_getfsmap_datadev_helper( fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock); rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); - return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); + return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr, 0); } /* Transform a bnobt irec into a fsmap */ @@ -382,7 +409,7 @@ xfs_getfsmap_datadev_bnobt_helper( irec.rm_offset = 0; irec.rm_flags = 0; - return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr); + return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr, 0); } /* Set rmap flags based on the getfsmap flags */ @@ -409,31 +436,25 @@ xfs_getfsmap_logdev( { struct xfs_mount *mp = tp->t_mountp; struct xfs_rmap_irec rmap; - int error; + xfs_daddr_t rec_daddr, len_daddr; + xfs_fsblock_t start_fsb, end_fsb; + uint64_t eofs; - /* Set up search keys */ - info->low.rm_startblock = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); - info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); - error = xfs_fsmap_owner_to_rmap(&info->low, keys); - if (error) - return error; - info->low.rm_blockcount = 0; - xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + if (keys[0].fmr_physical >= eofs) + return 0; + start_fsb = XFS_BB_TO_FSBT(mp, + keys[0].fmr_physical + keys[0].fmr_length); + end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); - error = xfs_fsmap_owner_to_rmap(&info->high, keys + 1); - if (error) - return error; - info->high.rm_startblock = -1U; - info->high.rm_owner = ULLONG_MAX; - info->high.rm_offset = ULLONG_MAX; - info->high.rm_blockcount = 0; - info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; - info->missing_owner = XFS_FMR_OWN_FREE; + /* Adjust the low key if we are continuing from where we left off. 
*/ + if (keys[0].fmr_length > 0) + info->low_daddr = XFS_FSB_TO_BB(mp, start_fsb); - trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high); + trace_xfs_fsmap_low_key_linear(mp, info->dev, start_fsb); + trace_xfs_fsmap_high_key_linear(mp, info->dev, end_fsb); - if (keys[0].fmr_physical > 0) + if (start_fsb > 0) return 0; /* Fabricate an rmap entry for the external log device. */ @@ -443,7 +464,9 @@ xfs_getfsmap_logdev( rmap.rm_offset = 0; rmap.rm_flags = 0; - return xfs_getfsmap_helper(tp, info, &rmap, 0); + rec_daddr = XFS_FSB_TO_BB(mp, rmap.rm_startblock); + len_daddr = XFS_FSB_TO_BB(mp, rmap.rm_blockcount); + return xfs_getfsmap_helper(tp, info, &rmap, rec_daddr, len_daddr); } #ifdef CONFIG_XFS_RT @@ -457,72 +480,58 @@ xfs_getfsmap_rtdev_rtbitmap_helper( { struct xfs_getfsmap_info *info = priv; struct xfs_rmap_irec irec; - xfs_daddr_t rec_daddr; + xfs_rtblock_t rtbno; + xfs_daddr_t rec_daddr, len_daddr; + + rtbno = rec->ar_startext * mp->m_sb.sb_rextsize; + rec_daddr = XFS_FSB_TO_BB(mp, rtbno); + irec.rm_startblock = rtbno; + + rtbno = rec->ar_extcount * mp->m_sb.sb_rextsize; + len_daddr = XFS_FSB_TO_BB(mp, rtbno); + irec.rm_blockcount = rtbno; - irec.rm_startblock = rec->ar_startext * mp->m_sb.sb_rextsize; - rec_daddr = XFS_FSB_TO_BB(mp, irec.rm_startblock); - irec.rm_blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ irec.rm_offset = 0; irec.rm_flags = 0; - return xfs_getfsmap_helper(tp, info, &irec, rec_daddr); + return xfs_getfsmap_helper(tp, info, &irec, rec_daddr, len_daddr); } -/* Execute a getfsmap query against the realtime device. */ +/* Execute a getfsmap query against the realtime device rtbitmap. */ STATIC int -__xfs_getfsmap_rtdev( +xfs_getfsmap_rtdev_rtbitmap( struct xfs_trans *tp, const struct xfs_fsmap *keys, - int (*query_fn)(struct xfs_trans *, - struct xfs_getfsmap_info *), struct xfs_getfsmap_info *info) { + + struct xfs_rtalloc_rec alow = { 0 }; + struct xfs_rtalloc_rec ahigh = { 0 }; struct xfs_mount *mp = tp->t_mountp; - xfs_fsblock_t start_fsb; - xfs_fsblock_t end_fsb; + xfs_rtblock_t start_rtb; + xfs_rtblock_t end_rtb; uint64_t eofs; - int error = 0; + int error; - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rextents * mp->m_sb.sb_rextsize); if (keys[0].fmr_physical >= eofs) return 0; - start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); - end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); + start_rtb = XFS_BB_TO_FSBT(mp, + keys[0].fmr_physical + keys[0].fmr_length); + end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); - /* Set up search keys */ - info->low.rm_startblock = start_fsb; - error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); - if (error) - return error; - info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); - info->low.rm_blockcount = 0; - xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); - - info->high.rm_startblock = end_fsb; - error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); - if (error) - return error; - info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset); - info->high.rm_blockcount = 0; - xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); - - trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high); + info->missing_owner = XFS_FMR_OWN_UNKNOWN; - return query_fn(tp, info); -} + /* Adjust the low key if we are continuing from where we 
left off. */ + if (keys[0].fmr_length > 0) { + info->low_daddr = XFS_FSB_TO_BB(mp, start_rtb); + if (info->low_daddr >= eofs) + return 0; + } -/* Actually query the realtime bitmap. */ -STATIC int -xfs_getfsmap_rtdev_rtbitmap_query( - struct xfs_trans *tp, - struct xfs_getfsmap_info *info) -{ - struct xfs_rtalloc_rec alow = { 0 }; - struct xfs_rtalloc_rec ahigh = { 0 }; - struct xfs_mount *mp = tp->t_mountp; - int error; + trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb); + trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb); xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); @@ -530,8 +539,8 @@ xfs_getfsmap_rtdev_rtbitmap_query( * Set up query parameters to return free rtextents covering the range * we want. */ - alow.ar_startext = info->low.rm_startblock; - ahigh.ar_startext = info->high.rm_startblock; + alow.ar_startext = start_rtb; + ahigh.ar_startext = end_rtb; do_div(alow.ar_startext, mp->m_sb.sb_rextsize); if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) ahigh.ar_startext++; @@ -554,18 +563,6 @@ err: xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); return error; } - -/* Execute a getfsmap query against the realtime device rtbitmap. */ -STATIC int -xfs_getfsmap_rtdev_rtbitmap( - struct xfs_trans *tp, - const struct xfs_fsmap *keys, - struct xfs_getfsmap_info *info) -{ - info->missing_owner = XFS_FMR_OWN_UNKNOWN; - return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query, - info); -} #endif /* CONFIG_XFS_RT */ /* Execute a getfsmap query against the regular data device. */ @@ -606,9 +603,27 @@ __xfs_getfsmap_datadev( error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); if (error) return error; - info->low.rm_blockcount = 0; + info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length); xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + /* Adjust the low key if we are continuing from where we left off. */ + if (info->low.rm_blockcount == 0) { + /* empty */ + } else if (XFS_RMAP_NON_INODE_OWNER(info->low.rm_owner) || + (info->low.rm_flags & (XFS_RMAP_ATTR_FORK | + XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN))) { + info->low.rm_startblock += info->low.rm_blockcount; + info->low.rm_owner = 0; + info->low.rm_offset = 0; + + start_fsb += info->low.rm_blockcount; + if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs) + return 0; + } else { + info->low.rm_offset += info->low.rm_blockcount; + } + info->high.rm_startblock = -1U; info->high.rm_owner = ULLONG_MAX; info->high.rm_offset = ULLONG_MAX; @@ -659,12 +674,8 @@ __xfs_getfsmap_datadev( * Set the AG low key to the start of the AG prior to * moving on to the next AG. 
*/ - if (pag->pag_agno == start_ag) { - info->low.rm_startblock = 0; - info->low.rm_owner = 0; - info->low.rm_offset = 0; - info->low.rm_flags = 0; - } + if (pag->pag_agno == start_ag) + memset(&info->low, 0, sizeof(info->low)); /* * If this is the last AG, report any gap at the end of it @@ -791,6 +802,19 @@ xfs_getfsmap_check_keys( struct xfs_fsmap *low_key, struct xfs_fsmap *high_key) { + if (low_key->fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) { + if (low_key->fmr_offset) + return false; + } + if (high_key->fmr_flags != -1U && + (high_key->fmr_flags & (FMR_OF_SPECIAL_OWNER | + FMR_OF_EXTENT_MAP))) { + if (high_key->fmr_offset && high_key->fmr_offset != -1ULL) + return false; + } + if (high_key->fmr_length && high_key->fmr_length != -1ULL) + return false; + if (low_key->fmr_device > high_key->fmr_device) return false; if (low_key->fmr_device < high_key->fmr_device) @@ -834,15 +858,15 @@ xfs_getfsmap_check_keys( * ---------------- * There are multiple levels of keys and counters at work here: * xfs_fsmap_head.fmh_keys -- low and high fsmap keys passed in; - * these reflect fs-wide sector addrs. + * these reflect fs-wide sector addrs. * dkeys -- fmh_keys used to query each device; - * these are fmh_keys but w/ the low key - * bumped up by fmr_length. + * these are fmh_keys but w/ the low key + * bumped up by fmr_length. * xfs_getfsmap_info.next_daddr -- next disk addr we expect to see; this * is how we detect gaps in the fsmap records and report them. * xfs_getfsmap_info.low/high -- per-AG low/high keys computed from - * dkeys; used to query the metadata. + * dkeys; used to query the metadata. */ int xfs_getfsmap( @@ -863,6 +887,8 @@ xfs_getfsmap( if (!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[0]) || !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) return -EINVAL; + if (!xfs_getfsmap_check_keys(&head->fmh_keys[0], &head->fmh_keys[1])) + return -EINVAL; use_rmap = xfs_has_rmapbt(mp) && has_capability_noaudit(current, CAP_SYS_ADMIN); @@ -901,26 +927,15 @@ xfs_getfsmap( * blocks could be mapped to several other files/offsets. * According to rmapbt record ordering, the minimal next * possible record for the block range is the next starting - * offset in the same inode. Therefore, bump the file offset to - * continue the search appropriately. For all other low key - * mapping types (attr blocks, metadata), bump the physical - * offset as there can be no other mapping for the same physical - * block range. + * offset in the same inode. Therefore, each fsmap backend bumps + * the file offset to continue the search appropriately. For + * all other low key mapping types (attr blocks, metadata), each + * fsmap backend bumps the physical offset as there can be no + * other mapping for the same physical block range. 
*/ dkeys[0] = head->fmh_keys[0]; - if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) { - dkeys[0].fmr_physical += dkeys[0].fmr_length; - dkeys[0].fmr_owner = 0; - if (dkeys[0].fmr_offset) - return -EINVAL; - } else - dkeys[0].fmr_offset += dkeys[0].fmr_length; - dkeys[0].fmr_length = 0; memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap)); - if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) - return -EINVAL; - info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; info.fsmap_recs = fsmap_recs; @@ -960,6 +975,8 @@ xfs_getfsmap( info.dev = handlers[i].dev; info.last = false; info.pag = NULL; + info.low_daddr = -1ULL; + info.low.rm_blockcount = 0; error = handlers[i].fn(tp, dkeys, &info); if (error) break; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 13851c0d640b..7cb75cb6b8e9 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -93,7 +93,7 @@ xfs_growfs_data_private( xfs_agnumber_t nagimax = 0; xfs_rfsblock_t nb, nb_div, nb_mod; int64_t delta; - bool lastag_extended; + bool lastag_extended = false; xfs_agnumber_t oagcount; struct xfs_trans *tp; struct aghdr_init_data id = {}; @@ -115,11 +115,16 @@ xfs_growfs_data_private( nb_div = nb; nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); - nagcount = nb_div + (nb_mod != 0); - if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { - nagcount--; - nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; + if (nb_mod && nb_mod >= XFS_MIN_AG_BLOCKS) + nb_div++; + else if (nb_mod) + nb = nb_div * mp->m_sb.sb_agblocks; + + if (nb_div > XFS_MAX_AGNUMBER + 1) { + nb_div = XFS_MAX_AGNUMBER + 1; + nb = nb_div * mp->m_sb.sb_agblocks; } + nagcount = nb_div; delta = nb - mp->m_sb.sb_dblocks; /* * Reject filesystems with a single AG because they are not @@ -140,9 +145,13 @@ xfs_growfs_data_private( return -EINVAL; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, - (delta > 0 ? XFS_GROWFS_SPACE_RES(mp) : -delta), 0, - XFS_TRANS_RESERVE, &tp); + if (delta > 0) + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, + XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, + &tp); + else + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, -delta, 0, + 0, &tp); if (error) return error; @@ -534,6 +543,9 @@ xfs_do_force_shutdown( } else if (flags & SHUTDOWN_CORRUPT_ONDISK) { tag = XFS_PTAG_SHUTDOWN_CORRUPT; why = "Corruption of on-disk metadata"; + } else if (flags & SHUTDOWN_DEVICE_REMOVED) { + tag = XFS_PTAG_SHUTDOWN_IOERROR; + why = "Block device removal"; } else { tag = XFS_PTAG_SHUTDOWN_IOERROR; why = "Metadata I/O Error"; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0f60e301eb1f..453890942d9f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -454,6 +454,27 @@ xfs_inodegc_queue_all( return ret; } +/* Wait for all queued work and collect errors */ +static int +xfs_inodegc_wait_all( + struct xfs_mount *mp) +{ + int cpu; + int error = 0; + + flush_workqueue(mp->m_inodegc_wq); + for_each_online_cpu(cpu) { + struct xfs_inodegc *gc; + + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (gc->error && !error) + error = gc->error; + gc->error = 0; + } + + return error; +} + /* * Check the validity of the inode we just found in the cache */ @@ -1491,15 +1512,14 @@ xfs_blockgc_free_space( if (error) return error; - xfs_inodegc_flush(mp); - return 0; + return xfs_inodegc_flush(mp); } /* * Reclaim all the free space that we can by scheduling the background blockgc * and inodegc workers immediately and waiting for them all to clear.
*/ -void +int xfs_blockgc_flush_all( struct xfs_mount *mp) { @@ -1520,7 +1540,7 @@ xfs_blockgc_flush_all( for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) flush_delayed_work(&pag->pag_blockgc_work); - xfs_inodegc_flush(mp); + return xfs_inodegc_flush(mp); } /* @@ -1842,13 +1862,17 @@ xfs_inodegc_set_reclaimable( * This is the last chance to make changes to an otherwise unreferenced file * before incore reclamation happens. */ -static void +static int xfs_inodegc_inactivate( struct xfs_inode *ip) { + int error; + trace_xfs_inode_inactivating(ip); - xfs_inactive(ip); + error = xfs_inactive(ip); xfs_inodegc_set_reclaimable(ip); + return error; + } void @@ -1880,8 +1904,12 @@ xfs_inodegc_worker( WRITE_ONCE(gc->shrinker_hits, 0); llist_for_each_entry_safe(ip, n, node, i_gclist) { + int error; + xfs_iflags_set(ip, XFS_INACTIVATING); - xfs_inodegc_inactivate(ip); + error = xfs_inodegc_inactivate(ip); + if (error && !gc->error) + gc->error = error; } memalloc_nofs_restore(nofs_flag); @@ -1905,13 +1933,13 @@ xfs_inodegc_push( * Force all currently queued inode inactivation work to run immediately and * wait for the work to finish. */ -void +int xfs_inodegc_flush( struct xfs_mount *mp) { xfs_inodegc_push(mp); trace_xfs_inodegc_flush(mp, __return_address); - flush_workqueue(mp->m_inodegc_wq); + return xfs_inodegc_wait_all(mp); } /* diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 87910191a9dd..1dcdcb23796e 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -62,7 +62,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, unsigned int iwalk_flags); int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags); int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm); -void xfs_blockgc_flush_all(struct xfs_mount *mp); +int xfs_blockgc_flush_all(struct xfs_mount *mp); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); @@ -80,7 +80,7 @@ void xfs_blockgc_start(struct xfs_mount *mp); void xfs_inodegc_worker(struct work_struct *work); void xfs_inodegc_push(struct xfs_mount *mp); -void xfs_inodegc_flush(struct xfs_mount *mp); +int xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5808abab786c..9e62cc500140 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1620,16 +1620,7 @@ xfs_inactive_ifree( */ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); - /* - * Just ignore errors at this point. There is nothing we can do except - * to try to keep going. Make sure it's not a silent error. - */ - error = xfs_trans_commit(tp); - if (error) - xfs_notice(mp, "%s: xfs_trans_commit returned error %d", - __func__, error); - - return 0; + return xfs_trans_commit(tp); } /* @@ -1693,12 +1684,12 @@ xfs_inode_needs_inactive( * now be truncated. Also, we clear all of the read-ahead state * kept for the inode here since the file is now closed. */ -void +int xfs_inactive( xfs_inode_t *ip) { struct xfs_mount *mp; - int error; + int error = 0; int truncate = 0; /* @@ -1736,7 +1727,7 @@ xfs_inactive( * reference to the inode at this point anyways. */ if (xfs_can_free_eofblocks(ip, true)) - xfs_free_eofblocks(ip); + error = xfs_free_eofblocks(ip); goto out; } @@ -1773,7 +1764,7 @@ xfs_inactive( /* * Free the inode. 
*/ - xfs_inactive_ifree(ip); + error = xfs_inactive_ifree(ip); out: /* @@ -1781,6 +1772,7 @@ out: * the attached dquots. */ xfs_qm_dqdetach(ip); + return error; } /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 69d21e42c10a..7547caf2f2ab 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -470,7 +470,7 @@ enum layout_break_reason { (xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID)) int xfs_release(struct xfs_inode *ip); -void xfs_inactive(struct xfs_inode *ip); +int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct mnt_idmap *idmap, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index ca2941ab6cbc..91c847a84e10 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -29,6 +29,153 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_inode_log_item, ili_item); } +static uint64_t +xfs_inode_item_sort( + struct xfs_log_item *lip) +{ + return INODE_ITEM(lip)->ili_inode->i_ino; +} + +/* + * Prior to finally logging the inode, we have to ensure that all the + * per-modification inode state changes are applied. This includes VFS inode + * state updates, format conversions, verifier state synchronisation and + * ensuring the inode buffer remains in memory whilst the inode is dirty. + * + * We have to be careful when we grab the inode cluster buffer due to lock + * ordering constraints. The unlinked inode modifications (xfs_iunlink_item) + * require AGI -> inode cluster buffer lock order. The inode cluster buffer is + * not locked until ->precommit, so it happens after everything else has been + * modified. + * + * Further, we have AGI -> AGF lock ordering, and with O_TMPFILE handling we + * have AGI -> AGF -> iunlink item -> inode cluster buffer lock order. Hence we + * cannot safely lock the inode cluster buffer in xfs_trans_log_inode() because + * it can be called on an inode (e.g. via bumplink/droplink) before we take the + * AGF lock modifying directory blocks. + * + * Rather than force a complete rework of all the transactions to call + * xfs_trans_log_inode() once and once only at the end of every transaction, we + * move the pinning of the inode cluster buffer to a ->precommit operation. This + * matches how the xfs_iunlink_item locks the inode cluster buffer, and it + * ensures that the inode cluster buffer locking is always done last in a + * transaction. i.e. we ensure the lock order is always AGI -> AGF -> inode + * cluster buffer. + * + * If we return the inode number as the precommit sort key then we'll also + * guarantee that the order of all inode cluster buffer locking is the same for + * all the inodes and unlink items in the transaction. + */ +static int +xfs_inode_item_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct inode *inode = VFS_I(ip); + unsigned int flags = iip->ili_dirty_flags; + + /* + * Don't bother with i_lock for the I_DIRTY_TIME check here, as races + * don't matter - we either will need an extra transaction in 24 hours + * to log the timestamps, or will clear already cleared fields in the + * worst case.
+ */ + if (inode->i_state & I_DIRTY_TIME) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + } + + /* + * If we're updating the inode core or the timestamps and it's possible + * to upgrade this inode to bigtime format, do so now. + */ + if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && + xfs_has_bigtime(ip->i_mount) && + !xfs_inode_has_bigtime(ip)) { + ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; + flags |= XFS_ILOG_CORE; + } + + /* + * Inode verifiers do not check that the extent size hint is an integer + * multiple of the rt extent size on a directory with both rtinherit + * and extszinherit flags set. If we're logging a directory that is + * misconfigured in this way, clear the hint. + */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + + /* + * Record the specific change for fdatasync optimisation. This allows + * fdatasync to skip log forces for inodes that are only timestamp + * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it + * to XFS_ILOG_CORE so that the actual on-disk dirty tracking + * (ili_fields) correctly tracks that the version has changed. + */ + spin_lock(&iip->ili_lock); + iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); + if (flags & XFS_ILOG_IVERSION) + flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + + if (!iip->ili_item.li_buf) { + struct xfs_buf *bp; + int error; + + /* + * We hold the ILOCK here, so this inode is not going to be + * flushed while we are here. Further, because there is no + * buffer attached to the item, we know that there is no IO in + * progress, so nothing will clear the ili_fields while we read + * in the buffer. Hence we can safely drop the spin lock and + * read the buffer knowing that the state will not change from + * here. + */ + spin_unlock(&iip->ili_lock); + error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp); + if (error) + return error; + + /* + * We need an explicit buffer reference for the log item but + * don't want the buffer to remain attached to the transaction. + * Hold the buffer but release the transaction reference once + * we've attached the inode log item to the buffer log item + * list. + */ + xfs_buf_hold(bp); + spin_lock(&iip->ili_lock); + iip->ili_item.li_buf = bp; + bp->b_flags |= _XBF_INODES; + list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); + xfs_trans_brelse(tp, bp); + } + + /* + * Always OR in the bits from the ili_last_fields field. This is to + * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines + * in the eventual clearing of the ili_fields bits. See the big comment + * in xfs_iflush() for an explanation of this coordination mechanism. + */ + iip->ili_fields |= (flags | iip->ili_last_fields); + spin_unlock(&iip->ili_lock); + + /* + * We are done with the log item transaction dirty state, so clear it so + * that it doesn't pollute future transactions. + */ + iip->ili_dirty_flags = 0; + return 0; +} + /* * The logged size of an inode fork is always the current size of the inode * fork. 
This means that when an inode fork is relogged, the size of the logged @@ -662,6 +809,8 @@ xfs_inode_item_committing( } static const struct xfs_item_ops xfs_inode_item_ops = { + .iop_sort = xfs_inode_item_sort, + .iop_precommit = xfs_inode_item_precommit, .iop_size = xfs_inode_item_size, .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index bbd836a44ff0..377e06007804 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -17,6 +17,7 @@ struct xfs_inode_log_item { struct xfs_log_item ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode ptr */ unsigned short ili_lock_flags; /* inode lock flags */ + unsigned int ili_dirty_flags; /* dirty in current tx */ /* * The ili_lock protects the interactions between the dirty state and * the flush state of the inode log item. This allows us to do atomic diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fc61cc024023..79004d193e54 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -639,7 +639,6 @@ xfs_log_mount( int num_bblks) { struct xlog *log; - bool fatal = xfs_has_crc(mp); int error = 0; int min_logfsbs; @@ -663,53 +662,37 @@ xfs_log_mount( mp->m_log = log; /* - * Validate the given log space and drop a critical message via syslog - * if the log size is too small that would lead to some unexpected - * situations in transaction log space reservation stage. + * Now that we have set up the log and its internal geometry + * parameters, we can validate the given log space and drop a critical + * message via syslog if the log size is too small. A log that is too + * small can lead to unexpected situations in the transaction log space + * reservation stage. The superblock verifier has already validated all + * the other log geometry constraints, so we don't have to check those + * here. * - * Note: we can't just reject the mount if the validation fails. This - * would mean that people would have to downgrade their kernel just to - * remedy the situation as there is no way to grow the log (short of - * black magic surgery with xfs_db). + * Note: For v4 filesystems, we can't just reject the mount if the + * validation fails. This would mean that people would have to + * downgrade their kernel just to remedy the situation as there is no + * way to grow the log (short of black magic surgery with xfs_db). * - * We can, however, reject mounts for CRC format filesystems, as the + * We can, however, reject mounts for V5 format filesystems, as the * mkfs binary being used to make the filesystem should never create a * filesystem with a log that is too small.
*/ min_logfsbs = xfs_log_calc_minimum_size(mp); - if (mp->m_sb.sb_logblocks < min_logfsbs) { xfs_warn(mp, "Log size %d blocks too small, minimum size is %d blocks", mp->m_sb.sb_logblocks, min_logfsbs); - error = -EINVAL; - } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { - xfs_warn(mp, - "Log size %d blocks too large, maximum size is %lld blocks", - mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); - error = -EINVAL; - } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { - xfs_warn(mp, - "log size %lld bytes too large, maximum size is %lld bytes", - XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), - XFS_MAX_LOG_BYTES); - error = -EINVAL; - } else if (mp->m_sb.sb_logsunit > 1 && - mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) { - xfs_warn(mp, - "log stripe unit %u bytes must be a multiple of block size", - mp->m_sb.sb_logsunit); - error = -EINVAL; - fatal = true; - } - if (error) { + /* * Log check errors are always fatal on v5; or whenever bad * metadata leads to a crash. */ - if (fatal) { + if (xfs_has_crc(mp)) { xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); ASSERT(0); + error = -EINVAL; goto out_free_log; } xfs_crit(mp, "Log size out of supported range."); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 322eb2ee6c55..82c81d20459d 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2711,7 +2711,9 @@ xlog_recover_iunlink_bucket( * just to flush the inodegc queue and wait for it to * complete. */ - xfs_inodegc_flush(mp); + error = xfs_inodegc_flush(mp); + if (error) + break; } prev_agino = agino; @@ -2719,10 +2721,15 @@ xlog_recover_iunlink_bucket( } if (prev_ip) { + int error2; + ip->i_prev_unlinked = prev_agino; xfs_irele(prev_ip); + + error2 = xfs_inodegc_flush(mp); + if (error2 && !error) + return error2; } - xfs_inodegc_flush(mp); return error; } @@ -2789,7 +2796,6 @@ xlog_recover_iunlink_ag( * bucket and remaining inodes on it unreferenced and * unfreeable. */ - xfs_inodegc_flush(pag->pag_mount); xlog_recover_clear_agi_bucket(pag, bucket); } } @@ -2806,13 +2812,6 @@ xlog_recover_process_iunlinks( for_each_perag(log->l_mp, agno, pag) xlog_recover_iunlink_ag(pag); - - /* - * Flush the pending unlinked inodes to ensure that the inactivations - * are fully completed on disk and the incore inodes can be reclaimed - * before we signal that recovery is complete. 
- */ - xfs_inodegc_flush(log->l_mp); } STATIC void diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index aaaf5ec13492..e2866e7fa60c 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -62,6 +62,7 @@ struct xfs_error_cfg { struct xfs_inodegc { struct llist_head list; struct delayed_work work; + int error; /* approximate count of inodes in the list */ unsigned int items; @@ -457,12 +458,14 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname, #define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */ #define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */ #define SHUTDOWN_CORRUPT_ONDISK (1u << 4) /* corrupt metadata on device */ +#define SHUTDOWN_DEVICE_REMOVED (1u << 5) /* device removed underneath us */ #define XFS_SHUTDOWN_STRINGS \ { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \ { SHUTDOWN_LOG_IO_ERROR, "log_io" }, \ { SHUTDOWN_FORCE_UMOUNT, "force_umount" }, \ - { SHUTDOWN_CORRUPT_INCORE, "corruption" } + { SHUTDOWN_CORRUPT_INCORE, "corruption" }, \ + { SHUTDOWN_DEVICE_REMOVED, "device_removed" } /* * Flags for xfs_mountfs diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index c4078d0ec108..4a9bbd3fe120 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -114,7 +114,8 @@ xfs_dax_notify_ddev_failure( int error = 0; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); - xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen); + xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, + daddr + bblen - 1); xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); error = xfs_trans_alloc_empty(mp, &tp); @@ -210,7 +211,7 @@ xfs_dax_notify_failure( ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; /* Ignore the range out of filesystem area */ - if (offset + len < ddev_start) + if (offset + len - 1 < ddev_start) return -ENXIO; if (offset > ddev_end) return -ENXIO; @@ -222,8 +223,8 @@ xfs_dax_notify_failure( len -= ddev_start - offset; offset = 0; } - if (offset + len > ddev_end) - len -= ddev_end - offset; + if (offset + len - 1 > ddev_end) + len = ddev_end - offset + 1; return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len), mf_flags); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index f5dc46ce9803..eb9102453aff 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -616,8 +616,11 @@ xfs_reflink_cancel_cow_blocks( xfs_refcount_free_cow_extent(*tpp, del.br_startblock, del.br_blockcount); - xfs_free_extent_later(*tpp, del.br_startblock, - del.br_blockcount, NULL); + error = xfs_free_extent_later(*tpp, del.br_startblock, + del.br_blockcount, NULL, + XFS_AG_RESV_NONE); + if (error) + break; /* Roll the transaction */ error = xfs_defer_finish(tpp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 7e706255f165..818510243130 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -377,6 +377,17 @@ disable_dax: return 0; } +static void +xfs_bdev_mark_dead( + struct block_device *bdev) +{ + xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED); +} + +static const struct blk_holder_ops xfs_holder_ops = { + .mark_dead = xfs_bdev_mark_dead, +}; + STATIC int xfs_blkdev_get( xfs_mount_t *mp, @@ -385,8 +396,8 @@ xfs_blkdev_get( { int error = 0; - *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - mp); + *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp, + &xfs_holder_ops); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); xfs_warn(mp, 
"Invalid device [%s], error=%d", name, error); @@ -397,10 +408,11 @@ xfs_blkdev_get( STATIC void xfs_blkdev_put( + struct xfs_mount *mp, struct block_device *bdev) { if (bdev) - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, mp); } STATIC void @@ -411,13 +423,13 @@ xfs_close_devices( struct block_device *logdev = mp->m_logdev_targp->bt_bdev; xfs_free_buftarg(mp->m_logdev_targp); - xfs_blkdev_put(logdev); + xfs_blkdev_put(mp, logdev); } if (mp->m_rtdev_targp) { struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; xfs_free_buftarg(mp->m_rtdev_targp); - xfs_blkdev_put(rtdev); + xfs_blkdev_put(mp, rtdev); } xfs_free_buftarg(mp->m_ddev_targp); } @@ -492,10 +504,10 @@ xfs_open_devices( out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - xfs_blkdev_put(rtdev); + xfs_blkdev_put(mp, rtdev); out_close_logdev: if (logdev && logdev != ddev) - xfs_blkdev_put(logdev); + xfs_blkdev_put(mp, logdev); return error; } @@ -1100,6 +1112,7 @@ xfs_inodegc_init_percpu( #endif init_llist_head(&gc->list); gc->items = 0; + gc->error = 0; INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); } return 0; @@ -1159,6 +1172,13 @@ xfs_fs_free_cached_objects( return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); } +static void +xfs_fs_shutdown( + struct super_block *sb) +{ + xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED); +} + static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, @@ -1172,6 +1192,7 @@ static const struct super_operations xfs_super_operations = { .show_options = xfs_fs_show_options, .nr_cached_objects = xfs_fs_nr_cached_objects, .free_cached_objects = xfs_fs_free_cached_objects, + .shutdown = xfs_fs_shutdown, }; static int @@ -1686,10 +1707,6 @@ xfs_fs_fill_super( goto out_filestream_unmount; } - if (xfs_has_large_extent_counts(mp)) - xfs_warn(mp, - "EXPERIMENTAL Large extent counts feature in use. 
Use at your own risk!"); - error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index cd4ca5b1fcb0..f3cc204bb4bf 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1445,7 +1445,6 @@ DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write); - DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int whichfork, struct xfs_bmbt_irec *irec), @@ -1535,6 +1534,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append); +DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read); DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), @@ -3623,6 +3623,31 @@ DEFINE_FSMAP_EVENT(xfs_fsmap_low_key); DEFINE_FSMAP_EVENT(xfs_fsmap_high_key); DEFINE_FSMAP_EVENT(xfs_fsmap_mapping); +DECLARE_EVENT_CLASS(xfs_fsmap_linear_class, + TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno), + TP_ARGS(mp, keydev, bno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(xfs_fsblock_t, bno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->keydev = new_decode_dev(keydev); + __entry->bno = bno; + ), + TP_printk("dev %d:%d keydev %d:%d bno 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->bno) +) +#define DEFINE_FSMAP_LINEAR_EVENT(name) \ +DEFINE_EVENT(xfs_fsmap_linear_class, name, \ + TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno), \ + TP_ARGS(mp, keydev, bno)) +DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_low_key_linear); +DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_high_key_linear); + DECLARE_EVENT_CLASS(xfs_getfsmap_class, TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), TP_ARGS(mp, fsmap), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 8afc0c080861..8c0bfc9a33b1 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -290,7 +290,9 @@ retry: * Do not perform a synchronous scan because callers can hold * other locks. */ - xfs_blockgc_flush_all(mp); + error = xfs_blockgc_flush_all(mp); + if (error) + return error; want_retry = false; goto retry; } @@ -970,6 +972,11 @@ __xfs_trans_commit( error = xfs_defer_finish_noroll(&tp); if (error) goto out_unreserve; + + /* Run precommits from final tx in defer chain. */ + error = xfs_trans_run_precommits(tp); + if (error) + goto out_unreserve; } /* diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 7d4109af193e..1098452e7f95 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -823,7 +823,7 @@ xfs_trans_ail_update_bulk( trace_xfs_ail_insert(lip, 0, lsn); } lip->li_lsn = lsn; - list_add(&lip->li_ail, &tmp); + list_add_tail(&lip->li_ail, &tmp); } if (!list_empty(&tmp)) |