diff options
Diffstat (limited to 'fs/xfs/scrub')
98 files changed, 32978 insertions, 1509 deletions
diff --git a/fs/xfs/scrub/agb_bitmap.c b/fs/xfs/scrub/agb_bitmap.c new file mode 100644 index 000000000000..573e4e062754 --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "bitmap.h" +#include "scrub/agb_bitmap.h" + +/* + * Record all btree blocks seen while iterating all records of a btree. + * + * We know that the btree query_all function starts at the left edge and walks + * towards the right edge of the tree. Therefore, we know that we can walk up + * the btree cursor towards the root; if the pointer for a given level points + * to the first record/key in that block, we haven't seen this block before; + * and therefore we need to remember that we saw this block in the btree. + * + * So if our btree is: + * + * 4 + * / | \ + * 1 2 3 + * + * Pretend for this example that each leaf block has 100 btree records. For + * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we + * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so + * we record block 4. The list is [1, 4]. + * + * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit + * the loop. The list remains [1, 4]. + * + * For the 101st btree record, we've moved onto leaf block 2. Now + * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that + * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. + * + * For the 102nd record, bc_levels[0].ptr == 2, so we continue. + * + * For the 201st record, we've moved on to leaf block 3. + * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. + * + * For the 300th record we just exit, with the list being [1, 4, 2, 3]. + */ + +/* Mark a btree block to the agblock bitmap. */ +STATIC int +xagb_bitmap_visit_btblock( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +int +xagb_bitmap_set_btblocks( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, + XFS_BTREE_VISIT_ALL, bitmap); +} + +/* + * Record all the buffers pointed to by the btree cursor. Callers already + * engaged in a btree walk should call this function to capture the list of + * blocks going from the leaf towards the root. + */ +int +xagb_bitmap_set_btcur_path( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + int i; + int error; + + for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { + error = xagb_bitmap_visit_btblock(cur, i, bitmap); + if (error) + return error; + } + + return 0; +} diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h new file mode 100644 index 000000000000..e488e1f4f63d --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.h @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_AGB_BITMAP_H__ +#define __XFS_SCRUB_AGB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_agblock_t */ + +struct xagb_bitmap { + struct xbitmap32 agbitmap; +}; + +static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->agbitmap); +} + +static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_clear(&bitmap->agbitmap, start, len); +} +static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->agbitmap, start, len); +} + +static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t *len) +{ + return xbitmap32_test(&bitmap->agbitmap, start, len); +} + +static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, + struct xagb_bitmap *sub) +{ + return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap); +} + +static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) +{ + return xbitmap32_hweight(&bitmap->agbitmap); +} +static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) +{ + return xbitmap32_empty(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->agbitmap, fn, priv); +} + +int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); +int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); + +static inline uint32_t xagb_bitmap_count_set_regions(struct xagb_bitmap *b) +{ + return xbitmap32_count_set_regions(&b->agbitmap); +} + +#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 6c6e5eba42c8..0f2f1852d58f 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -15,6 +15,7 @@ #include "xfs_ialloc.h" #include "xfs_rmap.h" #include "xfs_ag.h" +#include "xfs_inode.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -59,6 +60,30 @@ xchk_superblock_xref( } /* + * Calculate the ondisk superblock size in bytes given the feature set of the + * mounted filesystem (aka the primary sb). This is subtlely different from + * the logic in xfs_repair, which computes the size of a secondary sb given the + * featureset listed in the secondary sb. + */ +STATIC size_t +xchk_superblock_ondisk_size( + struct xfs_mount *mp) +{ + if (xfs_has_metauuid(mp)) + return offsetofend(struct xfs_dsb, sb_meta_uuid); + if (xfs_has_crc(mp)) + return offsetofend(struct xfs_dsb, sb_lsn); + if (xfs_sb_version_hasmorebits(&mp->m_sb)) + return offsetofend(struct xfs_dsb, sb_bad_features2); + if (xfs_has_logv2(mp)) + return offsetofend(struct xfs_dsb, sb_logsunit); + if (xfs_has_sector(mp)) + return offsetofend(struct xfs_dsb, sb_logsectsize); + /* only support dirv2 or more recent */ + return offsetofend(struct xfs_dsb, sb_dirblklog); +} + +/* * Scrub the filesystem superblock. * * Note: We do /not/ attempt to check AG 0's superblock. Mount is @@ -74,6 +99,7 @@ xchk_superblock( struct xfs_buf *bp; struct xfs_dsb *sb; struct xfs_perag *pag; + size_t sblen; xfs_agnumber_t agno; uint32_t v2_ok; __be32 features_mask; @@ -165,8 +191,7 @@ xchk_superblock( xchk_block_set_corrupt(sc, bp); /* Check sb_versionnum bits that are set at mkfs time. */ - vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS | - XFS_SB_VERSION_NUMBITS | + vernum_mask = cpu_to_be16(XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALIGNBIT | XFS_SB_VERSION_DALIGNBIT | XFS_SB_VERSION_SHAREDBIT | @@ -350,8 +375,8 @@ xchk_superblock( } /* Everything else must be zero. */ - if (memchr_inv(sb + 1, 0, - BBTOB(bp->b_length) - sizeof(struct xfs_dsb))) + sblen = xchk_superblock_ondisk_size(mp); + if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen)) xchk_block_set_corrupt(sc, bp); xchk_superblock_xref(sc, bp); @@ -434,7 +459,7 @@ xchk_agf_xref_btreeblks( { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_mount *mp = sc->mp; - xfs_agblock_t blocks; + xfs_filblks_t blocks; xfs_agblock_t btreeblks; int error; @@ -483,7 +508,7 @@ xchk_agf_xref_refcblks( struct xfs_scrub *sc) { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; if (!sc->sa.refc_cur) @@ -556,28 +581,28 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); /* Check the AGF btree roots and levels */ - agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]); + agbno = be32_to_cpu(agf->agf_bno_root); if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]); + agbno = be32_to_cpu(agf->agf_cnt_root); if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + level = be32_to_cpu(agf->agf_bno_level); if (level <= 0 || level > mp->m_alloc_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + level = be32_to_cpu(agf->agf_cnt_level); if (level <= 0 || level > mp->m_alloc_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); if (xfs_has_rmapbt(mp)) { - agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]); + agbno = be32_to_cpu(agf->agf_rmap_root); if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + level = be32_to_cpu(agf->agf_rmap_level); if (level <= 0 || level > mp->m_rmap_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); } @@ -816,7 +841,7 @@ xchk_agi_xref_fiblocks( struct xfs_scrub *sc) { struct xfs_agi *agi = sc->sa.agi_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error = 0; if (!xfs_has_inobtcounts(sc->mp)) @@ -865,6 +890,43 @@ xchk_agi_xref( /* scrub teardown will take care of sc->sa for us */ } +/* + * Check the unlinked buckets for links to bad inodes. We hold the AGI, so + * there cannot be any threads updating unlinked list pointers in this AG. + */ +STATIC void +xchk_iunlink( + struct xfs_scrub *sc, + struct xfs_agi *agi) +{ + unsigned int i; + struct xfs_inode *ip; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + xfs_agino_t agino = be32_to_cpu(agi->agi_unlinked[i]); + + while (agino != NULLAGINO) { + if (agino % XFS_AGI_UNLINKED_BUCKETS != i) { + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + return; + } + + ip = xfs_iunlink_lookup(sc->sa.pag, agino); + if (!ip) { + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + return; + } + + if (!xfs_inode_on_unlinked_list(ip)) { + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + return; + } + + agino = ip->i_next_unlinked; + } + } +} + /* Scrub the AGI. */ int xchk_agi( @@ -949,6 +1011,8 @@ xchk_agi( if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); + xchk_iunlink(sc, agi); + xchk_agi_xref(sc); out: return error; diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 058b6c305224..69b003259784 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -21,12 +21,18 @@ #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_ag.h" +#include "xfs_inode.h" +#include "xfs_iunlink_item.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/agino_bitmap.h" #include "scrub/reap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" /* Superblock */ @@ -72,7 +78,7 @@ xrep_superblock( /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); - return error; + return 0; } /* AGF */ @@ -173,8 +179,7 @@ xrep_agf_find_btrees( * We relied on the rmapbt to reconstruct the AGF. If we get a * different root then something's seriously wrong. */ - if (fab[XREP_AGF_RMAPBT].root != - be32_to_cpu(old_agf->agf_roots[XFS_BTNUM_RMAPi])) + if (fab[XREP_AGF_RMAPBT].root != be32_to_cpu(old_agf->agf_rmap_root)) return -EFSCORRUPTED; /* We must find the refcountbt root if that feature is enabled. */ @@ -223,20 +228,14 @@ xrep_agf_set_roots( struct xfs_agf *agf, struct xrep_find_ag_btree *fab) { - agf->agf_roots[XFS_BTNUM_BNOi] = - cpu_to_be32(fab[XREP_AGF_BNOBT].root); - agf->agf_levels[XFS_BTNUM_BNOi] = - cpu_to_be32(fab[XREP_AGF_BNOBT].height); + agf->agf_bno_root = cpu_to_be32(fab[XREP_AGF_BNOBT].root); + agf->agf_bno_level = cpu_to_be32(fab[XREP_AGF_BNOBT].height); - agf->agf_roots[XFS_BTNUM_CNTi] = - cpu_to_be32(fab[XREP_AGF_CNTBT].root); - agf->agf_levels[XFS_BTNUM_CNTi] = - cpu_to_be32(fab[XREP_AGF_CNTBT].height); + agf->agf_cnt_root = cpu_to_be32(fab[XREP_AGF_CNTBT].root); + agf->agf_cnt_level = cpu_to_be32(fab[XREP_AGF_CNTBT].height); - agf->agf_roots[XFS_BTNUM_RMAPi] = - cpu_to_be32(fab[XREP_AGF_RMAPBT].root); - agf->agf_levels[XFS_BTNUM_RMAPi] = - cpu_to_be32(fab[XREP_AGF_RMAPBT].height); + agf->agf_rmap_root = cpu_to_be32(fab[XREP_AGF_RMAPBT].root); + agf->agf_rmap_level = cpu_to_be32(fab[XREP_AGF_RMAPBT].height); if (xfs_has_reflink(sc->mp)) { agf->agf_refcount_root = @@ -257,12 +256,11 @@ xrep_agf_calc_from_btrees( struct xfs_agf *agf = agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t btreeblks; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; /* Update the AGF counters from the bnobt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); + cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa); if (error) goto err; @@ -275,8 +273,7 @@ xrep_agf_calc_from_btrees( agf->agf_longest = cpu_to_be32(raa.longest); /* Update the AGF counters from the cntbt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -332,16 +329,13 @@ xrep_agf_commit_new( pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_longest = be32_to_cpu(agf->agf_longest); - pag->pagf_levels[XFS_BTNUM_BNOi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); - pag->pagf_levels[XFS_BTNUM_CNTi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); - pag->pagf_levels[XFS_BTNUM_RMAPi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); + pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level); + pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); + pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGF. v5 filesystems only. */ @@ -494,12 +488,11 @@ xrep_agfl_walk_rmap( /* Strike out the blocks that are cross-linked according to the rmapbt. */ STATIC int xrep_agfl_check_extent( - uint64_t start, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xrep_agfl *ra = priv; - xfs_agblock_t agbno = start; xfs_agblock_t last_agbno = agbno + len - 1; int error; @@ -559,16 +552,14 @@ xrep_agfl_collect_blocks( goto out_bmp; /* Find all blocks currently being used by the bnobt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); + cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) goto out_bmp; /* Find all blocks currently being used by the cntbt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) @@ -647,8 +638,8 @@ struct xrep_agfl_fill { /* Fill the AGFL with whatever blocks are in this extent. */ static int xrep_agfl_fill( - uint64_t start, - uint64_t len, + uint32_t start, + uint32_t len, void *priv) { struct xrep_agfl_fill *af = priv; @@ -789,6 +780,9 @@ xrep_agfl( /* Dump any AGFL overflow. */ error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); + if (error) + goto err; + err: xagb_bitmap_destroy(&agfl_extents); return error; @@ -807,15 +801,57 @@ enum { XREP_AGI_MAX }; +#define XREP_AGI_LOOKUP_BATCH 32 + +struct xrep_agi { + struct xfs_scrub *sc; + + /* AGI buffer, tracked separately */ + struct xfs_buf *agi_bp; + + /* context for finding btree roots */ + struct xrep_find_ag_btree fab[XREP_AGI_MAX]; + + /* old AGI contents in case we have to revert */ + struct xfs_agi old_agi; + + /* bitmap of which inodes are unlinked */ + struct xagino_bitmap iunlink_bmp; + + /* heads of the unlinked inode bucket lists */ + xfs_agino_t iunlink_heads[XFS_AGI_UNLINKED_BUCKETS]; + + /* scratchpad for batched lookups of the radix tree */ + struct xfs_inode *lookup_batch[XREP_AGI_LOOKUP_BATCH]; + + /* Map of ino -> next_ino for unlinked inode processing. */ + struct xfarray *iunlink_next; + + /* Map of ino -> prev_ino for unlinked inode processing. */ + struct xfarray *iunlink_prev; +}; + +static void +xrep_agi_buf_cleanup( + void *buf) +{ + struct xrep_agi *ragi = buf; + + xfarray_destroy(ragi->iunlink_prev); + xfarray_destroy(ragi->iunlink_next); + xagino_bitmap_destroy(&ragi->iunlink_bmp); +} + /* * Given the inode btree roots described by *fab, find the roots, check them * for sanity, and pass the root data back out via *fab. */ STATIC int xrep_agi_find_btrees( - struct xfs_scrub *sc, - struct xrep_find_ag_btree *fab) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xrep_find_ag_btree *fab = ragi->fab; struct xfs_buf *agf_bp; struct xfs_mount *mp = sc->mp; int error; @@ -848,10 +884,11 @@ xrep_agi_find_btrees( */ STATIC void xrep_agi_init_header( - struct xfs_scrub *sc, - struct xfs_buf *agi_bp, - struct xfs_agi *old_agi) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; + struct xfs_agi *old_agi = &ragi->old_agi; struct xfs_agi *agi = agi_bp->b_addr; struct xfs_perag *pag = sc->sa.pag; struct xfs_mount *mp = sc->mp; @@ -867,10 +904,6 @@ xrep_agi_init_header( if (xfs_has_crc(mp)) uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); - /* We don't know how to fix the unlinked list yet. */ - memcpy(&agi->agi_unlinked, &old_agi->agi_unlinked, - sizeof(agi->agi_unlinked)); - /* Mark the incore AGF data stale until we're done fixing things. */ ASSERT(xfs_perag_initialised_agi(pag)); clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); @@ -879,10 +912,12 @@ xrep_agi_init_header( /* Set btree root information in an AGI. */ STATIC void xrep_agi_set_roots( - struct xfs_scrub *sc, - struct xfs_agi *agi, - struct xrep_find_ag_btree *fab) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_agi *agi = ragi->agi_bp->b_addr; + struct xrep_find_ag_btree *fab = ragi->fab; + agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root); agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height); @@ -895,9 +930,10 @@ xrep_agi_set_roots( /* Update the AGI counters. */ STATIC int xrep_agi_calc_from_btrees( - struct xfs_scrub *sc, - struct xfs_buf *agi_bp) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; struct xfs_btree_cur *cur; struct xfs_agi *agi = agi_bp->b_addr; struct xfs_mount *mp = sc->mp; @@ -905,12 +941,12 @@ xrep_agi_calc_from_btrees( xfs_agino_t freecount; int error; - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; if (xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; error = xfs_btree_count_blocks(cur, &blocks); if (error) @@ -923,10 +959,9 @@ xrep_agi_calc_from_btrees( agi->agi_freecount = cpu_to_be32(freecount); if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, - XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -940,12 +975,721 @@ err: return error; } +/* + * Record a forwards unlinked chain pointer from agino -> next_agino in our + * staging information. + */ +static inline int +xrep_iunlink_store_next( + struct xrep_agi *ragi, + xfs_agino_t agino, + xfs_agino_t next_agino) +{ + ASSERT(next_agino != 0); + + return xfarray_store(ragi->iunlink_next, agino, &next_agino); +} + +/* + * Record a backwards unlinked chain pointer from prev_ino <- agino in our + * staging information. + */ +static inline int +xrep_iunlink_store_prev( + struct xrep_agi *ragi, + xfs_agino_t agino, + xfs_agino_t prev_agino) +{ + ASSERT(prev_agino != 0); + + return xfarray_store(ragi->iunlink_prev, agino, &prev_agino); +} + +/* + * Given an @agino, look up the next inode in the iunlink bucket. Returns + * NULLAGINO if we're at the end of the chain, 0 if @agino is not in memory + * like it should be, or a per-AG inode number. + */ +static inline xfs_agino_t +xrep_iunlink_next( + struct xfs_scrub *sc, + xfs_agino_t agino) +{ + struct xfs_inode *ip; + + ip = xfs_iunlink_lookup(sc->sa.pag, agino); + if (!ip) + return 0; + + return ip->i_next_unlinked; +} + +/* + * Load the inode @agino into memory, set its i_prev_unlinked, and drop the + * inode so it can be inactivated. Returns NULLAGINO if we're at the end of + * the chain or if we should stop walking the chain due to corruption; or a + * per-AG inode number. + */ +STATIC xfs_agino_t +xrep_iunlink_reload_next( + struct xrep_agi *ragi, + xfs_agino_t prev_agino, + xfs_agino_t agino) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_inode *ip; + xfs_ino_t ino; + xfs_agino_t ret = NULLAGINO; + int error; + + ino = XFS_AGINO_TO_INO(sc->mp, sc->sa.pag->pag_agno, agino); + error = xchk_iget(ragi->sc, ino, &ip); + if (error) + return ret; + + trace_xrep_iunlink_reload_next(ip, prev_agino); + + /* If this is a linked inode, stop processing the chain. */ + if (VFS_I(ip)->i_nlink != 0) { + xrep_iunlink_store_next(ragi, agino, NULLAGINO); + goto rele; + } + + ip->i_prev_unlinked = prev_agino; + ret = ip->i_next_unlinked; + + /* + * Drop the inode reference that we just took. We hold the AGI, so + * this inode cannot move off the unlinked list and hence cannot be + * reclaimed. + */ +rele: + xchk_irele(sc, ip); + return ret; +} + +/* + * Walk an AGI unlinked bucket's list to load incore any unlinked inodes that + * still existed at mount time. This can happen if iunlink processing fails + * during log recovery. + */ +STATIC int +xrep_iunlink_walk_ondisk_bucket( + struct xrep_agi *ragi, + unsigned int bucket) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + xfs_agino_t prev_agino = NULLAGINO; + xfs_agino_t next_agino; + int error = 0; + + next_agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (next_agino != NULLAGINO) { + xfs_agino_t agino = next_agino; + + if (xchk_should_terminate(ragi->sc, &error)) + return error; + + trace_xrep_iunlink_walk_ondisk_bucket(sc->sa.pag, bucket, + prev_agino, agino); + + if (bucket != agino % XFS_AGI_UNLINKED_BUCKETS) + break; + + next_agino = xrep_iunlink_next(sc, agino); + if (!next_agino) + next_agino = xrep_iunlink_reload_next(ragi, prev_agino, + agino); + + prev_agino = agino; + } + + return 0; +} + +/* Decide if this is an unlinked inode in this AG. */ +STATIC bool +xrep_iunlink_igrab( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = pag->pag_mount; + + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + return false; + + if (!xfs_inode_on_unlinked_list(ip)) + return false; + + return true; +} + +/* + * Mark the given inode in the lookup batch in our unlinked inode bitmap, and + * remember if this inode is the start of the unlinked chain. + */ +STATIC int +xrep_iunlink_visit( + struct xrep_agi *ragi, + unsigned int batch_idx) +{ + struct xfs_mount *mp = ragi->sc->mp; + struct xfs_inode *ip = ragi->lookup_batch[batch_idx]; + xfs_agino_t agino; + unsigned int bucket; + int error; + + ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == ragi->sc->sa.pag->pag_agno); + ASSERT(xfs_inode_on_unlinked_list(ip)); + + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + bucket = agino % XFS_AGI_UNLINKED_BUCKETS; + + trace_xrep_iunlink_visit(ragi->sc->sa.pag, bucket, + ragi->iunlink_heads[bucket], ip); + + error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1); + if (error) + return error; + + if (ip->i_prev_unlinked == NULLAGINO) { + if (ragi->iunlink_heads[bucket] == NULLAGINO) + ragi->iunlink_heads[bucket] = agino; + } + + return 0; +} + +/* + * Find all incore unlinked inodes so that we can rebuild the unlinked buckets. + * We hold the AGI so there should not be any modifications to the unlinked + * list. + */ +STATIC int +xrep_iunlink_mark_incore( + struct xrep_agi *ragi) +{ + struct xfs_perag *pag = ragi->sc->sa.pag; + struct xfs_mount *mp = pag->pag_mount; + uint32_t first_index = 0; + bool done = false; + unsigned int nr_found = 0; + + do { + unsigned int i; + int error = 0; + + if (xchk_should_terminate(ragi->sc, &error)) + return error; + + rcu_read_lock(); + + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)&ragi->lookup_batch, first_index, + XREP_AGI_LOOKUP_BATCH); + if (!nr_found) { + rcu_read_unlock(); + return 0; + } + + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = ragi->lookup_batch[i]; + + if (done || !xrep_iunlink_igrab(pag, ip)) + ragi->lookup_batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that lead + * us to see this inode, so another lookup from the + * same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = true; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!ragi->lookup_batch[i]) + continue; + error = xrep_iunlink_visit(ragi, i); + if (error) + return error; + } + } while (!done); + + return 0; +} + +/* Mark all the unlinked ondisk inodes in this inobt record in iunlink_bmp. */ +STATIC int +xrep_iunlink_mark_ondisk_rec( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_inobt_rec_incore irec; + struct xrep_agi *ragi = priv; + struct xfs_scrub *sc = ragi->sc; + struct xfs_mount *mp = cur->bc_mp; + xfs_agino_t agino; + unsigned int i; + int error = 0; + + xfs_inobt_btrec_to_irec(mp, rec, &irec); + + for (i = 0, agino = irec.ir_startino; + i < XFS_INODES_PER_CHUNK; + i++, agino++) { + struct xfs_inode *ip; + unsigned int len = 1; + + /* Skip free inodes */ + if (XFS_INOBT_MASK(i) & irec.ir_free) + continue; + /* Skip inodes we've seen before */ + if (xagino_bitmap_test(&ragi->iunlink_bmp, agino, &len)) + continue; + + /* + * Skip incore inodes; these were already picked up by + * the _mark_incore step. + */ + rcu_read_lock(); + ip = radix_tree_lookup(&sc->sa.pag->pag_ici_root, agino); + rcu_read_unlock(); + if (ip) + continue; + + /* + * Try to look up this inode. If we can't get it, just move + * on because we haven't actually scrubbed the inobt or the + * inodes yet. + */ + error = xchk_iget(ragi->sc, + XFS_AGINO_TO_INO(mp, sc->sa.pag->pag_agno, + agino), + &ip); + if (error) + continue; + + trace_xrep_iunlink_reload_ondisk(ip); + + if (VFS_I(ip)->i_nlink == 0) + error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1); + xchk_irele(sc, ip); + if (error) + break; + } + + return error; +} + +/* + * Find ondisk inodes that are unlinked and not in cache, and mark them in + * iunlink_bmp. We haven't checked the inobt yet, so we don't error out if + * the btree is corrupt. + */ +STATIC void +xrep_iunlink_mark_ondisk( + struct xrep_agi *ragi) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; + struct xfs_btree_cur *cur; + int error; + + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); + error = xfs_btree_query_all(cur, xrep_iunlink_mark_ondisk_rec, ragi); + xfs_btree_del_cursor(cur, error); +} + +/* + * Walk an iunlink bucket's inode list. For each inode that should be on this + * chain, clear its entry in in iunlink_bmp because it's ok and we don't need + * to touch it further. + */ +STATIC int +xrep_iunlink_resolve_bucket( + struct xrep_agi *ragi, + unsigned int bucket) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_inode *ip; + xfs_agino_t prev_agino = NULLAGINO; + xfs_agino_t next_agino = ragi->iunlink_heads[bucket]; + int error = 0; + + while (next_agino != NULLAGINO) { + if (xchk_should_terminate(ragi->sc, &error)) + return error; + + /* Find the next inode in the chain. */ + ip = xfs_iunlink_lookup(sc->sa.pag, next_agino); + if (!ip) { + /* Inode not incore? Terminate the chain. */ + trace_xrep_iunlink_resolve_uncached(sc->sa.pag, + bucket, prev_agino, next_agino); + + next_agino = NULLAGINO; + break; + } + + if (next_agino % XFS_AGI_UNLINKED_BUCKETS != bucket) { + /* + * Inode is in the wrong bucket. Advance the list, + * but pretend we didn't see this inode. + */ + trace_xrep_iunlink_resolve_wronglist(sc->sa.pag, + bucket, prev_agino, next_agino); + + next_agino = ip->i_next_unlinked; + continue; + } + + if (!xfs_inode_on_unlinked_list(ip)) { + /* + * Incore inode doesn't think this inode is on an + * unlinked list. This is probably because we reloaded + * it from disk. Advance the list, but pretend we + * didn't see this inode; we'll fix that later. + */ + trace_xrep_iunlink_resolve_nolist(sc->sa.pag, + bucket, prev_agino, next_agino); + next_agino = ip->i_next_unlinked; + continue; + } + + trace_xrep_iunlink_resolve_ok(sc->sa.pag, bucket, prev_agino, + next_agino); + + /* + * Otherwise, this inode's unlinked pointers are ok. Clear it + * from the unlinked bitmap since we're done with it, and make + * sure the chain is still correct. + */ + error = xagino_bitmap_clear(&ragi->iunlink_bmp, next_agino, 1); + if (error) + return error; + + /* Remember the previous inode's next pointer. */ + if (prev_agino != NULLAGINO) { + error = xrep_iunlink_store_next(ragi, prev_agino, + next_agino); + if (error) + return error; + } + + /* Remember this inode's previous pointer. */ + error = xrep_iunlink_store_prev(ragi, next_agino, prev_agino); + if (error) + return error; + + /* Advance the list and remember this inode. */ + prev_agino = next_agino; + next_agino = ip->i_next_unlinked; + } + + /* Update the previous inode's next pointer. */ + if (prev_agino != NULLAGINO) { + error = xrep_iunlink_store_next(ragi, prev_agino, next_agino); + if (error) + return error; + } + + return 0; +} + +/* Reinsert this unlinked inode into the head of the staged bucket list. */ +STATIC int +xrep_iunlink_add_to_bucket( + struct xrep_agi *ragi, + xfs_agino_t agino) +{ + xfs_agino_t current_head; + unsigned int bucket; + int error; + + bucket = agino % XFS_AGI_UNLINKED_BUCKETS; + + /* Point this inode at the current head of the bucket list. */ + current_head = ragi->iunlink_heads[bucket]; + + trace_xrep_iunlink_add_to_bucket(ragi->sc->sa.pag, bucket, agino, + current_head); + + error = xrep_iunlink_store_next(ragi, agino, current_head); + if (error) + return error; + + /* Remember the head inode's previous pointer. */ + if (current_head != NULLAGINO) { + error = xrep_iunlink_store_prev(ragi, current_head, agino); + if (error) + return error; + } + + ragi->iunlink_heads[bucket] = agino; + return 0; +} + +/* Reinsert unlinked inodes into the staged iunlink buckets. */ +STATIC int +xrep_iunlink_add_lost_inodes( + uint32_t start, + uint32_t len, + void *priv) +{ + struct xrep_agi *ragi = priv; + int error; + + for (; len > 0; start++, len--) { + error = xrep_iunlink_add_to_bucket(ragi, start); + if (error) + return error; + } + + return 0; +} + +/* + * Figure out the iunlink bucket values and find inodes that need to be + * reinserted into the list. + */ +STATIC int +xrep_iunlink_rebuild_buckets( + struct xrep_agi *ragi) +{ + unsigned int i; + int error; + + /* + * Walk the ondisk AGI unlinked list to find inodes that are on the + * list but aren't in memory. This can happen if a past log recovery + * tried to clear the iunlinked list but failed. Our scan rebuilds the + * unlinked list using incore inodes, so we must load and link them + * properly. + */ + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + error = xrep_iunlink_walk_ondisk_bucket(ragi, i); + if (error) + return error; + } + + /* + * Record all the incore unlinked inodes in iunlink_bmp that we didn't + * find by walking the ondisk iunlink buckets. This shouldn't happen, + * but we can't risk forgetting an inode somewhere. + */ + error = xrep_iunlink_mark_incore(ragi); + if (error) + return error; + + /* + * If there are ondisk inodes that are unlinked and are not been loaded + * into cache, record them in iunlink_bmp. + */ + xrep_iunlink_mark_ondisk(ragi); + + /* + * Walk each iunlink bucket to (re)construct as much of the incore list + * as would be correct. For each inode that survives this step, mark + * it clear in iunlink_bmp; we're done with those inodes. + */ + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + error = xrep_iunlink_resolve_bucket(ragi, i); + if (error) + return error; + } + + /* + * Any unlinked inodes that we didn't find through the bucket list + * walk (or was ignored by the walk) must be inserted into the bucket + * list. Stage this in memory for now. + */ + return xagino_bitmap_walk(&ragi->iunlink_bmp, + xrep_iunlink_add_lost_inodes, ragi); +} + +/* Update i_next_iunlinked for the inode @agino. */ +STATIC int +xrep_iunlink_relink_next( + struct xrep_agi *ragi, + xfarray_idx_t idx, + xfs_agino_t next_agino) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_inode *ip; + xfarray_idx_t agino = idx - 1; + bool want_rele = false; + int error = 0; + + ip = xfs_iunlink_lookup(pag, agino); + if (!ip) { + xfs_ino_t ino; + xfs_agino_t prev_agino; + + /* + * No inode exists in cache. Load it off the disk so that we + * can reinsert it into the incore unlinked list. + */ + ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); + error = xchk_iget(sc, ino, &ip); + if (error) + return -EFSCORRUPTED; + + want_rele = true; + + /* Set the backward pointer since this just came off disk. */ + error = xfarray_load(ragi->iunlink_prev, agino, &prev_agino); + if (error) + goto out_rele; + + trace_xrep_iunlink_relink_prev(ip, prev_agino); + ip->i_prev_unlinked = prev_agino; + } + + /* Update the forward pointer. */ + if (ip->i_next_unlinked != next_agino) { + error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino); + if (error) + goto out_rele; + + trace_xrep_iunlink_relink_next(ip, next_agino); + ip->i_next_unlinked = next_agino; + } + +out_rele: + /* + * The iunlink lookup doesn't igrab because we hold the AGI buffer lock + * and the inode cannot be reclaimed. However, if we used iget to load + * a missing inode, we must irele it here. + */ + if (want_rele) + xchk_irele(sc, ip); + return error; +} + +/* Update i_prev_iunlinked for the inode @agino. */ +STATIC int +xrep_iunlink_relink_prev( + struct xrep_agi *ragi, + xfarray_idx_t idx, + xfs_agino_t prev_agino) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_inode *ip; + xfarray_idx_t agino = idx - 1; + bool want_rele = false; + int error = 0; + + ASSERT(prev_agino != 0); + + ip = xfs_iunlink_lookup(pag, agino); + if (!ip) { + xfs_ino_t ino; + xfs_agino_t next_agino; + + /* + * No inode exists in cache. Load it off the disk so that we + * can reinsert it into the incore unlinked list. + */ + ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); + error = xchk_iget(sc, ino, &ip); + if (error) + return -EFSCORRUPTED; + + want_rele = true; + + /* Set the forward pointer since this just came off disk. */ + error = xfarray_load(ragi->iunlink_prev, agino, &next_agino); + if (error) + goto out_rele; + + error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino); + if (error) + goto out_rele; + + trace_xrep_iunlink_relink_next(ip, next_agino); + ip->i_next_unlinked = next_agino; + } + + /* Update the backward pointer. */ + if (ip->i_prev_unlinked != prev_agino) { + trace_xrep_iunlink_relink_prev(ip, prev_agino); + ip->i_prev_unlinked = prev_agino; + } + +out_rele: + /* + * The iunlink lookup doesn't igrab because we hold the AGI buffer lock + * and the inode cannot be reclaimed. However, if we used iget to load + * a missing inode, we must irele it here. + */ + if (want_rele) + xchk_irele(sc, ip); + return error; +} + +/* Log all the iunlink updates we need to finish regenerating the AGI. */ +STATIC int +xrep_iunlink_commit( + struct xrep_agi *ragi) +{ + struct xfs_agi *agi = ragi->agi_bp->b_addr; + xfarray_idx_t idx = XFARRAY_CURSOR_INIT; + xfs_agino_t agino; + unsigned int i; + int error; + + /* Fix all the forward links */ + while ((error = xfarray_iter(ragi->iunlink_next, &idx, &agino)) == 1) { + error = xrep_iunlink_relink_next(ragi, idx, agino); + if (error) + return error; + } + + /* Fix all the back links */ + idx = XFARRAY_CURSOR_INIT; + while ((error = xfarray_iter(ragi->iunlink_prev, &idx, &agino)) == 1) { + error = xrep_iunlink_relink_prev(ragi, idx, agino); + if (error) + return error; + } + + /* Copy the staged iunlink buckets to the new AGI. */ + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + trace_xrep_iunlink_commit_bucket(ragi->sc->sa.pag, i, + be32_to_cpu(ragi->old_agi.agi_unlinked[i]), + ragi->iunlink_heads[i]); + + agi->agi_unlinked[i] = cpu_to_be32(ragi->iunlink_heads[i]); + } + + return 0; +} + /* Trigger reinitialization of the in-core data. */ STATIC int xrep_agi_commit_new( - struct xfs_scrub *sc, - struct xfs_buf *agi_bp) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; struct xfs_perag *pag; struct xfs_agi *agi = agi_bp->b_addr; @@ -962,39 +1706,64 @@ xrep_agi_commit_new( pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGI. */ int xrep_agi( - struct xfs_scrub *sc) + struct xfs_scrub *sc) { - struct xrep_find_ag_btree fab[XREP_AGI_MAX] = { - [XREP_AGI_INOBT] = { - .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_inobt_buf_ops, - .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, - }, - [XREP_AGI_FINOBT] = { - .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_finobt_buf_ops, - .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, - }, - [XREP_AGI_END] = { - .buf_ops = NULL - }, - }; - struct xfs_agi old_agi; - struct xfs_mount *mp = sc->mp; - struct xfs_buf *agi_bp; - struct xfs_agi *agi; - int error; + struct xrep_agi *ragi; + struct xfs_mount *mp = sc->mp; + char *descr; + unsigned int i; + int error; /* We require the rmapbt to rebuild anything. */ if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP; + sc->buf = kzalloc(sizeof(struct xrep_agi), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + ragi = sc->buf; + ragi->sc = sc; + + ragi->fab[XREP_AGI_INOBT] = (struct xrep_find_ag_btree){ + .rmap_owner = XFS_RMAP_OWN_INOBT, + .buf_ops = &xfs_inobt_buf_ops, + .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, + }; + ragi->fab[XREP_AGI_FINOBT] = (struct xrep_find_ag_btree){ + .rmap_owner = XFS_RMAP_OWN_INOBT, + .buf_ops = &xfs_finobt_buf_ops, + .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, + }; + ragi->fab[XREP_AGI_END] = (struct xrep_find_ag_btree){ + .buf_ops = NULL, + }; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) + ragi->iunlink_heads[i] = NULLAGINO; + + xagino_bitmap_init(&ragi->iunlink_bmp); + sc->buf_cleanup = xrep_agi_buf_cleanup; + + descr = xchk_xfile_ag_descr(sc, "iunlinked next pointers"); + error = xfarray_create(descr, 0, sizeof(xfs_agino_t), + &ragi->iunlink_next); + kfree(descr); + if (error) + return error; + + descr = xchk_xfile_ag_descr(sc, "iunlinked prev pointers"); + error = xfarray_create(descr, 0, sizeof(xfs_agino_t), + &ragi->iunlink_prev); + kfree(descr); + if (error) + return error; + /* * Make sure we have the AGI buffer, as scrub might have decided it * was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED. @@ -1002,14 +1771,17 @@ xrep_agi( error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL); + XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL); if (error) return error; - agi_bp->b_ops = &xfs_agi_buf_ops; - agi = agi_bp->b_addr; + ragi->agi_bp->b_ops = &xfs_agi_buf_ops; /* Find the AGI btree roots. */ - error = xrep_agi_find_btrees(sc, fab); + error = xrep_agi_find_btrees(ragi); + if (error) + return error; + + error = xrep_iunlink_rebuild_buckets(ragi); if (error) return error; @@ -1018,18 +1790,21 @@ xrep_agi( return error; /* Start rewriting the header and implant the btrees we found. */ - xrep_agi_init_header(sc, agi_bp, &old_agi); - xrep_agi_set_roots(sc, agi, fab); - error = xrep_agi_calc_from_btrees(sc, agi_bp); + xrep_agi_init_header(ragi); + xrep_agi_set_roots(ragi); + error = xrep_agi_calc_from_btrees(ragi); + if (error) + goto out_revert; + error = xrep_iunlink_commit(ragi); if (error) goto out_revert; /* Reinitialize in-core state. */ - return xrep_agi_commit_new(sc, agi_bp); + return xrep_agi_commit_new(ragi); out_revert: /* Mark the incore AGI state stale and revert the AGI. */ clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate); - memcpy(agi, &old_agi, sizeof(old_agi)); + memcpy(ragi->agi_bp->b_addr, &ragi->old_agi, sizeof(struct xfs_agi)); return error; } diff --git a/fs/xfs/scrub/agino_bitmap.h b/fs/xfs/scrub/agino_bitmap.h new file mode 100644 index 000000000000..56d7db5f1699 --- /dev/null +++ b/fs/xfs/scrub/agino_bitmap.h @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_AGINO_BITMAP_H__ +#define __XFS_SCRUB_AGINO_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_agino_t */ + +struct xagino_bitmap { + struct xbitmap32 aginobitmap; +}; + +static inline void xagino_bitmap_init(struct xagino_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->aginobitmap); +} + +static inline void xagino_bitmap_destroy(struct xagino_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->aginobitmap); +} + +static inline int xagino_bitmap_clear(struct xagino_bitmap *bitmap, + xfs_agino_t agino, unsigned int len) +{ + return xbitmap32_clear(&bitmap->aginobitmap, agino, len); +} + +static inline int xagino_bitmap_set(struct xagino_bitmap *bitmap, + xfs_agino_t agino, unsigned int len) +{ + return xbitmap32_set(&bitmap->aginobitmap, agino, len); +} + +static inline bool xagino_bitmap_test(struct xagino_bitmap *bitmap, + xfs_agino_t agino, unsigned int *len) +{ + return xbitmap32_test(&bitmap->aginobitmap, agino, len); +} + +static inline int xagino_bitmap_walk(struct xagino_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->aginobitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_AGINO_BITMAP_H__ */ diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 279af72b1671..d1b8a4997dd2 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -9,13 +9,16 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_btree.h" #include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "xfs_ag.h" +#include "scrub/repair.h" /* * Set us up to scrub free space btrees. @@ -24,10 +27,19 @@ int xchk_setup_ag_allocbt( struct xfs_scrub *sc) { + int error; + if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); - return xchk_setup_ag_btree(sc, false); + error = xchk_setup_ag_btree(sc, false); + if (error) + return error; + + if (xchk_could_repair(sc)) + return xrep_setup_ag_allocbt(sc); + + return 0; } /* Free space btree scrubber. */ @@ -127,7 +139,7 @@ xchk_allocbt_rec( struct xchk_alloc *ca = bs->private; xfs_alloc_btrec_to_irec(rec, &irec); - if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) { + if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -138,31 +150,27 @@ xchk_allocbt_rec( return 0; } -/* Scrub the freespace btrees for some AG. */ -STATIC int +/* Scrub one of the freespace btrees for some AG. */ +int xchk_allocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xchk_alloc ca = { }; struct xfs_btree_cur *cur; - cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur; - return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); -} - -int -xchk_bnobt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_BNO); -} + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_BNOBT: + cur = sc->sa.bno_cur; + break; + case XFS_SCRUB_TYPE_CNTBT: + cur = sc->sa.cnt_cur; + break; + default: + ASSERT(0); + return -EIO; + } -int -xchk_cntbt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_CNT); + return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); } /* xref check that the extent is not free */ diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c new file mode 100644 index 000000000000..30295898cc8a --- /dev/null +++ b/fs/xfs/scrub/alloc_repair.c @@ -0,0 +1,933 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_inode.h" +#include "xfs_refcount.h" +#include "xfs_extent_busy.h" +#include "xfs_health.h" +#include "xfs_bmap.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Free Space Btree Repair + * ======================= + * + * The reverse mappings are supposed to record all space usage for the entire + * AG. Therefore, we can recreate the free extent records in an AG by looking + * for gaps in the physical extents recorded in the rmapbt. These records are + * staged in @free_records. Identifying the gaps is more difficult on a + * reflink filesystem because rmap records are allowed to overlap. + * + * Because the final step of building a new index is to free the space used by + * the old index, repair needs to find that space. Unfortunately, all + * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share + * the same rmapbt owner code (OWN_AG), so this is not straightforward. + * + * The scan of the reverse mapping information records the space used by OWN_AG + * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While + * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to + * record all visited rmap btree blocks and all blocks owned by the AGFL. + * + * After that is where the definitions of old_allocbt_blocks shifts. This + * expression identifies possible former bnobt/cntbt blocks: + * + * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks); + * + * Substituting from above definitions, that becomes: + * + * old_allocbt_blocks & ~not_allocbt_blocks + * + * The OWN_AG bitmap itself isn't needed after this point, so what we really do + * instead is: + * + * old_allocbt_blocks &= ~not_allocbt_blocks; + * + * After this point, @old_allocbt_blocks is a bitmap of alleged former + * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first + * parameter in place to avoid copying records around. + * + * Next, some of the space described by @free_records are diverted to the newbt + * reservation and used to format new btree blocks. The remaining records are + * written to the new btree indices. We reconstruct both bnobt and cntbt at + * the same time since we've already done all the work. + * + * We use the prefix 'xrep_abt' here because we regenerate both free space + * allocation btrees at the same time. + */ + +struct xrep_abt { + /* Blocks owned by the rmapbt or the agfl. */ + struct xagb_bitmap not_allocbt_blocks; + + /* All OWN_AG blocks. */ + struct xagb_bitmap old_allocbt_blocks; + + /* + * New bnobt information. All btree block reservations are added to + * the reservation list in new_bnobt. + */ + struct xrep_newbt new_bnobt; + + /* new cntbt information */ + struct xrep_newbt new_cntbt; + + /* Free space extents. */ + struct xfarray *free_records; + + struct xfs_scrub *sc; + + /* Number of non-null records in @free_records. */ + uint64_t nr_real_records; + + /* get_records()'s position in the free space record array. */ + xfarray_idx_t array_cur; + + /* + * Next block we anticipate seeing in the rmap records. If the next + * rmap record is greater than next_agbno, we have found unused space. + */ + xfs_agblock_t next_agbno; + + /* Number of free blocks in this AG. */ + xfs_agblock_t nr_blocks; + + /* Longest free extent we found in the AG. */ + xfs_agblock_t longest; +}; + +/* Set up to repair AG free space btrees. */ +int +xrep_setup_ag_allocbt( + struct xfs_scrub *sc) +{ + unsigned int busy_gen; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. + */ + busy_gen = READ_ONCE(sc->sa.pag->pagb_gen); + if (xfs_extent_busy_list_empty(sc->sa.pag)) + return 0; + + return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0); +} + +/* Check for any obvious conflicts in the free extent. */ +STATIC int +xrep_abt_check_free_ext( + struct xfs_scrub *sc, + const struct xfs_alloc_rec_incore *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->ar_startblock, rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be shared or CoW staging. */ + if (sc->sa.refc_cur) { + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_SHARED, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_COW, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Stash a free space record for all the space since the last bno we found + * all the way up to @end. + */ +static int +xrep_abt_stash( + struct xrep_abt *ra, + xfs_agblock_t end) +{ + struct xfs_alloc_rec_incore arec = { + .ar_startblock = ra->next_agbno, + .ar_blockcount = end - ra->next_agbno, + }; + struct xfs_scrub *sc = ra->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xrep_abt_check_free_ext(ra->sc, &arec); + if (error) + return error; + + trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec); + + error = xfarray_append(ra->free_records, &arec); + if (error) + return error; + + ra->nr_blocks += arec.ar_blockcount; + return 0; +} + +/* Record extents that aren't in use from gaps in the rmap records. */ +STATIC int +xrep_abt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_abt *ra = priv; + int error; + + /* Record all the OWN_AG blocks... */ + if (rec->rm_owner == XFS_RMAP_OWN_AG) { + error = xagb_bitmap_set(&ra->old_allocbt_blocks, + rec->rm_startblock, rec->rm_blockcount); + if (error) + return error; + } + + /* ...and all the rmapbt blocks... */ + error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur); + if (error) + return error; + + /* ...and all the free space. */ + if (rec->rm_startblock > ra->next_agbno) { + error = xrep_abt_stash(ra, rec->rm_startblock); + if (error) + return error; + } + + /* + * rmap records can overlap on reflink filesystems, so project + * next_agbno as far out into the AG space as we currently know about. + */ + ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* Collect an AGFL block for the not-to-release list. */ +static int +xrep_abt_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1); +} + +/* + * Compare two free space extents by block number. We want to sort in order of + * increasing block number. + */ +static int +xrep_bnobt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_startblock > bp->ar_startblock) + return 1; + else if (ap->ar_startblock < bp->ar_startblock) + return -1; + return 0; +} + +/* + * Re-sort the free extents by block number so that we can put the records into + * the bnobt in the correct order. Make sure the records do not overlap in + * physical space. + */ +STATIC int +xrep_bnobt_sort_records( + struct xrep_abt *ra) +{ + struct xfs_alloc_rec_incore arec; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0); + if (error) + return error; + + while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) { + if (arec.ar_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = arec.ar_startblock + arec.ar_blockcount; + } + + return error; +} + +/* + * Compare two free space extents by length and then block number. We want + * to sort first in order of increasing length and then in order of increasing + * block number. + */ +static int +xrep_cntbt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_blockcount > bp->ar_blockcount) + return 1; + else if (ap->ar_blockcount < bp->ar_blockcount) + return -1; + return xrep_bnobt_extent_cmp(a, b); +} + +/* + * Sort the free extents by length so so that we can put the records into the + * cntbt in the correct order. Don't let userspace kill us if we're resorting + * after allocating btree blocks. + */ +STATIC int +xrep_cntbt_sort_records( + struct xrep_abt *ra, + bool is_resort) +{ + return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp, + is_resort ? 0 : XFARRAY_SORT_KILLABLE); +} + +/* + * Iterate all reverse mappings to find (1) the gaps between rmap records (all + * unowned space), (2) the OWN_AG extents (which encompass the free space + * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL + * blocks. The free space is (1) + (2) - (3) - (4). + */ +STATIC int +xrep_abt_find_freespace( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_buf *agfl_bp; + xfs_agblock_t agend; + int error; + + xagb_bitmap_init(&ra->not_allocbt_blocks); + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Iterate all the reverse mappings to find gaps in the physical + * mappings, all the OWN_AG blocks, and all the rmapbt extents. + */ + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra); + if (error) + goto err; + + /* Insert a record for space between the last rmap and EOAG. */ + agend = be32_to_cpu(agf->agf_length); + if (ra->next_agbno < agend) { + error = xrep_abt_stash(ra, agend); + if (error) + goto err; + } + + /* Collect all the AGFL blocks. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + goto err; + + error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra); + if (error) + goto err_agfl; + + /* Compute the old bnobt/cntbt blocks. */ + error = xagb_bitmap_disunion(&ra->old_allocbt_blocks, + &ra->not_allocbt_blocks); + if (error) + goto err_agfl; + + ra->nr_real_records = xfarray_length(ra->free_records); +err_agfl: + xfs_trans_brelse(sc->tp, agfl_bp); +err: + xchk_ag_btcur_free(&sc->sa); + xagb_bitmap_destroy(&ra->not_allocbt_blocks); + return error; +} + +/* + * We're going to use the observed free space records to reserve blocks for the + * new free space btrees, so we play an iterative game where we try to converge + * on the number of blocks we need: + * + * 1. Estimate how many blocks we'll need to store the records. + * 2. If the first free record has more blocks than we need, we're done. + * We will have to re-sort the records prior to building the cntbt. + * 3. If that record has exactly the number of blocks we need, null out the + * record. We're done. + * 4. Otherwise, we still need more blocks. Null out the record, subtract its + * length from the number of blocks we need, and go back to step 1. + * + * Fortunately, we don't have to do any transaction work to play this game, so + * we don't have to tear down the staging cursors. + */ +STATIC int +xrep_abt_reserve_space( + struct xrep_abt *ra, + struct xfs_btree_cur *bno_cur, + struct xfs_btree_cur *cnt_cur, + bool *needs_resort) +{ + struct xfs_scrub *sc = ra->sc; + xfarray_idx_t record_nr; + unsigned int allocated = 0; + int error = 0; + + record_nr = xfarray_length(ra->free_records) - 1; + do { + struct xfs_alloc_rec_incore arec; + uint64_t required; + unsigned int desired; + unsigned int len; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(cnt_cur, + &ra->new_cntbt.bload, ra->nr_real_records); + if (error) + break; + + error = xfs_btree_bload_compute_geometry(bno_cur, + &ra->new_bnobt.bload, ra->nr_real_records); + if (error) + break; + + /* How many btree blocks do we need to store all records? */ + required = ra->new_bnobt.bload.nr_blocks + + ra->new_cntbt.bload.nr_blocks; + ASSERT(required < INT_MAX); + + /* If we've reserved enough blocks, we're done. */ + if (allocated >= required) + break; + + desired = required - allocated; + + /* We need space but there's none left; bye! */ + if (ra->nr_real_records == 0) { + error = -ENOSPC; + break; + } + + /* Grab the first record from the list. */ + error = xfarray_load(ra->free_records, record_nr, &arec); + if (error) + break; + + ASSERT(arec.ar_blockcount <= UINT_MAX); + len = min_t(unsigned int, arec.ar_blockcount, desired); + + trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno, + arec.ar_startblock, len, XFS_RMAP_OWN_AG); + + error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag, + arec.ar_startblock, len); + if (error) + break; + allocated += len; + ra->nr_blocks -= len; + + if (arec.ar_blockcount > desired) { + /* + * Record has more space than we need. The number of + * free records doesn't change, so shrink the free + * record, inform the caller that the records are no + * longer sorted by length, and exit. + */ + arec.ar_startblock += desired; + arec.ar_blockcount -= desired; + error = xfarray_store(ra->free_records, record_nr, + &arec); + if (error) + break; + + *needs_resort = true; + return 0; + } + + /* + * We're going to use up the entire record, so unset it and + * move on to the next one. This changes the number of free + * records (but doesn't break the sorting order), so we must + * go around the loop once more to re-run _bload_init. + */ + error = xfarray_unset(ra->free_records, record_nr); + if (error) + break; + ra->nr_real_records--; + record_nr--; + } while (1); + + return error; +} + +STATIC int +xrep_abt_dispose_one( + struct xrep_abt *ra, + struct xrep_newbt_resv *resv) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + xfs_agblock_t free_agbno = resv->agbno + resv->used; + xfs_extlen_t free_aglen = resv->len - resv->used; + int error; + + ASSERT(pag == resv->pag); + + /* Add a deferred rmap for each extent we used. */ + if (resv->used > 0) + xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno, + resv->used, XFS_RMAP_OWN_AG); + + /* + * For each reserved btree block we didn't use, add it to the free + * space btree. We didn't touch fdblocks when we reserved them, so + * we don't touch it now. + */ + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, ra->new_bnobt.oinfo.oi_owner); + + error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen, + &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true); + if (error) + return error; + + return xrep_defer_finish(sc); +} + +/* + * Deal with all the space we reserved. Blocks that were allocated for the + * free space btrees need to have a (deferred) rmap added for the OWN_AG + * allocation, and blocks that didn't get used can be freed via the usual + * (deferred) means. + */ +STATIC void +xrep_abt_dispose_reservations( + struct xrep_abt *ra, + int error) +{ + struct xrep_newbt_resv *resv, *n; + + if (error) + goto junkit; + + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + error = xrep_abt_dispose_one(ra, resv); + if (error) + goto junkit; + } + +junkit: + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + xfs_perag_put(resv->pag); + list_del(&resv->list); + kfree(resv); + } + + xrep_newbt_cancel(&ra->new_bnobt); + xrep_newbt_cancel(&ra->new_cntbt); +} + +/* Retrieve free space data for bulk load. */ +STATIC int +xrep_abt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a; + struct xrep_abt *ra = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load_next(ra->free_records, &ra->array_cur, + arec); + if (error) + return error; + + ra->longest = max(ra->longest, arec->ar_blockcount); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_abt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr); +} + +/* + * Reset the AGF counters to reflect the free space btrees that we just + * rebuilt, then reinitialize the per-AG data. + */ +STATIC int +xrep_abt_reset_counters( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + unsigned int freesp_btreeblks = 0; + + /* + * Compute the contribution to agf_btreeblks for the new free space + * btrees. This is the computed btree size minus anything we didn't + * use. + */ + freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1; + freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1; + + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt); + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt); + + /* + * The AGF header contains extra information related to the free space + * btrees, so we must update those fields here. + */ + agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks + + (be32_to_cpu(agf->agf_rmap_blocks) - 1)); + agf->agf_freeblks = cpu_to_be32(ra->nr_blocks); + agf->agf_longest = cpu_to_be32(ra->longest); + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS | + XFS_AGF_LONGEST | + XFS_AGF_FREEBLKS); + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the allocbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_bno_level = pag->pagf_bno_level; + pag->pagf_repair_cnt_level = pag->pagf_cnt_level; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected free space information to stage new free space btrees. + * If this is successful we'll return with the new btree root + * information logged to the repair transaction but not yet committed. + */ +STATIC int +xrep_abt_build_new_trees( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + struct xfs_perag *pag = sc->sa.pag; + bool needs_resort = false; + int error; + + /* + * Sort the free extents by length so that we can set up the free space + * btrees in as few extents as possible. This reduces the amount of + * deferred rmap / free work we have to do at the end. + */ + error = xrep_cntbt_sort_records(ra, false); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + xrep_newbt_init_bare(&ra->new_bnobt, sc); + xrep_newbt_init_bare(&ra->new_cntbt, sc); + + ra->new_bnobt.bload.get_records = xrep_abt_get_records; + ra->new_cntbt.bload.get_records = xrep_abt_get_records; + + ra->new_bnobt.bload.claim_block = xrep_abt_claim_block; + ra->new_cntbt.bload.claim_block = xrep_abt_claim_block; + + /* Allocate cursors for the staged btrees. */ + bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag); + xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake); + + cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag); + xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake); + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btrees. */ + error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort); + if (error) + goto err_cur; + + /* + * If we need to re-sort the free extents by length, do so so that we + * can put the records into the cntbt in the correct order. + */ + if (needs_resort) { + error = xrep_cntbt_sort_records(ra, needs_resort); + if (error) + goto err_cur; + } + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the alternate incore btree + * height so that we don't trip the verifiers when writing the new + * btree blocks to disk. + */ + pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height; + pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height; + + /* Load the free space by length tree. */ + ra->array_cur = XFARRAY_CURSOR_INIT; + ra->longest = 0; + error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra); + if (error) + goto err_levels; + + error = xrep_bnobt_sort_records(ra); + if (error) + goto err_levels; + + /* Load the free space by block number tree. */ + ra->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra); + if (error) + goto err_levels; + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(bno_cur, 0); + xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(cnt_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_abt_reset_counters(ra); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + xrep_abt_dispose_reservations(ra, error); + + return xrep_roll_ag_trans(sc); + +err_levels: + pag->pagf_repair_bno_level = 0; + pag->pagf_repair_cnt_level = 0; +err_cur: + xfs_btree_del_cursor(cnt_cur, error); + xfs_btree_del_cursor(bno_cur, error); +err_newbt: + xrep_abt_dispose_reservations(ra, error); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_abt_remove_old_trees( + struct xrep_abt *ra) +{ + struct xfs_perag *pag = ra->sc->sa.pag; + int error; + + /* Free the old btree blocks if they're not in use. */ + error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks, + &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE); + if (error) + return error; + + /* + * Now that we've zapped all the old allocbt blocks we can turn off + * the alternate height mechanism. + */ + pag->pagf_repair_bno_level = 0; + pag->pagf_repair_cnt_level = 0; + return 0; +} + +/* Repair the freespace btrees for some AG. */ +int +xrep_allocbt( + struct xfs_scrub *sc) +{ + struct xrep_abt *ra; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS); + if (!ra) + return -ENOMEM; + ra->sc = sc; + + /* We rebuild both data structures. */ + sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. In theory we cleared this before we started, but + * let's not risk the filesystem. + */ + if (!xfs_extent_busy_list_empty(sc->sa.pag)) { + error = -EDEADLOCK; + goto out_ra; + } + + /* Set up enough storage to handle maximally fragmented free space. */ + descr = xchk_xfile_ag_descr(sc, "free space records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2, + sizeof(struct xfs_alloc_rec_incore), + &ra->free_records); + kfree(descr); + if (error) + goto out_ra; + + /* Collect the free space data and find the old btree blocks. */ + xagb_bitmap_init(&ra->old_allocbt_blocks); + error = xrep_abt_find_freespace(ra); + if (error) + goto out_bitmap; + + /* Rebuild the free space information. */ + error = xrep_abt_build_new_trees(ra); + if (error) + goto out_bitmap; + + /* Kill the old trees. */ + error = xrep_abt_remove_old_trees(ra); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ra->old_allocbt_blocks); + xfarray_destroy(ra->free_records); +out_ra: + kfree(ra); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_allocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. + */ + sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT; + error = xchk_allocbt(sc); + if (error) + goto out; + + sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT; + error = xchk_allocbt(sc); +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 147babe738d2..708334f9b2bd 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -10,16 +10,20 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_inode.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_attr_sf.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/attr.h" +#include "scrub/listxattr.h" +#include "scrub/repair.h" /* Free the buffers linked from the xattr buffer. */ static void @@ -35,6 +39,8 @@ xchk_xattr_buf_cleanup( kvfree(ab->value); ab->value = NULL; ab->value_sz = 0; + kvfree(ab->name); + ab->name = NULL; } /* @@ -65,7 +71,7 @@ xchk_xattr_want_freemap( * reallocating the buffer if necessary. Buffer contents are not preserved * across a reallocation. */ -static int +int xchk_setup_xattr_buf( struct xfs_scrub *sc, size_t value_size) @@ -95,6 +101,12 @@ xchk_setup_xattr_buf( return -ENOMEM; } + if (xchk_could_repair(sc)) { + ab->name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS); + if (!ab->name) + return -ENOMEM; + } + resize_value: if (ab->value_sz >= value_size) return 0; @@ -121,6 +133,12 @@ xchk_setup_xattr( { int error; + if (xchk_could_repair(sc)) { + error = xrep_setup_xattr(sc); + if (error) + return error; + } + /* * We failed to get memory while checking attrs, so this time try to * get all the memory we're ever going to need. Allocate the buffer @@ -137,80 +155,85 @@ xchk_setup_xattr( /* Extended Attributes */ -struct xchk_xattr { - struct xfs_attr_list_context context; - struct xfs_scrub *sc; -}; - /* * Check that an extended attribute key can be looked up by hash. * - * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked) - * to call this function for every attribute key in an inode. Once - * we're here, we load the attribute value to see if any errors happen, - * or if we get more or less data than we expected. + * We use the extended attribute walk helper to call this function for every + * attribute key in an inode. Once we're here, we load the attribute value to + * see if any errors happen, or if we get more or less data than we expected. */ -static void -xchk_xattr_listent( - struct xfs_attr_list_context *context, - int flags, - unsigned char *name, - int namelen, - int valuelen) +static int +xchk_xattr_actor( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) { struct xfs_da_args args = { - .op_flags = XFS_DA_OP_NOTIME, - .attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK, - .geo = context->dp->i_mount->m_attr_geo, + .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK, + .geo = sc->mp->m_attr_geo, .whichfork = XFS_ATTR_FORK, - .dp = context->dp, + .dp = ip, .name = name, .namelen = namelen, - .hashval = xfs_da_hashname(name, namelen), - .trans = context->tp, + .trans = sc->tp, .valuelen = valuelen, + .owner = ip->i_ino, }; struct xchk_xattr_buf *ab; - struct xchk_xattr *sx; int error = 0; - sx = container_of(context, struct xchk_xattr, context); - ab = sx->sc->buf; + ab = sc->buf; - if (xchk_should_terminate(sx->sc, &error)) { - context->seen_enough = error; - return; - } + if (xchk_should_terminate(sc, &error)) + return error; - if (flags & ~XFS_ATTR_ONDISK_MASK) { - xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - goto fail_xref; + if (attr_flags & ~XFS_ATTR_ONDISK_MASK) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + return -ECANCELED; } - if (flags & XFS_ATTR_INCOMPLETE) { + if (attr_flags & XFS_ATTR_INCOMPLETE) { /* Incomplete attr key, just mark the inode for preening. */ - xchk_ino_set_preen(sx->sc, context->dp->i_ino); - return; + xchk_ino_set_preen(sc, ip->i_ino); + return 0; } /* Does this name make sense? */ - if (!xfs_attr_namecheck(flags, name, namelen)) { - xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - goto fail_xref; + if (!xfs_attr_namecheck(attr_flags, name, namelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + return -ECANCELED; + } + + /* Check parent pointer record. */ + if ((attr_flags & XFS_ATTR_PARENT) && + !xfs_parent_valuecheck(sc->mp, value, valuelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + return -ECANCELED; } /* - * Try to allocate enough memory to extrat the attr value. If that - * doesn't work, we overload the seen_enough variable to convey - * the error message back to the main scrub function. + * Try to allocate enough memory to extract the attr value. If that + * doesn't work, return -EDEADLOCK as a signal to try again with a + * maximally sized buffer. */ - error = xchk_setup_xattr_buf(sx->sc, valuelen); + error = xchk_setup_xattr_buf(sc, valuelen); if (error == -ENOMEM) error = -EDEADLOCK; - if (error) { - context->seen_enough = error; - return; - } + if (error) + return error; + + /* + * Parent pointers are matched on attr name and value, so we must + * supply the xfs_parent_rec here when confirming that the dabtree + * indexing works correctly. + */ + if (attr_flags & XFS_ATTR_PARENT) + memcpy(ab->value, value, valuelen); args.value = ab->value; @@ -219,20 +242,18 @@ xchk_xattr_listent( * through the dabtree indexing and that remote value retrieval also * works correctly. */ + xfs_attr_sethash(&args); error = xfs_attr_get_ilocked(&args); /* ENODATA means the hash lookup failed and the attr is bad */ if (error == -ENODATA) error = -EFSCORRUPTED; - if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, + if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, args.blkno, &error)) - goto fail_xref; + return error; if (args.valuelen != valuelen) - xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, - args.blkno); -fail_xref: - if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - context->seen_enough = 1; - return; + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + + return 0; } /* @@ -242,7 +263,7 @@ fail_xref: * Within a char, the lowest bit of the char represents the byte with * the smallest address */ -STATIC bool +bool xchk_xattr_set_map( struct xfs_scrub *sc, unsigned long *map, @@ -399,6 +420,17 @@ xchk_xattr_block( xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); hdrsize = xfs_attr3_leaf_hdr_size(leaf); + /* + * Empty xattr leaf blocks mapped at block 0 are probably a byproduct + * of a race between setxattr and a log shutdown. Anywhere else in the + * attr fork is a corruption. + */ + if (leafhdr.count == 0) { + if (blk->blkno == 0) + xchk_da_set_preen(ds, level); + else + xchk_da_set_corrupt(ds, level); + } if (leafhdr.usedbytes > mp->m_attr_geo->blksize) xchk_da_set_corrupt(ds, level); if (leafhdr.firstused > mp->m_attr_geo->blksize) @@ -407,6 +439,8 @@ xchk_xattr_block( xchk_da_set_corrupt(ds, level); if (!xchk_xattr_set_map(ds->sc, ab->usedmap, 0, hdrsize)) xchk_da_set_corrupt(ds, level); + if (leafhdr.holes) + xchk_da_set_preen(ds, level); if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out; @@ -504,7 +538,10 @@ xchk_xattr_rec( xchk_da_set_corrupt(ds, level); goto out; } - calc_hash = xfs_da_hashname(lentry->nameval, lentry->namelen); + calc_hash = xfs_attr_hashval(mp, ent->flags, lentry->nameval, + lentry->namelen, + lentry->nameval + lentry->namelen, + be16_to_cpu(lentry->valuelen)); } else { rentry = (struct xfs_attr_leaf_name_remote *) (((char *)bp->b_addr) + nameidx); @@ -512,7 +549,13 @@ xchk_xattr_rec( xchk_da_set_corrupt(ds, level); goto out; } - calc_hash = xfs_da_hashname(rentry->name, rentry->namelen); + if (ent->flags & XFS_ATTR_PARENT) { + xchk_da_set_corrupt(ds, level); + goto out; + } + calc_hash = xfs_attr_hashval(mp, ent->flags, rentry->name, + rentry->namelen, NULL, + be32_to_cpu(rentry->valuelen)); } if (calc_hash != hash) xchk_da_set_corrupt(ds, level); @@ -527,28 +570,23 @@ xchk_xattr_check_sf( struct xfs_scrub *sc) { struct xchk_xattr_buf *ab = sc->buf; - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; + struct xfs_ifork *ifp = &sc->ip->i_af; + struct xfs_attr_sf_hdr *sf = ifp->if_data; + struct xfs_attr_sf_entry *sfe = xfs_attr_sf_firstentry(sf); struct xfs_attr_sf_entry *next; - struct xfs_ifork *ifp; - unsigned char *end; + unsigned char *end = ifp->if_data + ifp->if_bytes; int i; int error = 0; - ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); - bitmap_zero(ab->usedmap, ifp->if_bytes); - sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data; - end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes; - xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr)); + xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*sf)); - sfe = &sf->list[0]; if ((unsigned char *)sfe > end) { xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); return 0; } - for (i = 0; i < sf->hdr.count; i++) { + for (i = 0; i < sf->count; i++) { unsigned char *name = sfe->nameval; unsigned char *value = &sfe->nameval[sfe->namelen]; @@ -602,16 +640,6 @@ int xchk_xattr( struct xfs_scrub *sc) { - struct xchk_xattr sx = { - .sc = sc, - .context = { - .dp = sc->ip, - .tp = sc->tp, - .resynch = 1, - .put_listent = xchk_xattr_listent, - .allow_incomplete = true, - }, - }; xfs_dablk_t last_checked = -1U; int error = 0; @@ -640,12 +668,6 @@ xchk_xattr( /* * Look up every xattr in this file by name and hash. * - * Use the backend implementation of xfs_attr_list to call - * xchk_xattr_listent on every attribute key in this inode. - * In other words, we use the same iterator/callback mechanism - * that listattr uses to scrub extended attributes, though in our - * _listent function, we check the value of the attribute. - * * The VFS only locks i_rwsem when modifying attrs, so keep all * three locks held because that's the only way to ensure we're * the only thread poking into the da btree. We traverse the da @@ -653,13 +675,9 @@ xchk_xattr( * iteration, which doesn't really follow the usual buffer * locking order. */ - error = xfs_attr_list_ilocked(&sx.context); + error = xchk_xattr_walk(sc, sc->ip, xchk_xattr_actor, NULL, NULL); if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) return error; - /* Did our listent function try to return any errors? */ - if (sx.context.seen_enough < 0) - return sx.context.seen_enough; - return 0; } diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h index 48fd9402c432..7db58af56646 100644 --- a/fs/xfs/scrub/attr.h +++ b/fs/xfs/scrub/attr.h @@ -16,9 +16,16 @@ struct xchk_xattr_buf { /* Bitmap of free space in xattr leaf blocks. */ unsigned long *freemap; + /* Memory buffer used to hold salvaged xattr names. */ + unsigned char *name; + /* Memory buffer used to extract xattr values. */ void *value; size_t value_sz; }; +bool xchk_xattr_set_map(struct xfs_scrub *sc, unsigned long *map, + unsigned int start, unsigned int len); +int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size); + #endif /* __XFS_SCRUB_ATTR_H__ */ diff --git a/fs/xfs/scrub/attr_repair.c b/fs/xfs/scrub/attr_repair.c new file mode 100644 index 000000000000..c7eb94069caf --- /dev/null +++ b/fs/xfs/scrub/attr_repair.c @@ -0,0 +1,1663 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_exchmaps.h" +#include "xfs_exchrange.h" +#include "xfs_acl.h" +#include "xfs_parent.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/attr.h" +#include "scrub/reap.h" +#include "scrub/attr_repair.h" + +/* + * Extended Attribute Repair + * ========================= + * + * We repair extended attributes by reading the attr leaf blocks looking for + * attributes entries that look salvageable (name passes verifiers, value can + * be retrieved, etc). Each extended attribute worth salvaging is stashed in + * memory, and the stashed entries are periodically replayed into a temporary + * file to constrain memory use. Batching the construction of the temporary + * extended attribute structure in this fashion reduces lock cycling of the + * file being repaired and the temporary file. + * + * When salvaging completes, the remaining stashed attributes are replayed to + * the temporary file. An atomic file contents exchange is used to commit the + * new xattr blocks to the file being repaired. This will disrupt attrmulti + * cursors. + */ + +struct xrep_xattr_key { + /* Cookie for retrieval of the xattr name. */ + xfblob_cookie name_cookie; + + /* Cookie for retrieval of the xattr value. */ + xfblob_cookie value_cookie; + + /* XFS_ATTR_* flags */ + int flags; + + /* Length of the value and name. */ + uint32_t valuelen; + uint16_t namelen; +}; + +/* + * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write + * them to the temp file. + */ +#define XREP_XATTR_MAX_STASH_BYTES (PAGE_SIZE * 8) + +struct xrep_xattr { + struct xfs_scrub *sc; + + /* Information for exchanging attr fork mappings at the end. */ + struct xrep_tempexch tx; + + /* xattr keys */ + struct xfarray *xattr_records; + + /* xattr values */ + struct xfblob *xattr_blobs; + + /* Number of attributes that we are salvaging. */ + unsigned long long attrs_found; + + /* Can we flush stashed attrs to the tempfile? */ + bool can_flush; + + /* Did the live update fail, and hence the repair is now out of date? */ + bool live_update_aborted; + + /* Lock protecting parent pointer updates */ + struct mutex lock; + + /* Fixed-size array of xrep_xattr_pptr structures. */ + struct xfarray *pptr_recs; + + /* Blobs containing parent pointer names. */ + struct xfblob *pptr_names; + + /* Hook to capture parent pointer updates. */ + struct xfs_dir_hook dhook; + + /* Scratch buffer for capturing parent pointers. */ + struct xfs_da_args pptr_args; + + /* Name buffer */ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; +}; + +/* Create a parent pointer in the tempfile. */ +#define XREP_XATTR_PPTR_ADD (1) + +/* Remove a parent pointer from the tempfile. */ +#define XREP_XATTR_PPTR_REMOVE (2) + +/* A stashed parent pointer update. */ +struct xrep_xattr_pptr { + /* Cookie for retrieval of the pptr name. */ + xfblob_cookie name_cookie; + + /* Parent pointer record. */ + struct xfs_parent_rec pptr_rec; + + /* Length of the pptr name. */ + uint8_t namelen; + + /* XREP_XATTR_PPTR_{ADD,REMOVE} */ + uint8_t action; +}; + +/* Set up to recreate the extended attributes. */ +int +xrep_setup_xattr( + struct xfs_scrub *sc) +{ + if (xfs_has_parent(sc->mp)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + return xrep_tempfile_create(sc, S_IFREG); +} + +/* + * Decide if we want to salvage this attribute. We don't bother with + * incomplete or oversized keys or values. The @value parameter can be null + * for remote attrs. + */ +STATIC int +xrep_xattr_want_salvage( + struct xrep_xattr *rx, + unsigned int attr_flags, + const void *name, + int namelen, + const void *value, + int valuelen) +{ + if (attr_flags & XFS_ATTR_INCOMPLETE) + return false; + if (namelen > XATTR_NAME_MAX || namelen <= 0) + return false; + if (!xfs_attr_namecheck(attr_flags, name, namelen)) + return false; + if (valuelen > XATTR_SIZE_MAX || valuelen < 0) + return false; + if (attr_flags & XFS_ATTR_PARENT) + return xfs_parent_valuecheck(rx->sc->mp, value, valuelen); + + return true; +} + +/* Allocate an in-core record to hold xattrs while we rebuild the xattr data. */ +STATIC int +xrep_xattr_salvage_key( + struct xrep_xattr *rx, + int flags, + unsigned char *name, + int namelen, + unsigned char *value, + int valuelen) +{ + struct xrep_xattr_key key = { + .valuelen = valuelen, + .flags = flags & XFS_ATTR_NSP_ONDISK_MASK, + }; + unsigned int i = 0; + int error = 0; + + if (xchk_should_terminate(rx->sc, &error)) + return error; + + /* + * Truncate the name to the first character that would trip namecheck. + * If we no longer have a name after that, ignore this attribute. + */ + if (flags & XFS_ATTR_PARENT) { + key.namelen = namelen; + + trace_xrep_xattr_salvage_pptr(rx->sc->ip, flags, name, + key.namelen, value, valuelen); + } else { + while (i < namelen && name[i] != 0) + i++; + if (i == 0) + return 0; + key.namelen = i; + + trace_xrep_xattr_salvage_rec(rx->sc->ip, flags, name, + key.namelen, valuelen); + } + + error = xfblob_store(rx->xattr_blobs, &key.name_cookie, name, + key.namelen); + if (error) + return error; + + error = xfblob_store(rx->xattr_blobs, &key.value_cookie, value, + key.valuelen); + if (error) + return error; + + error = xfarray_append(rx->xattr_records, &key); + if (error) + return error; + + rx->attrs_found++; + return 0; +} + +/* + * Record a shortform extended attribute key & value for later reinsertion + * into the inode. + */ +STATIC int +xrep_xattr_salvage_sf_attr( + struct xrep_xattr *rx, + struct xfs_attr_sf_hdr *hdr, + struct xfs_attr_sf_entry *sfe) +{ + struct xfs_scrub *sc = rx->sc; + struct xchk_xattr_buf *ab = sc->buf; + unsigned char *name = sfe->nameval; + unsigned char *value = &sfe->nameval[sfe->namelen]; + + if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)name - (char *)hdr, + sfe->namelen)) + return 0; + + if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)value - (char *)hdr, + sfe->valuelen)) + return 0; + + if (!xrep_xattr_want_salvage(rx, sfe->flags, sfe->nameval, + sfe->namelen, value, sfe->valuelen)) + return 0; + + return xrep_xattr_salvage_key(rx, sfe->flags, sfe->nameval, + sfe->namelen, value, sfe->valuelen); +} + +/* + * Record a local format extended attribute key & value for later reinsertion + * into the inode. + */ +STATIC int +xrep_xattr_salvage_local_attr( + struct xrep_xattr *rx, + struct xfs_attr_leaf_entry *ent, + unsigned int nameidx, + const char *buf_end, + struct xfs_attr_leaf_name_local *lentry) +{ + struct xchk_xattr_buf *ab = rx->sc->buf; + unsigned char *value; + unsigned int valuelen; + unsigned int namesize; + + /* + * Decode the leaf local entry format. If something seems wrong, we + * junk the attribute. + */ + value = &lentry->nameval[lentry->namelen]; + valuelen = be16_to_cpu(lentry->valuelen); + namesize = xfs_attr_leaf_entsize_local(lentry->namelen, valuelen); + if ((char *)lentry + namesize > buf_end) + return 0; + if (!xrep_xattr_want_salvage(rx, ent->flags, lentry->nameval, + lentry->namelen, value, valuelen)) + return 0; + if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize)) + return 0; + + /* Try to save this attribute. */ + return xrep_xattr_salvage_key(rx, ent->flags, lentry->nameval, + lentry->namelen, value, valuelen); +} + +/* + * Record a remote format extended attribute key & value for later reinsertion + * into the inode. + */ +STATIC int +xrep_xattr_salvage_remote_attr( + struct xrep_xattr *rx, + struct xfs_attr_leaf_entry *ent, + unsigned int nameidx, + const char *buf_end, + struct xfs_attr_leaf_name_remote *rentry, + unsigned int ent_idx, + struct xfs_buf *leaf_bp) +{ + struct xchk_xattr_buf *ab = rx->sc->buf; + struct xfs_da_args args = { + .trans = rx->sc->tp, + .dp = rx->sc->ip, + .index = ent_idx, + .geo = rx->sc->mp->m_attr_geo, + .owner = rx->sc->ip->i_ino, + .attr_filter = ent->flags & XFS_ATTR_NSP_ONDISK_MASK, + .namelen = rentry->namelen, + .name = rentry->name, + .value = ab->value, + .valuelen = be32_to_cpu(rentry->valuelen), + }; + unsigned int namesize; + int error; + + /* + * Decode the leaf remote entry format. If something seems wrong, we + * junk the attribute. Note that we should never find a zero-length + * remote attribute value. + */ + namesize = xfs_attr_leaf_entsize_remote(rentry->namelen); + if ((char *)rentry + namesize > buf_end) + return 0; + if (args.valuelen == 0 || + !xrep_xattr_want_salvage(rx, ent->flags, rentry->name, + rentry->namelen, NULL, args.valuelen)) + return 0; + if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize)) + return 0; + + /* + * Enlarge the buffer (if needed) to hold the value that we're trying + * to salvage from the old extended attribute data. + */ + error = xchk_setup_xattr_buf(rx->sc, args.valuelen); + if (error == -ENOMEM) + error = -EDEADLOCK; + if (error) + return error; + + /* Look up the remote value and stash it for reconstruction. */ + error = xfs_attr3_leaf_getvalue(leaf_bp, &args); + if (error || args.rmtblkno == 0) + goto err_free; + + error = xfs_attr_rmtval_get(&args); + if (error) + goto err_free; + + /* Try to save this attribute. */ + error = xrep_xattr_salvage_key(rx, ent->flags, rentry->name, + rentry->namelen, ab->value, args.valuelen); +err_free: + /* remote value was garbage, junk it */ + if (error == -EFSBADCRC || error == -EFSCORRUPTED) + error = 0; + return error; +} + +/* Extract every xattr key that we can from this attr fork block. */ +STATIC int +xrep_xattr_recover_leaf( + struct xrep_xattr *rx, + struct xfs_buf *bp) +{ + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_scrub *sc = rx->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_name_local *lentry; + struct xfs_attr_leaf_name_remote *rentry; + struct xfs_attr_leaf_entry *ent; + struct xfs_attr_leaf_entry *entries; + struct xchk_xattr_buf *ab = rx->sc->buf; + char *buf_end; + size_t off; + unsigned int nameidx; + unsigned int hdrsize; + int i; + int error = 0; + + bitmap_zero(ab->usedmap, mp->m_attr_geo->blksize); + + /* Check the leaf header */ + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + hdrsize = xfs_attr3_leaf_hdr_size(leaf); + xchk_xattr_set_map(sc, ab->usedmap, 0, hdrsize); + entries = xfs_attr3_leaf_entryp(leaf); + + buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; + for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) { + if (xchk_should_terminate(sc, &error)) + return error; + + /* Skip key if it conflicts with something else? */ + off = (char *)ent - (char *)leaf; + if (!xchk_xattr_set_map(sc, ab->usedmap, off, + sizeof(xfs_attr_leaf_entry_t))) + continue; + + /* Check the name information. */ + nameidx = be16_to_cpu(ent->nameidx); + if (nameidx < leafhdr.firstused || + nameidx >= mp->m_attr_geo->blksize) + continue; + + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = xfs_attr3_leaf_name_local(leaf, i); + error = xrep_xattr_salvage_local_attr(rx, ent, nameidx, + buf_end, lentry); + } else { + rentry = xfs_attr3_leaf_name_remote(leaf, i); + error = xrep_xattr_salvage_remote_attr(rx, ent, nameidx, + buf_end, rentry, i, bp); + } + if (error) + return error; + } + + return 0; +} + +/* Try to recover shortform attrs. */ +STATIC int +xrep_xattr_recover_sf( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + struct xchk_xattr_buf *ab = sc->buf; + struct xfs_attr_sf_hdr *hdr; + struct xfs_attr_sf_entry *sfe; + struct xfs_attr_sf_entry *next; + struct xfs_ifork *ifp; + unsigned char *end; + int i; + int error = 0; + + ifp = xfs_ifork_ptr(rx->sc->ip, XFS_ATTR_FORK); + hdr = ifp->if_data; + + bitmap_zero(ab->usedmap, ifp->if_bytes); + end = (unsigned char *)ifp->if_data + ifp->if_bytes; + xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*hdr)); + + sfe = xfs_attr_sf_firstentry(hdr); + if ((unsigned char *)sfe > end) + return 0; + + for (i = 0; i < hdr->count; i++) { + if (xchk_should_terminate(sc, &error)) + return error; + + next = xfs_attr_sf_nextentry(sfe); + if ((unsigned char *)next > end) + break; + + if (xchk_xattr_set_map(sc, ab->usedmap, + (char *)sfe - (char *)hdr, + sizeof(struct xfs_attr_sf_entry))) { + /* + * No conflicts with the sf entry; let's save this + * attribute. + */ + error = xrep_xattr_salvage_sf_attr(rx, hdr, sfe); + if (error) + return error; + } + + sfe = next; + } + + return 0; +} + +/* + * Try to return a buffer of xattr data for a given physical extent. + * + * Because the buffer cache get function complains if it finds a buffer + * matching the block number but not matching the length, we must be careful to + * look for incore buffers (up to the maximum length of a remote value) that + * could be hiding anywhere in the physical range. If we find an incore + * buffer, we can pass that to the caller. Optionally, read a single block and + * pass that back. + * + * Note the subtlety that remote attr value blocks for which there is no incore + * buffer will be passed to the callback one block at a time. These buffers + * will not have any ops attached and must be staled to prevent aliasing with + * multiblock buffers once we drop the ILOCK. + */ +STATIC int +xrep_xattr_find_buf( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + xfs_extlen_t max_len, + bool can_read, + struct xfs_buf **bpp) +{ + struct xrep_bufscan scan = { + .daddr = XFS_FSB_TO_DADDR(mp, fsbno), + .max_sectors = xrep_bufscan_max_sectors(mp, max_len), + .daddr_step = XFS_FSB_TO_BB(mp, 1), + }; + struct xfs_buf *bp; + + while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { + *bpp = bp; + return 0; + } + + if (!can_read) { + *bpp = NULL; + return 0; + } + + return xfs_buf_read(mp->m_ddev_targp, scan.daddr, XFS_FSB_TO_BB(mp, 1), + XBF_TRYLOCK, bpp, NULL); +} + +/* + * Deal with a buffer that we found during our walk of the attr fork. + * + * Attribute leaf and node blocks are simple -- they're a single block, so we + * can walk them one at a time and we never have to worry about discontiguous + * multiblock buffers like we do for directories. + * + * Unfortunately, remote attr blocks add a lot of complexity here. Each disk + * block is totally self contained, in the sense that the v5 header provides no + * indication that there could be more data in the next block. The incore + * buffers can span multiple blocks, though they never cross extent records. + * However, they don't necessarily start or end on an extent record boundary. + * Therefore, we need a special buffer find function to walk the buffer cache + * for us. + * + * The caller must hold the ILOCK on the file being repaired. We use + * XBF_TRYLOCK here to skip any locked buffer on the assumption that we don't + * own the block and don't want to hang the system on a potentially garbage + * buffer. + */ +STATIC int +xrep_xattr_recover_block( + struct xrep_xattr *rx, + xfs_dablk_t dabno, + xfs_fsblock_t fsbno, + xfs_extlen_t max_len, + xfs_extlen_t *actual_len) +{ + struct xfs_da_blkinfo *info; + struct xfs_buf *bp; + int error; + + error = xrep_xattr_find_buf(rx->sc->mp, fsbno, max_len, true, &bp); + if (error) + return error; + info = bp->b_addr; + *actual_len = XFS_BB_TO_FSB(rx->sc->mp, bp->b_length); + + trace_xrep_xattr_recover_leafblock(rx->sc->ip, dabno, + be16_to_cpu(info->magic)); + + /* + * If the buffer has the right magic number for an attr leaf block and + * passes a structure check (we don't care about checksums), salvage + * as much as we can from the block. */ + if (info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) && + xrep_buf_verify_struct(bp, &xfs_attr3_leaf_buf_ops) && + xfs_attr3_leaf_header_check(bp, rx->sc->ip->i_ino) == NULL) + error = xrep_xattr_recover_leaf(rx, bp); + + /* + * If the buffer didn't already have buffer ops set, it was read in by + * the _find_buf function and could very well be /part/ of a multiblock + * remote block. Mark it stale so that it doesn't hang around in + * memory to cause problems. + */ + if (bp->b_ops == NULL) + xfs_buf_stale(bp); + + xfs_buf_relse(bp); + return error; +} + +/* Insert one xattr key/value. */ +STATIC int +xrep_xattr_insert_rec( + struct xrep_xattr *rx, + const struct xrep_xattr_key *key) +{ + struct xfs_da_args args = { + .dp = rx->sc->tempip, + .attr_filter = key->flags, + .namelen = key->namelen, + .valuelen = key->valuelen, + .owner = rx->sc->ip->i_ino, + .geo = rx->sc->mp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .op_flags = XFS_DA_OP_OKNOENT, + }; + struct xchk_xattr_buf *ab = rx->sc->buf; + int error; + + /* + * Grab pointers to the scrub buffer so that we can use them to insert + * attrs into the temp file. + */ + args.name = ab->name; + args.value = ab->value; + + /* + * The attribute name is stored near the end of the in-core buffer, + * though we reserve one more byte to ensure null termination. + */ + ab->name[XATTR_NAME_MAX] = 0; + + error = xfblob_load(rx->xattr_blobs, key->name_cookie, ab->name, + key->namelen); + if (error) + return error; + + error = xfblob_free(rx->xattr_blobs, key->name_cookie); + if (error) + return error; + + error = xfblob_load(rx->xattr_blobs, key->value_cookie, args.value, + key->valuelen); + if (error) + return error; + + error = xfblob_free(rx->xattr_blobs, key->value_cookie); + if (error) + return error; + + ab->name[key->namelen] = 0; + + if (key->flags & XFS_ATTR_PARENT) { + trace_xrep_xattr_insert_pptr(rx->sc->tempip, key->flags, + ab->name, key->namelen, ab->value, + key->valuelen); + args.op_flags |= XFS_DA_OP_LOGGED; + } else { + trace_xrep_xattr_insert_rec(rx->sc->tempip, key->flags, + ab->name, key->namelen, key->valuelen); + } + + /* + * xfs_attr_set creates and commits its own transaction. If the attr + * already exists, we'll just drop it during the rebuild. + */ + xfs_attr_sethash(&args); + error = xfs_attr_set(&args, XFS_ATTRUPDATE_CREATE, false); + if (error == -EEXIST) + error = 0; + + return error; +} + +/* + * Periodically flush salvaged attributes to the temporary file. This is done + * to reduce the memory requirements of the xattr rebuild because files can + * contain millions of attributes. + */ +STATIC int +xrep_xattr_flush_stashed( + struct xrep_xattr *rx) +{ + xfarray_idx_t array_cur; + int error; + + /* + * Entering this function, the scrub context has a reference to the + * inode being repaired, the temporary file, and a scrub transaction + * that we use during xattr salvaging to avoid livelocking if there + * are cycles in the xattr structures. We hold ILOCK_EXCL on both + * the inode being repaired, though it is not ijoined to the scrub + * transaction. + * + * To constrain kernel memory use, we occasionally flush salvaged + * xattrs from the xfarray and xfblob structures into the temporary + * file in preparation for exchanging the xattr structures at the end. + * Updating the temporary file requires a transaction, so we commit the + * scrub transaction and drop the two ILOCKs so that xfs_attr_set can + * allocate whatever transaction it wants. + * + * We still hold IOLOCK_EXCL on the inode being repaired, which + * prevents anyone from modifying the damaged xattr data while we + * repair it. + */ + error = xrep_trans_commit(rx->sc); + if (error) + return error; + xchk_iunlock(rx->sc, XFS_ILOCK_EXCL); + + /* + * Take the IOLOCK of the temporary file while we modify xattrs. This + * isn't strictly required because the temporary file is never revealed + * to userspace, but we follow the same locking rules. We still hold + * sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rx->sc); + if (error) + return error; + + /* Add all the salvaged attrs to the temporary file. */ + foreach_xfarray_idx(rx->xattr_records, array_cur) { + struct xrep_xattr_key key; + + error = xfarray_load(rx->xattr_records, array_cur, &key); + if (error) + return error; + + error = xrep_xattr_insert_rec(rx, &key); + if (error) + return error; + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rx->xattr_records); + xfblob_truncate(rx->xattr_blobs); + + xrep_tempfile_iounlock(rx->sc); + + /* Recreate the salvage transaction and relock the inode. */ + error = xchk_trans_alloc(rx->sc, 0); + if (error) + return error; + xchk_ilock(rx->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* Decide if we've stashed too much xattr data in memory. */ +static inline bool +xrep_xattr_want_flush_stashed( + struct xrep_xattr *rx) +{ + unsigned long long bytes; + + if (!rx->can_flush) + return false; + + bytes = xfarray_bytes(rx->xattr_records) + + xfblob_bytes(rx->xattr_blobs); + return bytes > XREP_XATTR_MAX_STASH_BYTES; +} + +/* + * Did we observe rename changing parent pointer xattrs while we were flushing + * salvaged attrs? + */ +static inline bool +xrep_xattr_saw_pptr_conflict( + struct xrep_xattr *rx) +{ + bool ret; + + ASSERT(rx->can_flush); + + if (!xfs_has_parent(rx->sc->mp)) + return false; + + xfs_assert_ilocked(rx->sc->ip, XFS_ILOCK_EXCL); + + mutex_lock(&rx->lock); + ret = xfarray_bytes(rx->pptr_recs) > 0; + mutex_unlock(&rx->lock); + + return ret; +} + +/* + * Reset the entire repair state back to initial conditions, now that we've + * detected a parent pointer update to the attr structure while we were + * flushing salvaged attrs. See the locking notes in dir_repair.c for more + * information on why this is all necessary. + */ +STATIC int +xrep_xattr_full_reset( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + struct xfs_attr_sf_hdr *hdr; + struct xfs_ifork *ifp = &sc->tempip->i_af; + int error; + + trace_xrep_xattr_full_reset(sc->ip, sc->tempip); + + /* The temporary file's data fork had better not be in btree format. */ + if (sc->tempip->i_df.if_format == XFS_DINODE_FMT_BTREE) { + ASSERT(0); + return -EIO; + } + + /* + * We begin in transaction context with sc->ip ILOCKed but not joined + * to the transaction. To reset to the initial state, we must hold + * sc->ip's ILOCK to prevent rename from updating parent pointer + * information and the tempfile's ILOCK to clear its contents. + */ + xchk_iunlock(rx->sc, XFS_ILOCK_EXCL); + xrep_tempfile_ilock_both(sc); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + /* + * Free all the blocks of the attr fork of the temp file, and reset + * it back to local format. + */ + if (xfs_ifork_has_extents(&sc->tempip->i_af)) { + error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK); + if (error) + return error; + + ASSERT(ifp->if_bytes == 0); + ifp->if_format = XFS_DINODE_FMT_LOCAL; + xfs_idata_realloc(sc->tempip, sizeof(*hdr), XFS_ATTR_FORK); + } + + /* Reinitialize the attr fork to an empty shortform structure. */ + hdr = ifp->if_data; + memset(hdr, 0, sizeof(*hdr)); + hdr->totsize = cpu_to_be16(sizeof(*hdr)); + xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE | XFS_ILOG_ADATA); + + /* + * Roll this transaction to commit our reset ondisk. The tempfile + * should no longer be joined to the transaction, so we drop its ILOCK. + * This should leave us in transaction context with sc->ip ILOCKed but + * not joined to the transaction. + */ + error = xrep_roll_trans(sc); + if (error) + return error; + xrep_tempfile_iunlock(sc); + + /* + * Erase any accumulated parent pointer updates now that we've erased + * the tempfile's attr fork. We're resetting the entire repair state + * back to where we were initially, except now we won't flush salvaged + * xattrs until the very end. + */ + mutex_lock(&rx->lock); + xfarray_truncate(rx->pptr_recs); + xfblob_truncate(rx->pptr_names); + mutex_unlock(&rx->lock); + + rx->can_flush = false; + rx->attrs_found = 0; + + ASSERT(xfarray_bytes(rx->xattr_records) == 0); + ASSERT(xfblob_bytes(rx->xattr_blobs) == 0); + return 0; +} + +/* Extract as many attribute keys and values as we can. */ +STATIC int +xrep_xattr_recover( + struct xrep_xattr *rx) +{ + struct xfs_bmbt_irec got; + struct xfs_scrub *sc = rx->sc; + struct xfs_da_geometry *geo = sc->mp->m_attr_geo; + xfs_fileoff_t offset; + xfs_extlen_t len; + xfs_dablk_t dabno; + int nmap; + int error; + +restart: + /* + * Iterate each xattr leaf block in the attr fork to scan them for any + * attributes that we might salvage. + */ + for (offset = 0; + offset < XFS_MAX_FILEOFF; + offset = got.br_startoff + got.br_blockcount) { + nmap = 1; + error = xfs_bmapi_read(sc->ip, offset, XFS_MAX_FILEOFF - offset, + &got, &nmap, XFS_BMAPI_ATTRFORK); + if (error) + return error; + if (nmap != 1) + return -EFSCORRUPTED; + if (!xfs_bmap_is_written_extent(&got)) + continue; + + for (dabno = round_up(got.br_startoff, geo->fsbcount); + dabno < got.br_startoff + got.br_blockcount; + dabno += len) { + xfs_fileoff_t curr_offset = dabno - got.br_startoff; + xfs_extlen_t maxlen; + + if (xchk_should_terminate(rx->sc, &error)) + return error; + + maxlen = min_t(xfs_filblks_t, INT_MAX, + got.br_blockcount - curr_offset); + error = xrep_xattr_recover_block(rx, dabno, + curr_offset + got.br_startblock, + maxlen, &len); + if (error) + return error; + + if (xrep_xattr_want_flush_stashed(rx)) { + error = xrep_xattr_flush_stashed(rx); + if (error) + return error; + + if (xrep_xattr_saw_pptr_conflict(rx)) { + error = xrep_xattr_full_reset(rx); + if (error) + return error; + + goto restart; + } + } + } + } + + return 0; +} + +/* + * Reset the extended attribute fork to a state where we can start re-adding + * the salvaged attributes. + */ +STATIC int +xrep_xattr_fork_remove( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + struct xfs_attr_sf_hdr *hdr; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); + + /* + * If the data fork is in btree format, we can't change di_forkoff + * because we could run afoul of the rule that the data fork isn't + * supposed to be in btree format if there's enough space in the fork + * that it could have used extents format. Instead, reinitialize the + * attr fork to have a shortform structure with zero attributes. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE) { + ifp->if_format = XFS_DINODE_FMT_LOCAL; + hdr = xfs_idata_realloc(ip, (int)sizeof(*hdr) - ifp->if_bytes, + XFS_ATTR_FORK); + hdr->count = 0; + hdr->totsize = cpu_to_be16(sizeof(*hdr)); + xfs_trans_log_inode(sc->tp, ip, + XFS_ILOG_CORE | XFS_ILOG_ADATA); + return 0; + } + + /* If we still have attr fork extents, something's wrong. */ + if (ifp->if_nextents != 0) { + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec irec; + unsigned int i = 0; + + xfs_emerg(sc->mp, + "inode 0x%llx attr fork still has %llu attr extents, format %d?!", + ip->i_ino, ifp->if_nextents, ifp->if_format); + for_each_xfs_iext(ifp, &icur, &irec) { + xfs_err(sc->mp, + "[%u]: startoff %llu startblock %llu blockcount %llu state %u", + i++, irec.br_startoff, + irec.br_startblock, irec.br_blockcount, + irec.br_state); + } + ASSERT(0); + return -EFSCORRUPTED; + } + + xfs_attr_fork_remove(ip, sc->tp); + return 0; +} + +/* + * Free all the attribute fork blocks of the file being repaired and delete the + * fork. The caller must ILOCK the scrub file and join it to the transaction. + * This function returns with the inode joined to a clean transaction. + */ +int +xrep_xattr_reset_fork( + struct xfs_scrub *sc) +{ + int error; + + trace_xrep_xattr_reset_fork(sc->ip, sc->ip); + + /* Unmap all the attr blocks. */ + if (xfs_ifork_has_extents(&sc->ip->i_af)) { + error = xrep_reap_ifork(sc, sc->ip, XFS_ATTR_FORK); + if (error) + return error; + } + + error = xrep_xattr_fork_remove(sc, sc->ip); + if (error) + return error; + + return xfs_trans_roll_inode(&sc->tp, sc->ip); +} + +/* + * Free all the attribute fork blocks of the temporary file and delete the attr + * fork. The caller must ILOCK the tempfile and join it to the transaction. + * This function returns with the inode joined to a clean scrub transaction. + */ +int +xrep_xattr_reset_tempfile_fork( + struct xfs_scrub *sc) +{ + int error; + + trace_xrep_xattr_reset_fork(sc->ip, sc->tempip); + + /* + * Wipe out the attr fork of the temp file so that regular inode + * inactivation won't trip over the corrupt attr fork. + */ + if (xfs_ifork_has_extents(&sc->tempip->i_af)) { + error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK); + if (error) + return error; + } + + return xrep_xattr_fork_remove(sc, sc->tempip); +} + +/* + * Find all the extended attributes for this inode by scraping them out of the + * attribute key blocks by hand, and flushing them into the temp file. + * When we're done, free the staging memory before exchanging the xattr + * structures to reduce memory usage. + */ +STATIC int +xrep_xattr_salvage_attributes( + struct xrep_xattr *rx) +{ + struct xfs_inode *ip = rx->sc->ip; + int error; + + /* Short format xattrs are easy! */ + if (rx->sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) { + error = xrep_xattr_recover_sf(rx); + if (error) + return error; + + return xrep_xattr_flush_stashed(rx); + } + + /* + * For non-inline xattr structures, the salvage function scans the + * buffer cache looking for potential attr leaf blocks. The scan + * requires the ability to lock any buffer found and runs independently + * of any transaction <-> buffer item <-> buffer linkage. Therefore, + * roll the transaction to ensure there are no buffers joined. We hold + * the ILOCK independently of the transaction. + */ + error = xfs_trans_roll(&rx->sc->tp); + if (error) + return error; + + error = xfs_iread_extents(rx->sc->tp, ip, XFS_ATTR_FORK); + if (error) + return error; + + error = xrep_xattr_recover(rx); + if (error) + return error; + + return xrep_xattr_flush_stashed(rx); +} + +/* + * Add this stashed incore parent pointer to the temporary file. The caller + * must hold the tempdir's IOLOCK, must not hold any ILOCKs, and must not be in + * transaction context. + */ +STATIC int +xrep_xattr_replay_pptr_update( + struct xrep_xattr *rx, + const struct xfs_name *xname, + struct xrep_xattr_pptr *pptr) +{ + struct xfs_scrub *sc = rx->sc; + int error; + + switch (pptr->action) { + case XREP_XATTR_PPTR_ADD: + /* Create parent pointer. */ + trace_xrep_xattr_replay_parentadd(sc->tempip, xname, + &pptr->pptr_rec); + + error = xfs_parent_set(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rx->pptr_args); + ASSERT(error != -EEXIST); + return error; + case XREP_XATTR_PPTR_REMOVE: + /* Remove parent pointer. */ + trace_xrep_xattr_replay_parentremove(sc->tempip, xname, + &pptr->pptr_rec); + + error = xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rx->pptr_args); + ASSERT(error != -ENOATTR); + return error; + } + + ASSERT(0); + return -EIO; +} + +/* + * Flush stashed parent pointer updates that have been recorded by the scanner. + * This is done to reduce the memory requirements of the xattr rebuild, since + * files can have a lot of hardlinks and the fs can be busy. + * + * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile + * IOLOCK. + */ +STATIC int +xrep_xattr_replay_pptr_updates( + struct xrep_xattr *rx) +{ + xfarray_idx_t array_cur; + int error; + + mutex_lock(&rx->lock); + foreach_xfarray_idx(rx->pptr_recs, array_cur) { + struct xrep_xattr_pptr pptr; + + error = xfarray_load(rx->pptr_recs, array_cur, &pptr); + if (error) + goto out_unlock; + + error = xfblob_loadname(rx->pptr_names, pptr.name_cookie, + &rx->xname, pptr.namelen); + if (error) + goto out_unlock; + mutex_unlock(&rx->lock); + + error = xrep_xattr_replay_pptr_update(rx, &rx->xname, &pptr); + if (error) + return error; + + mutex_lock(&rx->lock); + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rx->pptr_recs); + xfblob_truncate(rx->pptr_names); + mutex_unlock(&rx->lock); + return 0; +out_unlock: + mutex_unlock(&rx->lock); + return error; +} + +/* + * Remember that we want to create a parent pointer in the tempfile. These + * stashed actions will be replayed later. + */ +STATIC int +xrep_xattr_stash_parentadd( + struct xrep_xattr *rx, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_xattr_pptr pptr = { + .action = XREP_XATTR_PPTR_ADD, + .namelen = name->len, + }; + int error; + + trace_xrep_xattr_stash_parentadd(rx->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rx->pptr_recs, &pptr); +} + +/* + * Remember that we want to remove a parent pointer from the tempfile. These + * stashed actions will be replayed later. + */ +STATIC int +xrep_xattr_stash_parentremove( + struct xrep_xattr *rx, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_xattr_pptr pptr = { + .action = XREP_XATTR_PPTR_REMOVE, + .namelen = name->len, + }; + int error; + + trace_xrep_xattr_stash_parentremove(rx->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rx->pptr_recs, &pptr); +} + +/* + * Capture dirent updates being made by other threads. We will have to replay + * the parent pointer updates before exchanging attr forks. + */ +STATIC int +xrep_xattr_live_dirent_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xrep_xattr *rx; + struct xfs_scrub *sc; + int error; + + rx = container_of(nb, struct xrep_xattr, dhook.dirent_hook.nb); + sc = rx->sc; + + /* + * This thread updated a dirent that points to the file that we're + * repairing, so stash the update for replay against the temporary + * file. + */ + if (p->ip->i_ino != sc->ip->i_ino) + return NOTIFY_DONE; + + mutex_lock(&rx->lock); + if (p->delta > 0) + error = xrep_xattr_stash_parentadd(rx, p->name, p->dp); + else + error = xrep_xattr_stash_parentremove(rx, p->name, p->dp); + if (error) + rx->live_update_aborted = true; + mutex_unlock(&rx->lock); + return NOTIFY_DONE; +} + +/* + * Prepare both inodes' attribute forks for an exchange. Promote the tempfile + * from short format to leaf format, and if the file being repaired has a short + * format attr fork, turn it into an empty extent list. + */ +STATIC int +xrep_xattr_swap_prep( + struct xfs_scrub *sc, + bool temp_local, + bool ip_local) +{ + int error; + + /* + * If the tempfile's attributes are in shortform format, convert that + * to a single leaf extent so that we can use the atomic mapping + * exchange. + */ + if (temp_local) { + struct xfs_da_args args = { + .dp = sc->tempip, + .geo = sc->mp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .trans = sc->tp, + .total = 1, + .owner = sc->ip->i_ino, + }; + + error = xfs_attr_shortform_to_leaf(&args); + if (error) + return error; + + /* + * Roll the deferred log items to get us back to a clean + * transaction. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + /* + * If the file being repaired had a shortform attribute fork, convert + * that to an empty extent list in preparation for the atomic mapping + * exchange. + */ + if (ip_local) { + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + + xfs_idestroy_fork(ifp); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + ifp->if_bytes = 0; + ifp->if_data = NULL; + ifp->if_height = 0; + + xfs_trans_log_inode(sc->tp, sc->ip, + XFS_ILOG_CORE | XFS_ILOG_ADATA); + } + + return 0; +} + +/* Exchange the temporary file's attribute fork with the one being repaired. */ +int +xrep_xattr_swap( + struct xfs_scrub *sc, + struct xrep_tempexch *tx) +{ + bool ip_local, temp_local; + int error = 0; + + ip_local = sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL; + temp_local = sc->tempip->i_af.if_format == XFS_DINODE_FMT_LOCAL; + + /* + * If the both files have a local format attr fork and the rebuilt + * xattr data would fit in the repaired file's attr fork, just copy + * the contents from the tempfile and declare ourselves done. + */ + if (ip_local && temp_local) { + int forkoff; + int newsize; + + newsize = xfs_attr_sf_totsize(sc->tempip); + forkoff = xfs_attr_shortform_bytesfit(sc->ip, newsize); + if (forkoff > 0) { + sc->ip->i_forkoff = forkoff; + xrep_tempfile_copyout_local(sc, XFS_ATTR_FORK); + return 0; + } + } + + /* Otherwise, make sure both attr forks are in block-mapping mode. */ + error = xrep_xattr_swap_prep(sc, temp_local, ip_local); + if (error) + return error; + + return xrep_tempexch_contents(sc, tx); +} + +/* + * Finish replaying stashed parent pointer updates, allocate a transaction for + * exchanging extent mappings, and take the ILOCKs of both files before we + * commit the new extended attribute structure. + */ +STATIC int +xrep_xattr_finalize_tempfile( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + int error; + + if (!xfs_has_parent(sc->mp)) + return xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx); + + /* + * Repair relies on the ILOCK to quiesce all possible xattr updates. + * Replay all queued parent pointer updates into the tempfile before + * exchanging the contents, even if that means dropping the ILOCKs and + * the transaction. + */ + do { + error = xrep_xattr_replay_pptr_updates(rx); + if (error) + return error; + + error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx); + if (error) + return error; + + if (xfarray_length(rx->pptr_recs) == 0) + break; + + xchk_trans_cancel(sc); + xrep_tempfile_iunlock_both(sc); + } while (!xchk_should_terminate(sc, &error)); + return error; +} + +/* + * Exchange the new extended attribute data (which we created in the tempfile) + * with the file being repaired. + */ +STATIC int +xrep_xattr_rebuild_tree( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + int error; + + /* + * If we didn't find any attributes to salvage, repair the file by + * zapping its attr fork. + */ + if (rx->attrs_found == 0) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + error = xrep_xattr_reset_fork(sc); + if (error) + return error; + + goto forget_acls; + } + + trace_xrep_xattr_rebuild_tree(sc->ip, sc->tempip); + + /* + * Commit the repair transaction and drop the ILOCKs so that we can use + * the atomic file content exchange helper functions to compute the + * correct resource reservations. + * + * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent xattr + * modifications, but there's nothing to prevent userspace from reading + * the attributes until we're ready for the exchange operation. Reads + * will return -EIO without shutting down the fs, so we're ok with + * that. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* + * Take the IOLOCK on the temporary file so that we can run xattr + * operations with the same locks held as we would for a normal file. + * We still hold sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rx->sc); + if (error) + return error; + + /* + * Allocate transaction, lock inodes, and make sure that we've replayed + * all the stashed parent pointer updates to the temp file. After this + * point, we're ready to exchange attr fork mappings. + */ + error = xrep_xattr_finalize_tempfile(rx); + if (error) + return error; + + /* + * Exchange the blocks mapped by the tempfile's attr fork with the file + * being repaired. The old attr blocks will then be attached to the + * tempfile, so reap its attr fork. + */ + error = xrep_xattr_swap(sc, &rx->tx); + if (error) + return error; + + error = xrep_xattr_reset_tempfile_fork(sc); + if (error) + return error; + + /* + * Roll to get a transaction without any inodes joined to it. Then we + * can drop the tempfile's ILOCK and IOLOCK before doing more work on + * the scrub target file. + */ + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + + xrep_tempfile_iunlock(sc); + xrep_tempfile_iounlock(sc); + +forget_acls: + /* Invalidate cached ACLs now that we've reloaded all the xattrs. */ + xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_FILE); + xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_DEFAULT); + return 0; +} + +/* Tear down all the incore scan stuff we created. */ +STATIC void +xrep_xattr_teardown( + struct xrep_xattr *rx) +{ + if (xfs_has_parent(rx->sc->mp)) + xfs_dir_hook_del(rx->sc->mp, &rx->dhook); + if (rx->pptr_names) + xfblob_destroy(rx->pptr_names); + if (rx->pptr_recs) + xfarray_destroy(rx->pptr_recs); + xfblob_destroy(rx->xattr_blobs); + xfarray_destroy(rx->xattr_records); + mutex_destroy(&rx->lock); + kfree(rx); +} + +/* Set up the filesystem scan so we can regenerate extended attributes. */ +STATIC int +xrep_xattr_setup_scan( + struct xfs_scrub *sc, + struct xrep_xattr **rxp) +{ + struct xrep_xattr *rx; + char *descr; + int max_len; + int error; + + rx = kzalloc(sizeof(struct xrep_xattr), XCHK_GFP_FLAGS); + if (!rx) + return -ENOMEM; + rx->sc = sc; + rx->can_flush = true; + rx->xname.name = rx->namebuf; + + mutex_init(&rx->lock); + + /* + * Allocate enough memory to handle loading local attr values from the + * xfblob data while flushing stashed attrs to the temporary file. + * We only realloc the buffer when salvaging remote attr values. + */ + max_len = xfs_attr_leaf_entsize_local_max(sc->mp->m_attr_geo->blksize); + error = xchk_setup_xattr_buf(rx->sc, max_len); + if (error == -ENOMEM) + error = -EDEADLOCK; + if (error) + goto out_rx; + + /* Set up some staging for salvaged attribute keys and values */ + descr = xchk_xfile_ino_descr(sc, "xattr keys"); + error = xfarray_create(descr, 0, sizeof(struct xrep_xattr_key), + &rx->xattr_records); + kfree(descr); + if (error) + goto out_rx; + + descr = xchk_xfile_ino_descr(sc, "xattr names"); + error = xfblob_create(descr, &rx->xattr_blobs); + kfree(descr); + if (error) + goto out_keys; + + if (xfs_has_parent(sc->mp)) { + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + + descr = xchk_xfile_ino_descr(sc, + "xattr retained parent pointer entries"); + error = xfarray_create(descr, 0, + sizeof(struct xrep_xattr_pptr), + &rx->pptr_recs); + kfree(descr); + if (error) + goto out_values; + + descr = xchk_xfile_ino_descr(sc, + "xattr retained parent pointer names"); + error = xfblob_create(descr, &rx->pptr_names); + kfree(descr); + if (error) + goto out_pprecs; + + xfs_dir_hook_setup(&rx->dhook, xrep_xattr_live_dirent_update); + error = xfs_dir_hook_add(sc->mp, &rx->dhook); + if (error) + goto out_ppnames; + } + + *rxp = rx; + return 0; +out_ppnames: + xfblob_destroy(rx->pptr_names); +out_pprecs: + xfarray_destroy(rx->pptr_recs); +out_values: + xfblob_destroy(rx->xattr_blobs); +out_keys: + xfarray_destroy(rx->xattr_records); +out_rx: + mutex_destroy(&rx->lock); + kfree(rx); + return error; +} + +/* + * Repair the extended attribute metadata. + * + * XXX: Remote attribute value buffers encompass the entire (up to 64k) buffer. + * The buffer cache in XFS can't handle aliased multiblock buffers, so this + * might misbehave if the attr fork is crosslinked with other filesystem + * metadata. + */ +int +xrep_xattr( + struct xfs_scrub *sc) +{ + struct xrep_xattr *rx = NULL; + int error; + + if (!xfs_inode_hasattr(sc->ip)) + return -ENOENT; + + /* The rmapbt is required to reap the old attr fork. */ + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + + error = xrep_xattr_setup_scan(sc, &rx); + if (error) + return error; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + error = xrep_xattr_salvage_attributes(rx); + if (error) + goto out_scan; + + if (rx->live_update_aborted) { + error = -EIO; + goto out_scan; + } + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto out_scan; + + error = xrep_xattr_rebuild_tree(rx); + if (error) + goto out_scan; + +out_scan: + xrep_xattr_teardown(rx); + return error; +} diff --git a/fs/xfs/scrub/attr_repair.h b/fs/xfs/scrub/attr_repair.h new file mode 100644 index 000000000000..979729bd4a5f --- /dev/null +++ b/fs/xfs/scrub/attr_repair.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_ATTR_REPAIR_H__ +#define __XFS_SCRUB_ATTR_REPAIR_H__ + +struct xrep_tempexch; + +int xrep_xattr_swap(struct xfs_scrub *sc, struct xrep_tempexch *tx); +int xrep_xattr_reset_fork(struct xfs_scrub *sc); +int xrep_xattr_reset_tempfile_fork(struct xfs_scrub *sc); + +#endif /* __XFS_SCRUB_ATTR_REPAIR_H__ */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index e0c89a9a0ca0..7ba35a7a7920 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -16,7 +16,9 @@ #include <linux/interval_tree_generic.h> -struct xbitmap_node { +/* u64 bitmap */ + +struct xbitmap64_node { struct rb_node bn_rbnode; /* First set bit of this interval and subtree. */ @@ -38,73 +40,74 @@ struct xbitmap_node { * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll * forward-declare them anyway for clarity. */ -static inline void -xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root); +static inline __maybe_unused void +xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline void -xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root); +static inline __maybe_unused void +xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline struct xbitmap_node * -xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start, +static inline __maybe_unused struct xbitmap64_node * +xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start, uint64_t last); -static inline struct xbitmap_node * -xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start, +static inline __maybe_unused struct xbitmap64_node * +xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start, uint64_t last); -INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t, - __bn_subtree_last, START, LAST, static inline, xbitmap_tree) +INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t, + __bn_subtree_last, START, LAST, static inline __maybe_unused, + xbitmap64_tree) /* Iterate each interval of a bitmap. Do not change the bitmap. */ -#define for_each_xbitmap_extent(bn, bitmap) \ +#define for_each_xbitmap64_extent(bn, bitmap) \ for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ - struct xbitmap_node, bn_rbnode); \ + struct xbitmap64_node, bn_rbnode); \ (bn) != NULL; \ (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ - struct xbitmap_node, bn_rbnode)) + struct xbitmap64_node, bn_rbnode)) /* Clear a range of this bitmap. */ int -xbitmap_clear( - struct xbitmap *bitmap, +xbitmap64_clear( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *bn; - struct xbitmap_node *new_bn; + struct xbitmap64_node *bn; + struct xbitmap64_node *new_bn; uint64_t last = start + len - 1; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) { + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last))) { if (bn->bn_start < start && bn->bn_last > last) { uint64_t old_last = bn->bn_last; /* overlaps with the entire clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); /* add an extent */ - new_bn = kmalloc(sizeof(struct xbitmap_node), + new_bn = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!new_bn) return -ENOMEM; new_bn->bn_start = last + 1; new_bn->bn_last = old_last; - xbitmap_tree_insert(new_bn, &bitmap->xb_root); + xbitmap64_tree_insert(new_bn, &bitmap->xb_root); } else if (bn->bn_start < start) { /* overlaps with the left side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); } else if (bn->bn_last > last) { /* overlaps with the right side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_start = last + 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); break; } else { /* in the middle of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } @@ -114,59 +117,59 @@ xbitmap_clear( /* Set a range of this bitmap. */ int -xbitmap_set( - struct xbitmap *bitmap, +xbitmap64_set( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *left; - struct xbitmap_node *right; + struct xbitmap64_node *left; + struct xbitmap64_node *right; uint64_t last = start + len - 1; int error; /* Is this whole range already set? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); if (left && left->bn_start <= start && left->bn_last >= last) return 0; /* Clear out everything in the range we want to set. */ - error = xbitmap_clear(bitmap, start, len); + error = xbitmap64_clear(bitmap, start, len); if (error) return error; /* Do we have a left-adjacent extent? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); ASSERT(!left || left->bn_last + 1 == start); /* Do we have a right-adjacent extent? */ - right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + right = xbitmap64_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); ASSERT(!right || right->bn_start == last + 1); if (left && right) { /* combine left and right adjacent extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); left->bn_last = right->bn_last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); kfree(right); } else if (left) { /* combine with left extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } else if (right) { /* combine with right extent */ - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); right->bn_start = start; - xbitmap_tree_insert(right, &bitmap->xb_root); + xbitmap64_tree_insert(right, &bitmap->xb_root); } else { /* add an extent */ - left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS); + left = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!left) return -ENOMEM; left->bn_start = start; left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } return 0; @@ -174,21 +177,21 @@ xbitmap_set( /* Free everything related to this bitmap. */ void -xbitmap_destroy( - struct xbitmap *bitmap) +xbitmap64_destroy( + struct xbitmap64 *bitmap) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { - xbitmap_tree_remove(bn, &bitmap->xb_root); + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } /* Set up a per-AG block bitmap. */ void -xbitmap_init( - struct xbitmap *bitmap) +xbitmap64_init( + struct xbitmap64 *bitmap) { bitmap->xb_root = RB_ROOT_CACHED; } @@ -208,18 +211,18 @@ xbitmap_init( * This is the logical equivalent of bitmap &= ~sub. */ int -xbitmap_disunion( - struct xbitmap *bitmap, - struct xbitmap *sub) +xbitmap64_disunion( + struct xbitmap64 *bitmap, + struct xbitmap64 *sub) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; int error; - if (xbitmap_empty(bitmap) || xbitmap_empty(sub)) + if (xbitmap64_empty(bitmap) || xbitmap64_empty(sub)) return 0; - for_each_xbitmap_extent(bn, sub) { - error = xbitmap_clear(bitmap, bn->bn_start, + for_each_xbitmap64_extent(bn, sub) { + error = xbitmap64_clear(bitmap, bn->bn_start, bn->bn_last - bn->bn_start + 1); if (error) return error; @@ -228,88 +231,274 @@ xbitmap_disunion( return 0; } +/* How many bits are set in this bitmap? */ +uint64_t +xbitmap64_hweight( + struct xbitmap64 *bitmap) +{ + struct xbitmap64_node *bn; + uint64_t ret = 0; + + for_each_xbitmap64_extent(bn, bitmap) + ret += bn->bn_last - bn->bn_start + 1; + + return ret; +} + +/* Call a function for every run of set bits in this bitmap. */ +int +xbitmap64_walk( + struct xbitmap64 *bitmap, + xbitmap64_walk_fn fn, + void *priv) +{ + struct xbitmap64_node *bn; + int error = 0; + + for_each_xbitmap64_extent(bn, bitmap) { + error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); + if (error) + break; + } + + return error; +} + +/* Does this bitmap have no bits set at all? */ +bool +xbitmap64_empty( + struct xbitmap64 *bitmap) +{ + return bitmap->xb_root.rb_root.rb_node == NULL; +} + +/* Is the start of the range set or clear? And for how long? */ +bool +xbitmap64_test( + struct xbitmap64 *bitmap, + uint64_t start, + uint64_t *len) +{ + struct xbitmap64_node *bn; + uint64_t last = start + *len - 1; + + bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); + if (!bn) + return false; + if (bn->bn_start <= start) { + if (bn->bn_last < last) + *len = bn->bn_last - start + 1; + return true; + } + *len = bn->bn_start - start; + return false; +} + +/* u32 bitmap */ + +struct xbitmap32_node { + struct rb_node bn_rbnode; + + /* First set bit of this interval and subtree. */ + uint32_t bn_start; + + /* Last set bit of this interval. */ + uint32_t bn_last; + + /* Last set bit of this subtree. Do not touch this. */ + uint32_t __bn_subtree_last; +}; + +/* Define our own interval tree type with uint32_t parameters. */ + /* - * Record all btree blocks seen while iterating all records of a btree. - * - * We know that the btree query_all function starts at the left edge and walks - * towards the right edge of the tree. Therefore, we know that we can walk up - * the btree cursor towards the root; if the pointer for a given level points - * to the first record/key in that block, we haven't seen this block before; - * and therefore we need to remember that we saw this block in the btree. - * - * So if our btree is: - * - * 4 - * / | \ - * 1 2 3 - * - * Pretend for this example that each leaf block has 100 btree records. For - * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we - * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so - * we record block 4. The list is [1, 4]. - * - * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit - * the loop. The list remains [1, 4]. - * - * For the 101st btree record, we've moved onto leaf block 2. Now - * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that - * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. - * - * For the 102nd record, bc_levels[0].ptr == 2, so we continue. - * - * For the 201st record, we've moved on to leaf block 3. - * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. - * - * For the 300th record we just exit, with the list being [1, 4, 2, 3]. + * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll + * forward-declare them anyway for clarity. */ +static inline __maybe_unused void +xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root); -/* Mark a btree block to the agblock bitmap. */ -STATIC int -xagb_bitmap_visit_btblock( - struct xfs_btree_cur *cur, - int level, - void *priv) +static inline __maybe_unused void +xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root); + +static inline __maybe_unused struct xbitmap32_node * +xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start, + uint32_t last); + +static inline __maybe_unused struct xbitmap32_node * +xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start, + uint32_t last); + +INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t, + __bn_subtree_last, START, LAST, static inline __maybe_unused, + xbitmap32_tree) + +/* Iterate each interval of a bitmap. Do not change the bitmap. */ +#define for_each_xbitmap32_extent(bn, bitmap) \ + for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ + struct xbitmap32_node, bn_rbnode); \ + (bn) != NULL; \ + (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ + struct xbitmap32_node, bn_rbnode)) + +/* Clear a range of this bitmap. */ +int +xbitmap32_clear( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) { - struct xagb_bitmap *bitmap = priv; - struct xfs_buf *bp; - xfs_fsblock_t fsbno; - xfs_agblock_t agbno; + struct xbitmap32_node *bn; + struct xbitmap32_node *new_bn; + uint32_t last = start + len - 1; - xfs_btree_get_block(cur, level, &bp); - if (!bp) - return 0; + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last))) { + if (bn->bn_start < start && bn->bn_last > last) { + uint32_t old_last = bn->bn_last; - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + /* overlaps with the entire clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + + /* add an extent */ + new_bn = kmalloc(sizeof(struct xbitmap32_node), + XCHK_GFP_FLAGS); + if (!new_bn) + return -ENOMEM; + new_bn->bn_start = last + 1; + new_bn->bn_last = old_last; + xbitmap32_tree_insert(new_bn, &bitmap->xb_root); + } else if (bn->bn_start < start) { + /* overlaps with the left side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + } else if (bn->bn_last > last) { + /* overlaps with the right side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_start = last + 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + break; + } else { + /* in the middle of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } + } - return xagb_bitmap_set(bitmap, agbno, 1); + return 0; } -/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +/* Set a range of this bitmap. */ int -xagb_bitmap_set_btblocks( - struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur) +xbitmap32_set( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) { - return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, - XFS_BTREE_VISIT_ALL, bitmap); + struct xbitmap32_node *left; + struct xbitmap32_node *right; + uint32_t last = start + len - 1; + int error; + + /* Is this whole range already set? */ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); + if (left && left->bn_start <= start && left->bn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + error = xbitmap32_clear(bitmap, start, len); + if (error) + return error; + + /* Do we have a left-adjacent extent? */ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + ASSERT(!left || left->bn_last + 1 == start); + + /* Do we have a right-adjacent extent? */ + right = xbitmap32_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + ASSERT(!right || right->bn_start == last + 1); + + if (left && right) { + /* combine left and right adjacent extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + xbitmap32_tree_remove(right, &bitmap->xb_root); + left->bn_last = right->bn_last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + kfree(right); + } else if (left) { + /* combine with left extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } else if (right) { + /* combine with right extent */ + xbitmap32_tree_remove(right, &bitmap->xb_root); + right->bn_start = start; + xbitmap32_tree_insert(right, &bitmap->xb_root); + } else { + /* add an extent */ + left = kmalloc(sizeof(struct xbitmap32_node), XCHK_GFP_FLAGS); + if (!left) + return -ENOMEM; + left->bn_start = start; + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } + + return 0; +} + +/* Free everything related to this bitmap. */ +void +xbitmap32_destroy( + struct xbitmap32 *bitmap) +{ + struct xbitmap32_node *bn; + + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, 0, -1U))) { + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } +} + +/* Set up a per-AG block bitmap. */ +void +xbitmap32_init( + struct xbitmap32 *bitmap) +{ + bitmap->xb_root = RB_ROOT_CACHED; } /* - * Record all the buffers pointed to by the btree cursor. Callers already - * engaged in a btree walk should call this function to capture the list of - * blocks going from the leaf towards the root. + * Remove all the blocks mentioned in @sub from the extents in @bitmap. + * + * The intent is that callers will iterate the rmapbt for all of its records + * for a given owner to generate @bitmap; and iterate all the blocks of the + * metadata structures that are not being rebuilt and have the same rmapbt + * owner to generate @sub. This routine subtracts all the extents + * mentioned in sub from all the extents linked in @bitmap, which leaves + * @bitmap as the list of blocks that are not accounted for, which we assume + * are the dead blocks of the old metadata structure. The blocks mentioned in + * @bitmap can be reaped. + * + * This is the logical equivalent of bitmap &= ~sub. */ int -xagb_bitmap_set_btcur_path( - struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur) +xbitmap32_disunion( + struct xbitmap32 *bitmap, + struct xbitmap32 *sub) { - int i; + struct xbitmap32_node *bn; int error; - for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { - error = xagb_bitmap_visit_btblock(cur, i, bitmap); + if (xbitmap32_empty(bitmap) || xbitmap32_empty(sub)) + return 0; + + for_each_xbitmap32_extent(bn, sub) { + error = xbitmap32_clear(bitmap, bn->bn_start, + bn->bn_last - bn->bn_start + 1); if (error) return error; } @@ -318,14 +507,14 @@ xagb_bitmap_set_btcur_path( } /* How many bits are set in this bitmap? */ -uint64_t -xbitmap_hweight( - struct xbitmap *bitmap) +uint32_t +xbitmap32_hweight( + struct xbitmap32 *bitmap) { - struct xbitmap_node *bn; - uint64_t ret = 0; + struct xbitmap32_node *bn; + uint32_t ret = 0; - for_each_xbitmap_extent(bn, bitmap) + for_each_xbitmap32_extent(bn, bitmap) ret += bn->bn_last - bn->bn_start + 1; return ret; @@ -333,15 +522,15 @@ xbitmap_hweight( /* Call a function for every run of set bits in this bitmap. */ int -xbitmap_walk( - struct xbitmap *bitmap, - xbitmap_walk_fn fn, +xbitmap32_walk( + struct xbitmap32 *bitmap, + xbitmap32_walk_fn fn, void *priv) { - struct xbitmap_node *bn; + struct xbitmap32_node *bn; int error = 0; - for_each_xbitmap_extent(bn, bitmap) { + for_each_xbitmap32_extent(bn, bitmap) { error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); if (error) break; @@ -352,23 +541,23 @@ xbitmap_walk( /* Does this bitmap have no bits set at all? */ bool -xbitmap_empty( - struct xbitmap *bitmap) +xbitmap32_empty( + struct xbitmap32 *bitmap) { return bitmap->xb_root.rb_root.rb_node == NULL; } /* Is the start of the range set or clear? And for how long? */ bool -xbitmap_test( - struct xbitmap *bitmap, - uint64_t start, - uint64_t *len) +xbitmap32_test( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t *len) { - struct xbitmap_node *bn; - uint64_t last = start + *len - 1; + struct xbitmap32_node *bn; + uint32_t last = start + *len - 1; - bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); if (!bn) return false; if (bn->bn_start <= start) { @@ -379,3 +568,17 @@ xbitmap_test( *len = bn->bn_start - start; return false; } + +/* Count the number of set regions in this bitmap. */ +uint32_t +xbitmap32_count_set_regions( + struct xbitmap32 *bitmap) +{ + struct xbitmap32_node *bn; + uint32_t nr = 0; + + for_each_xbitmap32_extent(bn, bitmap) + nr++; + + return nr; +} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 4fe58bad6734..710c1ac5e323 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -6,17 +6,19 @@ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xbitmap { +/* u64 bitmap */ + +struct xbitmap64 { struct rb_root_cached xb_root; }; -void xbitmap_init(struct xbitmap *bitmap); -void xbitmap_destroy(struct xbitmap *bitmap); +void xbitmap64_init(struct xbitmap64 *bitmap); +void xbitmap64_destroy(struct xbitmap64 *bitmap); -int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); -uint64_t xbitmap_hweight(struct xbitmap *bitmap); +int xbitmap64_clear(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_set(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_disunion(struct xbitmap64 *bitmap, struct xbitmap64 *sub); +uint64_t xbitmap64_hweight(struct xbitmap64 *bitmap); /* * Return codes for the bitmap iterator functions are 0 to continue iterating, @@ -25,84 +27,41 @@ uint64_t xbitmap_hweight(struct xbitmap *bitmap); * iteration, because neither bitmap iterator ever generates that error code on * its own. Callers must not modify the bitmap while walking it. */ -typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); -int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, +typedef int (*xbitmap64_walk_fn)(uint64_t start, uint64_t len, void *priv); +int xbitmap64_walk(struct xbitmap64 *bitmap, xbitmap64_walk_fn fn, void *priv); -bool xbitmap_empty(struct xbitmap *bitmap); -bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); +bool xbitmap64_empty(struct xbitmap64 *bitmap); +bool xbitmap64_test(struct xbitmap64 *bitmap, uint64_t start, uint64_t *len); -/* Bitmaps, but for type-checked for xfs_agblock_t */ +/* u32 bitmap */ -struct xagb_bitmap { - struct xbitmap agbitmap; +struct xbitmap32 { + struct rb_root_cached xb_root; }; -static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) -{ - xbitmap_init(&bitmap->agbitmap); -} - -static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) -{ - xbitmap_destroy(&bitmap->agbitmap); -} - -static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, - xfs_agblock_t start, xfs_extlen_t len) -{ - return xbitmap_clear(&bitmap->agbitmap, start, len); -} -static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, - xfs_agblock_t start, xfs_extlen_t len) -{ - return xbitmap_set(&bitmap->agbitmap, start, len); -} - -static inline bool -xagb_bitmap_test( - struct xagb_bitmap *bitmap, - xfs_agblock_t start, - xfs_extlen_t *len) -{ - uint64_t biglen = *len; - bool ret; - - ret = xbitmap_test(&bitmap->agbitmap, start, &biglen); - - if (start + biglen >= UINT_MAX) { - ASSERT(0); - biglen = UINT_MAX - start; - } - - *len = biglen; - return ret; -} +void xbitmap32_init(struct xbitmap32 *bitmap); +void xbitmap32_destroy(struct xbitmap32 *bitmap); -static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, - struct xagb_bitmap *sub) -{ - return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap); -} +int xbitmap32_clear(struct xbitmap32 *bitmap, uint32_t start, uint32_t len); +int xbitmap32_set(struct xbitmap32 *bitmap, uint32_t start, uint32_t len); +int xbitmap32_disunion(struct xbitmap32 *bitmap, struct xbitmap32 *sub); +uint32_t xbitmap32_hweight(struct xbitmap32 *bitmap); -static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) -{ - return xbitmap_hweight(&bitmap->agbitmap); -} -static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) -{ - return xbitmap_empty(&bitmap->agbitmap); -} +/* + * Return codes for the bitmap iterator functions are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iteration caller. The special value -ECANCELED can be used to stop + * iteration, because neither bitmap iterator ever generates that error code on + * its own. Callers must not modify the bitmap while walking it. + */ +typedef int (*xbitmap32_walk_fn)(uint32_t start, uint32_t len, void *priv); +int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn, + void *priv); -static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, - xbitmap_walk_fn fn, void *priv) -{ - return xbitmap_walk(&bitmap->agbitmap, fn, priv); -} +bool xbitmap32_empty(struct xbitmap32 *bitmap); +bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len); -int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur); -int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur); +uint32_t xbitmap32_count_set_regions(struct xbitmap32 *bitmap); #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 9dfa310df311..5ab2ac53c920 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -19,9 +19,11 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "scrub/health.h" #include "xfs_ag.h" /* Set us up with an inode's bmap. */ @@ -48,9 +50,18 @@ xchk_setup_inode_bmap( if (S_ISREG(VFS_I(sc->ip)->i_mode) && sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) { struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + bool is_repair = xchk_could_repair(sc); xchk_ilock(sc, XFS_MMAPLOCK_EXCL); + /* Break all our leases, we're going to mess with things. */ + if (is_repair) { + error = xfs_break_layouts(VFS_I(sc->ip), + &sc->ilock_flags, BREAK_WRITE); + if (error) + goto out; + } + inode_dio_wait(VFS_I(sc->ip)); /* @@ -71,6 +82,15 @@ xchk_setup_inode_bmap( error = filemap_fdatawait_keep_errors(mapping); if (error && (error != -ENOSPC && error != -EIO)) goto out; + + /* Drop the page cache if we're repairing block mappings. */ + if (is_repair) { + error = invalidate_inode_pages2( + VFS_I(sc->ip)->i_mapping); + if (error) + goto out; + } + } /* Got the inode, lock it and we're ready to go. */ @@ -78,6 +98,10 @@ xchk_setup_inode_bmap( if (error) goto out; + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode */ @@ -410,7 +434,7 @@ xchk_bmap_iextent( /* Make sure the extent points to a valid place. */ if (info->is_rt && - !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount)) + !xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount)) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); if (!info->is_rt && @@ -633,6 +657,82 @@ xchk_bmap_check_ag_rmaps( } /* + * Decide if we want to scan the reverse mappings to determine if the attr + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_attrfork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_af; + + /* + * If the dinode repair found a bad attr fork, it will reset the fork + * to extents format with zero records and wait for the this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * Files can have an attr fork in EXTENTS format with zero records for + * several reasons: + * + * a) an attr set created a fork but ran out of space + * b) attr replace deleted an old attr but failed during the set step + * c) the data fork was in btree format when all attrs were deleted, so + * the fork was left in place + * d) the inode repair code zapped the fork + * + * Only in case (d) do we want to scan the rmapbt to see if we need to + * rebuild the attr fork. The fork zap code clears all DAC permission + * bits and zeroes the uid and gid, so avoid the scan if any of those + * three conditions are not met. + */ + if ((VFS_I(ip)->i_mode & 0777) != 0) + return false; + if (!uid_eq(VFS_I(ip)->i_uid, GLOBAL_ROOT_UID)) + return false; + if (!gid_eq(VFS_I(ip)->i_gid, GLOBAL_ROOT_GID)) + return false; + + return true; +} + +/* + * Decide if we want to scan the reverse mappings to determine if the data + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_datafork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_df; + + /* Don't support realtime rmap checks yet. */ + if (XFS_IS_REALTIME_INODE(ip)) + return false; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for the this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * If we encounter an empty data fork along with evidence that the fork + * might not really be empty, we need to scan the reverse mappings to + * decide if we're going to rebuild the fork. Data forks with nonzero + * file size are scanned. + */ + return i_size_read(VFS_I(ip)) != 0; +} + +/* * Decide if we want to walk every rmap btree in the fs to make sure that each * rmap for this file fork has corresponding bmbt entries. */ @@ -641,7 +741,6 @@ xchk_bmap_want_check_rmaps( struct xchk_bmap_info *info) { struct xfs_scrub *sc = info->sc; - struct xfs_ifork *ifp; if (!xfs_has_rmapbt(sc->mp)) return false; @@ -650,28 +749,10 @@ xchk_bmap_want_check_rmaps( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return false; - /* Don't support realtime rmap checks yet. */ - if (info->is_rt) - return false; - - /* - * The inode repair code zaps broken inode forks by resetting them back - * to EXTENTS format and zero extent records. If we encounter a fork - * in this state along with evidence that the fork isn't supposed to be - * empty, we need to scan the reverse mappings to decide if we're going - * to rebuild the fork. Data forks with nonzero file size are scanned. - * xattr forks are never empty of content, so they are always scanned. - */ - ifp = xfs_ifork_ptr(sc->ip, info->whichfork); - if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) { - if (info->whichfork == XFS_DATA_FORK && - i_size_read(VFS_I(sc->ip)) == 0) - return false; - - return true; - } + if (info->whichfork == XFS_ATTR_FORK) + return xchk_bmap_check_empty_attrfork(sc->ip); - return false; + return xchk_bmap_check_empty_datafork(sc->ip); } /* Make sure each rmap has a corresponding bmbt entry. */ @@ -843,7 +924,7 @@ xchk_bmap( if (!ifp) return -ENOENT; - info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip); + info.is_rt = xfs_ifork_is_realtime(ip, whichfork); info.whichfork = whichfork; info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip); info.sc = sc; @@ -945,7 +1026,20 @@ int xchk_bmap_data( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_DATA_FORK); + int error; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTD_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_DATA_FORK); + if (error) + return error; + + /* If the data fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTD_ZAPPED); + return 0; } /* Scrub an inode's attr fork. */ @@ -953,7 +1047,27 @@ int xchk_bmap_attr( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_ATTR_FORK); + int error; + + /* + * If the attr fork has been zapped, it's possible that forkoff was + * reset to zero and hence sc->ip->i_afp is NULL. We don't want the + * NULL ifp check in xchk_bmap to conclude that the attr fork is ok, + * so short circuit that logic by setting the corruption flag and + * returning immediately. + */ + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTA_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_ATTR_FORK); + if (error) + return error; + + /* If the attr fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTA_ZAPPED); + return 0; } /* Scrub an inode's CoW fork. */ diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c new file mode 100644 index 000000000000..4505f4829d53 --- /dev/null +++ b/fs/xfs/scrub/bmap_repair.c @@ -0,0 +1,873 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_reflink.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Fork Block Mapping (BMBT) Repair + * ====================================== + * + * Gather all the rmap records for the inode and fork we're fixing, reset the + * incore fork, then recreate the btree. + */ + +enum reflink_scan_state { + RLS_IRRELEVANT = -1, /* not applicable to this file */ + RLS_UNKNOWN, /* shared extent scans required */ + RLS_SET_IFLAG, /* iflag must be set */ +}; + +struct xrep_bmap { + /* Old bmbt blocks */ + struct xfsb_bitmap old_bmbt_blocks; + + /* New fork. */ + struct xrep_newbt new_bmapbt; + + /* List of new bmap records. */ + struct xfarray *bmap_records; + + struct xfs_scrub *sc; + + /* How many blocks did we find allocated to this file? */ + xfs_rfsblock_t nblocks; + + /* How many bmbt blocks did we find for this fork? */ + xfs_rfsblock_t old_bmbt_block_count; + + /* get_records()'s position in the free space record array. */ + xfarray_idx_t array_cur; + + /* How many real (non-hole, non-delalloc) mappings do we have? */ + uint64_t real_mappings; + + /* Which fork are we fixing? */ + int whichfork; + + /* What d the REFLINK flag be set when the repair is over? */ + enum reflink_scan_state reflink_scan; + + /* Do we allow unwritten extents? */ + bool allow_unwritten; +}; + +/* Is this space extent shared? Flag the inode if it is. */ +STATIC int +xrep_bmap_discover_shared( + struct xrep_bmap *rb, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + struct xfs_scrub *sc = rb->sc; + xfs_agblock_t agbno; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error; + + agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); + error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount, + &fbno, &flen, false); + if (error) + return error; + + if (fbno != NULLAGBLOCK) + rb->reflink_scan = RLS_SET_IFLAG; + + return 0; +} + +/* Remember this reverse-mapping as a series of bmap records. */ +STATIC int +xrep_bmap_from_rmap( + struct xrep_bmap *rb, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + bool unwritten) +{ + struct xfs_bmbt_irec irec = { + .br_startoff = startoff, + .br_startblock = startblock, + .br_state = unwritten ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM, + }; + struct xfs_bmbt_rec rbe; + struct xfs_scrub *sc = rb->sc; + int error = 0; + + /* + * If we're repairing the data fork of a non-reflinked regular file on + * a reflink filesystem, we need to figure out if this space extent is + * shared. + */ + if (rb->reflink_scan == RLS_UNKNOWN && !unwritten) { + error = xrep_bmap_discover_shared(rb, startblock, blockcount); + if (error) + return error; + } + + do { + xfs_failaddr_t fa; + + irec.br_blockcount = min_t(xfs_filblks_t, blockcount, + XFS_MAX_BMBT_EXTLEN); + + fa = xfs_bmap_validate_extent(sc->ip, rb->whichfork, &irec); + if (fa) + return -EFSCORRUPTED; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(sc->ip, rb->whichfork, &irec); + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + + rb->real_mappings++; + + irec.br_startblock += irec.br_blockcount; + irec.br_startoff += irec.br_blockcount; + blockcount -= irec.br_blockcount; + } while (blockcount > 0); + + return 0; +} + +/* Check for any obvious errors or conflicts in the file mapping. */ +STATIC int +xrep_bmap_check_fork_rmap( + struct xrep_bmap *rb, + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec) +{ + struct xfs_scrub *sc = rb->sc; + enum xbtree_recpacking outcome; + int error; + + /* + * Data extents for rt files are never stored on the data device, but + * everything else (xattrs, bmbt blocks) can be. + */ + if (XFS_IS_REALTIME_INODE(sc->ip) && + !(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) + return -EFSCORRUPTED; + + /* Check that this is within the AG. */ + if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Check the file offset range. */ + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + !xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* No contradictory flags. */ + if ((rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) && + (rec->rm_flags & XFS_RMAP_UNWRITTEN)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rm_startblock, rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record extents that belong to this inode's fork. */ +STATIC int +xrep_bmap_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_bmap *rb = priv; + struct xfs_mount *mp = cur->bc_mp; + xfs_fsblock_t fsbno; + int error = 0; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + if (rec->rm_owner != rb->sc->ip->i_ino) + return 0; + + error = xrep_bmap_check_fork_rmap(rb, cur, rec); + if (error) + return error; + + /* + * Record all blocks allocated to this file even if the extent isn't + * for the fork we're rebuilding so that we can reset di_nblocks later. + */ + rb->nblocks += rec->rm_blockcount; + + /* If this rmap isn't for the fork we want, we're done. */ + if (rb->whichfork == XFS_DATA_FORK && + (rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + if (rb->whichfork == XFS_ATTR_FORK && + !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + + /* Reject unwritten extents if we don't allow those. */ + if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten) + return -EFSCORRUPTED; + + fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, + rec->rm_startblock); + + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) { + rb->old_bmbt_block_count += rec->rm_blockcount; + return xfsb_bitmap_set(&rb->old_bmbt_blocks, fsbno, + rec->rm_blockcount); + } + + return xrep_bmap_from_rmap(rb, rec->rm_offset, fsbno, + rec->rm_blockcount, + rec->rm_flags & XFS_RMAP_UNWRITTEN); +} + +/* + * Compare two block mapping records. We want to sort in order of increasing + * file offset. + */ +static int +xrep_bmap_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_bmbt_rec *ba = a; + const struct xfs_bmbt_rec *bb = b; + xfs_fileoff_t ao = xfs_bmbt_disk_get_startoff(ba); + xfs_fileoff_t bo = xfs_bmbt_disk_get_startoff(bb); + + if (ao > bo) + return 1; + else if (ao < bo) + return -1; + return 0; +} + +/* + * Sort the bmap extents by fork offset or else the records will be in the + * wrong order. Ensure there are no overlaps in the file offset ranges. + */ +STATIC int +xrep_bmap_sort_records( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + xfs_fileoff_t next_off = 0; + xfarray_idx_t array_cur; + int error; + + error = xfarray_sort(rb->bmap_records, xrep_bmap_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + if (irec.br_startoff < next_off) + return -EFSCORRUPTED; + + next_off = irec.br_startoff + irec.br_blockcount; + } + + return 0; +} + +/* Scan one AG for reverse mappings that we can turn into extent maps. */ +STATIC int +xrep_bmap_scan_ag( + struct xrep_bmap *rb, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_bmap_walk_rmap, rb); + xchk_ag_free(sc, &sc->sa); + return error; +} + +/* Find the delalloc extents from the old incore extent tree. */ +STATIC int +xrep_bmap_find_delalloc( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + struct xfs_iext_cursor icur; + struct xfs_bmbt_rec rbe; + struct xfs_inode *ip = rb->sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, rb->whichfork); + int error = 0; + + /* + * Skip this scan if we don't expect to find delayed allocation + * reservations in this fork. + */ + if (rb->whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0) + return 0; + + for_each_xfs_iext(ifp, &icur, &irec) { + if (!isnullstartblock(irec.br_startblock)) + continue; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(ip, rb->whichfork, &irec); + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + } + + return 0; +} + +/* + * Collect block mappings for this fork of this inode and decide if we have + * enough space to rebuild. Caller is responsible for cleaning up the list if + * anything goes wrong. + */ +STATIC int +xrep_bmap_find_mappings( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error = 0; + + /* Iterate the rmaps for extents. */ + for_each_perag(sc->mp, agno, pag) { + error = xrep_bmap_scan_ag(rb, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + return xrep_bmap_find_delalloc(rb); +} + +/* Retrieve real extent mappings for bulk loading the bmap btree. */ +STATIC int +xrep_bmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_bmbt_rec rec; + struct xfs_bmbt_irec *irec = &cur->bc_rec.b; + struct xrep_bmap *rb = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(rb->bmap_records, rb->array_cur++, + &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, irec); + } while (isnullstartblock(irec->br_startblock)); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_bmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_bmap *rb = priv; + + return xrep_newbt_claim_block(cur, &rb->new_bmapbt, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_bmap_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + ASSERT(level > 0); + + return xfs_bmap_broot_space_calc(cur->bc_mp, nr_this_level); +} + +/* Update the inode counters. */ +STATIC int +xrep_bmap_reset_counters( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int64_t delta; + + if (rb->reflink_scan == RLS_SET_IFLAG) + sc->ip->i_diflags2 |= XFS_DIFLAG2_REFLINK; + + /* + * Update the inode block counts to reflect the extents we found in the + * rmapbt. + */ + delta = ifake->if_blocks - rb->old_bmbt_block_count; + sc->ip->i_nblocks = rb->nblocks + delta; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + + /* + * Adjust the quota counts by the difference in size between the old + * and new bmbt. + */ + xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, delta); + return 0; +} + +/* + * Create a new iext tree and load it with block mappings. If the inode is + * in extents format, that's all we need to do to commit the new mappings. + * If it is in btree format, this takes care of preloading the incore tree. + */ +STATIC int +xrep_bmap_extents_load( + struct xrep_bmap *rb) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp = rb->new_bmapbt.ifake.if_fork; + xfarray_idx_t array_cur; + int error; + + ASSERT(ifp->if_bytes == 0); + + /* Add all the mappings (incl. delalloc) to the incore extent tree. */ + xfs_iext_first(ifp, &icur); + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + xfs_iext_insert_raw(ifp, &icur, &irec); + if (!isnullstartblock(irec.br_startblock)) + ifp->if_nextents++; + + xfs_iext_next(ifp, &icur); + } + + return xrep_ino_ensure_extent_count(rb->sc, rb->whichfork, + ifp->if_nextents); +} + +/* + * Reserve new btree blocks, bulk load the bmap records into the ondisk btree, + * and load the incore extent tree. + */ +STATIC int +xrep_bmap_btree_load( + struct xrep_bmap *rb, + struct xfs_btree_cur *bmap_cur) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(bmap_cur, + &rb->new_bmapbt.bload, rb->real_mappings); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Guess how many blocks we're going to need to rebuild an entire bmap + * from the number of extents we found, and pump up our transaction to + * have sufficient block reservation. We're allowed to exceed file + * quota to repair inconsistent metadata. + */ + error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, + rb->new_bmapbt.bload.nr_blocks, 0, true); + if (error) + return error; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rb->new_bmapbt, + rb->new_bmapbt.bload.nr_blocks); + if (error) + return error; + + /* Add all observed bmap records. */ + rb->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bmap_cur, &rb->new_bmapbt.bload, rb); + if (error) + return error; + + /* + * Load the new bmap records into the new incore extent tree to + * preserve delalloc reservations for regular files. The directory + * code loads the extent tree during xfs_dir_open and assumes + * thereafter that it remains loaded, so we must not violate that + * assumption. + */ + return xrep_bmap_extents_load(rb); +} + +/* + * Use the collected bmap information to stage a new bmap fork. If this is + * successful we'll return with the new fork information logged to the repair + * transaction but not yet committed. The caller must ensure that the inode + * is joined to the transaction; the inode will be joined to a clean + * transaction when the function returns. + */ +STATIC int +xrep_bmap_build_new_fork( + struct xrep_bmap *rb) +{ + struct xfs_owner_info oinfo; + struct xfs_scrub *sc = rb->sc; + struct xfs_btree_cur *bmap_cur; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int error; + + error = xrep_bmap_sort_records(rb); + if (error) + return error; + + /* + * Prepare to construct the new fork by initializing the new btree + * structure and creating a fake ifork in the ifakeroot structure. + */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + error = xrep_newbt_init_inode(&rb->new_bmapbt, sc, rb->whichfork, + &oinfo); + if (error) + return error; + + rb->new_bmapbt.bload.get_records = xrep_bmap_get_records; + rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block; + rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size; + + /* + * Allocate a new bmap btree cursor for reloading an inode block mapping + * data structure. + */ + bmap_cur = xfs_bmbt_init_cursor(sc->mp, NULL, sc->ip, XFS_STAGING_FORK); + xfs_btree_stage_ifakeroot(bmap_cur, ifake); + + /* + * Figure out the size and format of the new fork, then fill it with + * all the bmap records we've found. Join the inode to the transaction + * so that we can roll the transaction while holding the inode locked. + */ + if (rb->real_mappings <= XFS_IFORK_MAXEXT(sc->ip, rb->whichfork)) { + ifake->if_fork->if_format = XFS_DINODE_FMT_EXTENTS; + error = xrep_bmap_extents_load(rb); + } else { + ifake->if_fork->if_format = XFS_DINODE_FMT_BTREE; + error = xrep_bmap_btree_load(rb, bmap_cur); + } + if (error) + goto err_cur; + + /* + * Install the new fork in the inode. After this point the old mapping + * data are no longer accessible and the new tree is live. We delete + * the cursor immediately after committing the staged root because the + * staged fork might be in extents format. + */ + xfs_bmbt_commit_staged_btree(bmap_cur, sc->tp, rb->whichfork); + xfs_btree_del_cursor(bmap_cur, 0); + + /* Reset the inode counters now that we've changed the fork. */ + error = xrep_bmap_reset_counters(rb); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rb->new_bmapbt); + if (error) + return error; + + return xrep_roll_trans(sc); + +err_cur: + if (bmap_cur) + xfs_btree_del_cursor(bmap_cur, error); +err_newbt: + xrep_newbt_cancel(&rb->new_bmapbt); + return error; +} + +/* + * Now that we've logged the new inode btree, invalidate all of the old blocks + * and free them, if there were any. + */ +STATIC int +xrep_bmap_remove_old_tree( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_owner_info oinfo; + + /* Free the old bmbt blocks if they're not in use. */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + return xrep_reap_fsblocks(sc, &rb->old_bmbt_blocks, &oinfo); +} + +/* Check for garbage inputs. Returns -ECANCELED if there's nothing to do. */ +STATIC int +xrep_bmap_check_inputs( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); + + ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK); + + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + + /* No fork means nothing to rebuild. */ + if (!ifp) + return -ECANCELED; + + /* + * We only know how to repair extent mappings, which is to say that we + * only support extents and btree fork format. Repairs to a local + * format fork require a higher level repair function, so we do not + * have any work to do here. + */ + switch (ifp->if_format) { + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_UUID: + return -ECANCELED; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return -EFSCORRUPTED; + } + + if (whichfork == XFS_ATTR_FORK) + return 0; + + /* Only files, symlinks, and directories get to have data forks. */ + switch (VFS_I(sc->ip)->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + /* ok */ + break; + default: + return -EINVAL; + } + + /* Don't know how to rebuild realtime data forks. */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; + + return 0; +} + +/* Set up the initial state of the reflink scan. */ +static inline enum reflink_scan_state +xrep_bmap_init_reflink_scan( + struct xfs_scrub *sc, + int whichfork) +{ + /* cannot share on non-reflink filesystem */ + if (!xfs_has_reflink(sc->mp)) + return RLS_IRRELEVANT; + + /* preserve flag if it's already set */ + if (xfs_is_reflink_inode(sc->ip)) + return RLS_SET_IFLAG; + + /* can only share regular files */ + if (!S_ISREG(VFS_I(sc->ip)->i_mode)) + return RLS_IRRELEVANT; + + /* cannot share attr fork extents */ + if (whichfork != XFS_DATA_FORK) + return RLS_IRRELEVANT; + + /* cannot share realtime extents */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return RLS_IRRELEVANT; + + return RLS_UNKNOWN; +} + +/* Repair an inode fork. */ +int +xrep_bmap( + struct xfs_scrub *sc, + int whichfork, + bool allow_unwritten) +{ + struct xrep_bmap *rb; + char *descr; + xfs_extnum_t max_bmbt_recs; + bool large_extcount; + int error = 0; + + error = xrep_bmap_check_inputs(sc, whichfork); + if (error == -ECANCELED) + return 0; + if (error) + return error; + + rb = kzalloc(sizeof(struct xrep_bmap), XCHK_GFP_FLAGS); + if (!rb) + return -ENOMEM; + rb->sc = sc; + rb->whichfork = whichfork; + rb->reflink_scan = xrep_bmap_init_reflink_scan(sc, whichfork); + rb->allow_unwritten = allow_unwritten; + + /* Set up enough storage to handle the max records for this fork. */ + large_extcount = xfs_has_large_extent_counts(sc->mp); + max_bmbt_recs = xfs_iext_max_nextents(large_extcount, whichfork); + descr = xchk_xfile_ino_descr(sc, "%s fork mapping records", + whichfork == XFS_DATA_FORK ? "data" : "attr"); + error = xfarray_create(descr, max_bmbt_recs, + sizeof(struct xfs_bmbt_rec), &rb->bmap_records); + kfree(descr); + if (error) + goto out_rb; + + /* Collect all reverse mappings for this fork's extents. */ + xfsb_bitmap_init(&rb->old_bmbt_blocks); + error = xrep_bmap_find_mappings(rb); + if (error) + goto out_bitmap; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the bmap information. */ + error = xrep_bmap_build_new_fork(rb); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_bmap_remove_old_tree(rb); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&rb->old_bmbt_blocks); + xfarray_destroy(rb->bmap_records); +out_rb: + kfree(rb); + return error; +} + +/* Repair an inode's data fork. */ +int +xrep_bmap_data( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_DATA_FORK, true); +} + +/* Repair an inode's attr fork. */ +int +xrep_bmap_attr( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_ATTR_FORK, false); +} diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index c3a9f33e5a8d..fe678a0438bc 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -47,7 +47,7 @@ __xchk_btree_process_error( *error = 0; fallthrough; default: - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) trace_xchk_ifork_btree_op_error(sc, cur, level, *error, ret_ip); else @@ -91,7 +91,7 @@ __xchk_btree_set_corrupt( { sc->sm->sm_flags |= errflag; - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) trace_xchk_ifork_btree_error(sc, cur, level, ret_ip); else @@ -168,7 +168,7 @@ xchk_btree_rec( if (xfs_btree_keycmp_lt(cur, &key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, 1); - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Is high_key(rec) no larger than the parent high key? */ @@ -215,7 +215,7 @@ xchk_btree_key( if (xfs_btree_keycmp_lt(cur, key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, level); - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Is this block's high key no larger than the parent high key? */ @@ -236,22 +236,18 @@ xchk_btree_ptr_ok( int level, union xfs_btree_ptr *ptr) { - bool res; - /* A btree rooted in an inode has no block pointer to the root. */ - if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if (bs->cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == bs->cur->bc_nlevels) return true; /* Otherwise, check the pointers. */ - if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) - res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level); - else - res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level); - if (!res) + if (__xfs_btree_check_ptr(bs->cur, ptr, 0, level)) { xchk_btree_set_corrupt(bs->sc, bs->cur, level); + return false; + } - return res; + return true; } /* Check that a btree block's sibling matches what we expect it. */ @@ -374,14 +370,12 @@ xchk_btree_check_block_owner( { xfs_agnumber_t agno; xfs_agblock_t agbno; - xfs_btnum_t btnum; bool init_sa; int error = 0; if (!bs->cur) return 0; - btnum = bs->cur->bc_btnum; agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr); agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr); @@ -390,7 +384,7 @@ xchk_btree_check_block_owner( * sc->sa so that we can check for the presence of an ownership record * in the rmap btree for the AG containing the block. */ - init_sa = bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE; + init_sa = bs->cur->bc_ops->type != XFS_BTREE_TYPE_AG; if (init_sa) { error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, @@ -404,11 +398,11 @@ xchk_btree_check_block_owner( * have to nullify it (to shut down further block owner checks) if * self-xref encounters problems. */ - if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO) + if (!bs->sc->sa.bno_cur && xfs_btree_is_bno(bs->cur->bc_ops)) bs->cur = NULL; xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo); - if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP) + if (!bs->sc->sa.rmap_cur && xfs_btree_is_rmap(bs->cur->bc_ops)) bs->cur = NULL; out_free: @@ -434,7 +428,7 @@ xchk_btree_check_owner( * up. */ if (bp == NULL) { - if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) + if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE) xchk_btree_set_corrupt(bs->sc, bs->cur, level); return 0; } @@ -447,7 +441,7 @@ xchk_btree_check_owner( * duplicate cursors. Therefore, save the buffer daddr for * later scanning. */ - if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { + if (xfs_btree_is_bno(cur->bc_ops) || xfs_btree_is_rmap(cur->bc_ops)) { struct check_owner *co; co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS); @@ -480,7 +474,7 @@ xchk_btree_check_iroot_minrecs( * existing filesystems, so instead we disable the check for data fork * bmap btrees when there's an attr fork. */ - if (bs->cur->bc_btnum == XFS_BTNUM_BMAP && + if (xfs_btree_is_bmap(bs->cur->bc_ops) && bs->cur->bc_ino.whichfork == XFS_DATA_FORK && xfs_inode_has_attr_fork(bs->sc->ip)) return false; @@ -513,7 +507,7 @@ xchk_btree_check_minrecs( * child block might be less than the standard minrecs, but that's ok * provided that there's only one direct child of the root. */ - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == cur->bc_nlevels - 2) { struct xfs_btree_block *root_block; struct xfs_buf *root_bp; @@ -567,7 +561,7 @@ xchk_btree_block_check_keys( return; } - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Make sure the high key of this block matches the parent. */ @@ -590,7 +584,6 @@ xchk_btree_get_block( struct xfs_btree_block **pblock, struct xfs_buf **pbp) { - xfs_failaddr_t failed_at; int error; *pblock = NULL; @@ -602,13 +595,7 @@ xchk_btree_get_block( return error; xfs_btree_get_block(bs->cur, level, pbp); - if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) - failed_at = __xfs_btree_check_lblock(bs->cur, *pblock, - level, *pbp); - else - failed_at = __xfs_btree_check_sblock(bs->cur, *pblock, - level, *pbp); - if (failed_at) { + if (__xfs_btree_check_block(bs->cur, *pblock, level, *pbp)) { xchk_btree_set_corrupt(bs->sc, bs->cur, level); return 0; } @@ -669,7 +656,7 @@ xchk_btree_block_keys( if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys)) xchk_btree_set_corrupt(bs->sc, cur, 1); - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Get high keys */ @@ -733,7 +720,7 @@ xchk_btree( * error codes for us. */ level = cur->bc_nlevels - 1; - cur->bc_ops->init_ptr_from_cur(cur, &ptr); + xfs_btree_init_ptr_from_cur(cur, &ptr); if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr)) goto out; error = xchk_btree_get_block(bs, level, &ptr, &block, &bp); diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index f10cd4fb0abd..22f5f1a9d3f0 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -25,9 +25,15 @@ #include "xfs_trans_priv.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" +#include "xfs_dir2.h" #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_exchmaps.h" +#include "xfs_rtbitmap.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -81,6 +87,15 @@ __xchk_process_error( sc->ip ? sc->ip : XFS_I(file_inode(sc->file)), sc->sm, *error); break; + case -ECANCELED: + /* + * ECANCELED here means that the caller set one of the scrub + * outcome flags (corrupt, xfail, xcorrupt) and wants to exit + * quickly. Set error to zero and do not continue. + */ + trace_xchk_op_error(sc, agno, bno, *error, ret_ip); + *error = 0; + break; case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ @@ -88,8 +103,7 @@ __xchk_process_error( *error = 0; fallthrough; default: - trace_xchk_op_error(sc, agno, bno, *error, - ret_ip); + trace_xchk_op_error(sc, agno, bno, *error, ret_ip); break; } return false; @@ -135,6 +149,16 @@ __xchk_fblock_process_error( /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; + case -ECANCELED: + /* + * ECANCELED here means that the caller set one of the scrub + * outcome flags (corrupt, xfail, xcorrupt) and wants to exit + * quickly. Set error to zero and do not continue. + */ + trace_xchk_file_op_error(sc, whichfork, offset, *error, + ret_ip); + *error = 0; + break; case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ @@ -226,6 +250,19 @@ xchk_block_set_corrupt( trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address); } +#ifdef CONFIG_XFS_QUOTA +/* Record a corrupt quota counter. */ +void +xchk_qcheck_set_corrupt( + struct xfs_scrub *sc, + unsigned int dqtype, + xfs_dqid_t id) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xchk_qcheck_error(sc, dqtype, id, __return_address); +} +#endif + /* Record a corruption while cross-referencing. */ void xchk_block_xref_set_corrupt( @@ -411,7 +448,7 @@ xchk_perag_read_headers( { int error; - error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp); + error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) return error; @@ -426,7 +463,7 @@ xchk_perag_read_headers( * Grab the AG headers for the attached perag structure and wait for pending * intents to drain. */ -static int +int xchk_perag_drain_and_lock( struct xfs_scrub *sc) { @@ -554,46 +591,50 @@ xchk_ag_btcur_init( { struct xfs_mount *mp = sc->mp; - if (sa->agf_bp && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) { + if (sa->agf_bp) { /* Set up a bnobt cursor for cross-referencing. */ - sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - sa->pag, XFS_BTNUM_BNO); - } + sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp, + sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur, + XFS_SCRUB_TYPE_BNOBT); - if (sa->agf_bp && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) { /* Set up a cntbt cursor for cross-referencing. */ - sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - sa->pag, XFS_BTNUM_CNT); - } - - /* Set up a inobt cursor for cross-referencing. */ - if (sa->agi_bp && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) { - sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, - XFS_BTNUM_INO); - } - - /* Set up a finobt cursor for cross-referencing. */ - if (sa->agi_bp && xfs_has_finobt(mp) && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { - sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, - XFS_BTNUM_FINO); - } - - /* Set up a rmapbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_has_rmapbt(mp) && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) { - sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, + sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp, sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur, + XFS_SCRUB_TYPE_CNTBT); + + /* Set up a rmapbt cursor for cross-referencing. */ + if (xfs_has_rmapbt(mp)) { + sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, + sa->agf_bp, sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur, + XFS_SCRUB_TYPE_RMAPBT); + } + + /* Set up a refcountbt cursor for cross-referencing. */ + if (xfs_has_reflink(mp)) { + sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, + sa->agf_bp, sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur, + XFS_SCRUB_TYPE_REFCNTBT); + } } - /* Set up a refcountbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_has_reflink(mp) && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) { - sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, - sa->agf_bp, sa->pag); + if (sa->agi_bp) { + /* Set up a inobt cursor for cross-referencing. */ + sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, + sa->agi_bp); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur, + XFS_SCRUB_TYPE_INOBT); + + /* Set up a finobt cursor for cross-referencing. */ + if (xfs_has_finobt(mp)) { + sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp, + sa->agi_bp); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur, + XFS_SCRUB_TYPE_FINOBT); + } } } @@ -604,6 +645,7 @@ xchk_ag_free( struct xchk_ag *sa) { xchk_ag_btcur_free(sa); + xrep_reset_perag_resv(sc); if (sa->agf_bp) { xfs_trans_brelse(sc->tp, sa->agf_bp); sa->agf_bp = NULL; @@ -651,6 +693,13 @@ xchk_trans_cancel( sc->tp = NULL; } +int +xchk_trans_alloc_empty( + struct xfs_scrub *sc) +{ + return xfs_trans_alloc_empty(sc->mp, &sc->tp); +} + /* * Grab an empty transaction so that we can re-grab locked buffers if * one of our btrees turns out to be cyclic. @@ -670,7 +719,7 @@ xchk_trans_alloc( return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, resblks, 0, 0, &sc->tp); - return xfs_trans_alloc_empty(sc->mp, &sc->tp); + return xchk_trans_alloc_empty(sc); } /* Set us up with a transaction and an empty context. */ @@ -781,7 +830,7 @@ again: * in the iget cache miss path. */ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - error = xfs_ialloc_read_agi(pag, tp, agi_bpp); + error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp); xfs_perag_put(pag); if (error) return error; @@ -818,6 +867,26 @@ again: return 0; } +#ifdef CONFIG_XFS_QUOTA +/* + * Try to attach dquots to this inode if we think we might want to repair it. + * Callers must not hold any ILOCKs. If the dquots are broken and cannot be + * attached, a quotacheck will be scheduled. + */ +int +xchk_ino_dqattach( + struct xfs_scrub *sc) +{ + ASSERT(sc->tp != NULL); + ASSERT(sc->ip != NULL); + + if (!xchk_could_repair(sc)) + return 0; + + return xrep_ino_dqattach(sc); +} +#endif + /* Install an inode that we opened by handle for scrubbing. */ int xchk_install_handle_inode( @@ -1021,6 +1090,11 @@ xchk_setup_inode_contents( error = xchk_trans_alloc(sc, resblks); if (error) goto out; + + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode for us */ @@ -1125,25 +1199,12 @@ xchk_metadata_inode_subtype( struct xfs_scrub *sc, unsigned int scrub_type) { - __u32 smtype = sc->sm->sm_type; + struct xfs_scrub_subord *sub; int error; - sc->sm->sm_type = scrub_type; - - switch (scrub_type) { - case XFS_SCRUB_TYPE_INODE: - error = xchk_inode(sc); - break; - case XFS_SCRUB_TYPE_BMBTD: - error = xchk_bmap_data(sc); - break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - break; - } - - sc->sm->sm_type = smtype; + sub = xchk_scrub_create_subord(sc, scrub_type); + error = sub->sc.ops->scrub(&sub->sc); + xchk_scrub_free_subord(sub); return error; } @@ -1222,6 +1283,15 @@ xchk_fsgates_enable( if (scrub_fsgates & XCHK_FSGATES_DRAIN) xfs_drain_wait_enable(); + if (scrub_fsgates & XCHK_FSGATES_QUOTA) + xfs_dqtrx_hook_enable(); + + if (scrub_fsgates & XCHK_FSGATES_DIRENTS) + xfs_dir_hook_enable(); + + if (scrub_fsgates & XCHK_FSGATES_RMAP) + xfs_rmap_hook_enable(); + sc->flags |= scrub_fsgates; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index c83cf9e5b55f..eb00d48590f2 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -6,32 +6,8 @@ #ifndef __XFS_SCRUB_COMMON_H__ #define __XFS_SCRUB_COMMON_H__ -/* - * We /could/ terminate a scrub/repair operation early. If we're not - * in a good place to continue (fatal signal, etc.) then bail out. - * Note that we're careful not to make any judgements about *error. - */ -static inline bool -xchk_should_terminate( - struct xfs_scrub *sc, - int *error) -{ - /* - * If preemption is disabled, we need to yield to the scheduler every - * few seconds so that we don't run afoul of the soft lockup watchdog - * or RCU stall detector. - */ - cond_resched(); - - if (fatal_signal_pending(current)) { - if (*error == 0) - *error = -EINTR; - return true; - } - return false; -} - int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); +int xchk_trans_alloc_empty(struct xfs_scrub *sc); void xchk_trans_cancel(struct xfs_scrub *sc); bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, @@ -54,6 +30,10 @@ void xchk_block_set_corrupt(struct xfs_scrub *sc, void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino); void xchk_fblock_set_corrupt(struct xfs_scrub *sc, int whichfork, xfs_fileoff_t offset); +#ifdef CONFIG_XFS_QUOTA +void xchk_qcheck_set_corrupt(struct xfs_scrub *sc, unsigned int dqtype, + xfs_dqid_t id); +#endif void xchk_block_xref_set_corrupt(struct xfs_scrub *sc, struct xfs_buf *bp); @@ -73,6 +53,11 @@ int xchk_checkpoint_log(struct xfs_mount *mp); bool xchk_should_check_xref(struct xfs_scrub *sc, int *error, struct xfs_btree_cur **curpp); +static inline int xchk_setup_nothing(struct xfs_scrub *sc) +{ + return -ENOENT; +} + /* Setup functions */ int xchk_setup_agheader(struct xfs_scrub *sc); int xchk_setup_fs(struct xfs_scrub *sc); @@ -87,35 +72,34 @@ int xchk_setup_directory(struct xfs_scrub *sc); int xchk_setup_xattr(struct xfs_scrub *sc); int xchk_setup_symlink(struct xfs_scrub *sc); int xchk_setup_parent(struct xfs_scrub *sc); +int xchk_setup_dirtree(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xchk_setup_rtbitmap(struct xfs_scrub *sc); int xchk_setup_rtsummary(struct xfs_scrub *sc); #else -static inline int -xchk_setup_rtbitmap(struct xfs_scrub *sc) -{ - return -ENOENT; -} -static inline int -xchk_setup_rtsummary(struct xfs_scrub *sc) -{ - return -ENOENT; -} +# define xchk_setup_rtbitmap xchk_setup_nothing +# define xchk_setup_rtsummary xchk_setup_nothing #endif #ifdef CONFIG_XFS_QUOTA +int xchk_ino_dqattach(struct xfs_scrub *sc); int xchk_setup_quota(struct xfs_scrub *sc); +int xchk_setup_quotacheck(struct xfs_scrub *sc); #else static inline int -xchk_setup_quota(struct xfs_scrub *sc) +xchk_ino_dqattach(struct xfs_scrub *sc) { - return -ENOENT; + return 0; } +# define xchk_setup_quota xchk_setup_nothing +# define xchk_setup_quotacheck xchk_setup_nothing #endif int xchk_setup_fscounters(struct xfs_scrub *sc); +int xchk_setup_nlinks(struct xfs_scrub *sc); void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, struct xchk_ag *sa); +int xchk_perag_drain_and_lock(struct xfs_scrub *sc); /* * Grab all AG resources, treating the inability to grab the perag structure as @@ -192,7 +176,9 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT); } -#ifdef CONFIG_XFS_ONLINE_REPAIR +bool xchk_dir_looks_zapped(struct xfs_inode *dp); +bool xchk_pptr_looks_zapped(struct xfs_inode *ip); + /* Decide if a repair is required. */ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) { @@ -200,9 +186,18 @@ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT | XFS_SCRUB_OFLAG_PREEN); } -#else -# define xchk_needs_repair(sc) (false) -#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +/* + * "Should we prepare for a repair?" + * + * Return true if the caller permits us to repair metadata and we're not + * setting up for a post-repair evaluation. + */ +static inline bool xchk_could_repair(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + !(sc->flags & XREP_ALREADY_FIXED); +} int xchk_metadata_inode_forks(struct xfs_scrub *sc); @@ -213,6 +208,16 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc); #define xchk_xfile_descr(sc, fmt, ...) \ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \ (sc)->mp->m_super->s_id, ##__VA_ARGS__) +#define xchk_xfile_ag_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \ + ##__VA_ARGS__) +#define xchk_xfile_ino_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \ + ##__VA_ARGS__) /* * Setting up a hook to wait for intents to drain is costly -- we have to take diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c new file mode 100644 index 000000000000..4de3f0f40f48 --- /dev/null +++ b/fs/xfs/scrub/cow_repair.c @@ -0,0 +1,614 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_errortag.h" +#include "xfs_icache.h" +#include "xfs_refcount_btree.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/off_bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/reap.h" + +/* + * CoW Fork Mapping Repair + * ======================= + * + * Although CoW staging extents are owned by incore CoW inode forks, on disk + * they are owned by the refcount btree. The ondisk metadata does not record + * any ownership information, which limits what we can do to repair the + * mappings in the CoW fork. At most, we can replace ifork mappings that lack + * an entry in the refcount btree or are described by a reverse mapping record + * whose owner is not OWN_COW. + * + * Replacing extents is also tricky -- we can't touch written CoW fork extents + * since they are undergoing writeback, and delalloc extents do not require + * repair since they only exist incore. Hence the most we can do is find the + * bad parts of unwritten mappings, allocate a replacement set of blocks, and + * replace the incore mapping. We use the regular reaping process to unmap + * or free the discarded blocks, as appropriate. + */ +struct xrep_cow { + struct xfs_scrub *sc; + + /* Bitmap of file offset ranges that need replacing. */ + struct xoff_bitmap bad_fileoffs; + + /* Bitmap of fsblocks that were removed from the CoW fork. */ + struct xfsb_bitmap old_cowfork_fsblocks; + + /* CoW fork mappings used to scan for bad CoW staging extents. */ + struct xfs_bmbt_irec irec; + + /* refcount btree block number of irec.br_startblock */ + unsigned int irec_startbno; + + /* refcount btree block number of the next refcount record we expect */ + unsigned int next_bno; +}; + +/* CoW staging extent. */ +struct xrep_cow_extent { + xfs_fsblock_t fsbno; + xfs_extlen_t len; +}; + +/* + * Mark the part of the file range that corresponds to the given physical + * space. Caller must ensure that the physical range is within xc->irec. + */ +STATIC int +xrep_cow_mark_file_range( + struct xrep_cow *xc, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + xfs_fileoff_t startoff; + + startoff = xc->irec.br_startoff + + (startblock - xc->irec.br_startblock); + + trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff, + blockcount); + + return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount); +} + +/* + * Trim @src to fit within the CoW fork mapping being examined, and put the + * result in @dst. + */ +static inline void +xrep_cow_trim_refcount( + struct xrep_cow *xc, + struct xfs_refcount_irec *dst, + const struct xfs_refcount_irec *src) +{ + unsigned int adj; + + memcpy(dst, src, sizeof(*dst)); + + if (dst->rc_startblock < xc->irec_startbno) { + adj = xc->irec_startbno - dst->rc_startblock; + dst->rc_blockcount -= adj; + dst->rc_startblock += adj; + } + + if (dst->rc_startblock + dst->rc_blockcount > + xc->irec_startbno + xc->irec.br_blockcount) { + adj = (dst->rc_startblock + dst->rc_blockcount) - + (xc->irec_startbno + xc->irec.br_blockcount); + dst->rc_blockcount -= adj; + } +} + +/* Mark any shared CoW staging extents. */ +STATIC int +xrep_cow_mark_shared_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + xfs_fsblock_t fsbno; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_SHARED) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + rrec.rc_startblock); + return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount); +} + +/* + * Mark any portion of the CoW fork file offset range where there is not a CoW + * staging extent record in the refcountbt, and keep a record of where we did + * find correct refcountbt records. Staging records are always cleaned out at + * mount time, so any two inodes trying to map the same staging area would have + * already taken the fs down due to refcount btree verifier errors. Hence this + * inode should be the sole creator of the staging extent records ondisk. + */ +STATIC int +xrep_cow_mark_missing_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + int error; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + if (xc->next_bno >= rrec.rc_startblock) + goto next; + + error = xrep_cow_mark_file_range(xc, + XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + xc->next_bno), + rrec.rc_startblock - xc->next_bno); + if (error) + return error; + +next: + xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount; + return 0; +} + +/* + * Mark any area that does not correspond to a CoW staging rmap. These are + * cross-linked areas that must be avoided. + */ +STATIC int +xrep_cow_mark_missing_staging_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + xfs_fsblock_t fsbno; + xfs_agblock_t rec_bno; + xfs_extlen_t rec_len; + unsigned int adj; + + if (rec->rm_owner == XFS_RMAP_OWN_COW) + return 0; + + rec_bno = rec->rm_startblock; + rec_len = rec->rm_blockcount; + if (rec_bno < xc->irec_startbno) { + adj = xc->irec_startbno - rec_bno; + rec_len -= adj; + rec_bno += adj; + } + + if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) { + adj = (rec_bno + rec_len) - + (xc->irec_startbno + xc->irec.br_blockcount); + rec_len -= adj; + } + + fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno); + return xrep_cow_mark_file_range(xc, fsbno, rec_len); +} + +/* + * Find any part of the CoW fork mapping that isn't a single-owner CoW staging + * extent and mark the corresponding part of the file range in the bitmap. + */ +STATIC int +xrep_cow_find_bad( + struct xrep_cow *xc) +{ + struct xfs_refcount_irec rc_low = { 0 }; + struct xfs_refcount_irec rc_high = { 0 }; + struct xfs_rmap_irec rm_low = { 0 }; + struct xfs_rmap_irec rm_high = { 0 }; + struct xfs_perag *pag; + struct xfs_scrub *sc = xc->sc; + xfs_agnumber_t agno; + int error; + + agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock); + xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock); + + pag = xfs_perag_get(sc->mp, agno); + if (!pag) + return -EFSCORRUPTED; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + goto out_pag; + + /* Mark any CoW fork extents that are shared. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED; + error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_shared_staging, xc); + if (error) + goto out_sa; + + /* Make sure there are CoW staging extents for the whole mapping. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW; + xc->next_bno = xc->irec_startbno; + error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_missing_staging, xc); + if (error) + goto out_sa; + + if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { + error = xrep_cow_mark_file_range(xc, + XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, + xc->next_bno), + xc->irec_startbno + xc->irec.br_blockcount - + xc->next_bno); + if (error) + goto out_sa; + } + + /* Mark any area has an rmap that isn't a COW staging extent. */ + rm_low.rm_startblock = xc->irec_startbno; + memset(&rm_high, 0xFF, sizeof(rm_high)); + rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high, + xrep_cow_mark_missing_staging_rmap, xc); + if (error) + goto out_sa; + + /* + * If userspace is forcing us to rebuild the CoW fork or someone turned + * on the debugging knob, replace everything in the CoW fork. + */ + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, + xc->irec.br_blockcount); + if (error) + return error; + } + +out_sa: + xchk_ag_free(sc, &sc->sa); +out_pag: + xfs_perag_put(pag); + return 0; +} + +/* + * Allocate a replacement CoW staging extent of up to the given number of + * blocks, and fill out the mapping. + */ +STATIC int +xrep_cow_alloc( + struct xfs_scrub *sc, + xfs_extlen_t maxlen, + struct xrep_cow_extent *repl) +{ + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = sc->mp, + .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, + .minlen = 1, + .maxlen = maxlen, + .prod = 1, + .resv = XFS_AG_RESV_NONE, + .datatype = XFS_ALLOC_USERDATA, + }; + int error; + + error = xfs_trans_reserve_more(sc->tp, maxlen, 0); + if (error) + return error; + + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino)); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len); + + repl->fsbno = args.fsbno; + repl->len = args.len; + return 0; +} + +/* + * Look up the current CoW fork mapping so that we only allocate enough to + * replace a single mapping. If we don't find a mapping that covers the start + * of the file range, or we find a delalloc or written extent, something is + * seriously wrong, since we didn't drop the ILOCK. + */ +static inline int +xrep_cow_find_mapping( + struct xrep_cow *xc, + struct xfs_iext_cursor *icur, + xfs_fileoff_t startoff, + struct xfs_bmbt_irec *got) +{ + struct xfs_inode *ip = xc->sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); + + if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got)) + goto bad; + + if (got->br_startoff > startoff) + goto bad; + + if (got->br_blockcount == 0) + goto bad; + + if (isnullstartblock(got->br_startblock)) + goto bad; + + if (xfs_bmap_is_written_extent(got)) + goto bad; + + return 0; +bad: + ASSERT(0); + return -EFSCORRUPTED; +} + +#define REPLACE_LEFT_SIDE (1U << 0) +#define REPLACE_RIGHT_SIDE (1U << 1) + +/* + * Given a CoW fork mapping @got and a replacement mapping @repl, remap the + * beginning of @got with the space described by @rep. + */ +static inline void +xrep_cow_replace_mapping( + struct xfs_inode *ip, + struct xfs_iext_cursor *icur, + const struct xfs_bmbt_irec *got, + const struct xrep_cow_extent *repl) +{ + struct xfs_bmbt_irec new = *got; /* struct copy */ + + ASSERT(repl->len > 0); + ASSERT(!isnullstartblock(got->br_startblock)); + + trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len); + + if (got->br_blockcount == repl->len) { + /* + * The new extent is a complete replacement for the existing + * extent. Update the COW fork record. + */ + new.br_startblock = repl->fsbno; + xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); + return; + } + + /* + * The new extent can replace the beginning of the COW fork record. + * Move the left side of @got upwards, then insert the new record. + */ + new.br_startoff += repl->len; + new.br_startblock += repl->len; + new.br_blockcount -= repl->len; + xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); + + new.br_startoff = got->br_startoff; + new.br_startblock = repl->fsbno; + new.br_blockcount = repl->len; + xfs_iext_insert(ip, icur, &new, BMAP_COWFORK); +} + +/* + * Replace the unwritten CoW staging extent backing the given file range with a + * new space extent that isn't as problematic. + */ +STATIC int +xrep_cow_replace_range( + struct xrep_cow *xc, + xfs_fileoff_t startoff, + xfs_extlen_t *blockcount) +{ + struct xfs_iext_cursor icur; + struct xrep_cow_extent repl; + struct xfs_bmbt_irec got; + struct xfs_scrub *sc = xc->sc; + xfs_fileoff_t nextoff; + xfs_extlen_t alloc_len; + int error; + + /* + * Put the existing CoW fork mapping in @got. If @got ends before + * @rep, truncate @rep so we only replace one extent mapping at a time. + */ + error = xrep_cow_find_mapping(xc, &icur, startoff, &got); + if (error) + return error; + nextoff = min(startoff + *blockcount, + got.br_startoff + got.br_blockcount); + + /* + * Allocate a replacement extent. If we don't fill all the blocks, + * shorten the quantity that will be deleted in this step. + */ + alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN, + nextoff - startoff); + error = xrep_cow_alloc(sc, alloc_len, &repl); + if (error) + return error; + + /* + * Replace the old mapping with the new one, and commit the metadata + * changes made so far. + */ + xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl); + + xfs_inode_set_cowblocks_tag(sc->ip); + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* Note the old CoW staging extents; we'll reap them all later. */ + error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock, + repl.len); + if (error) + return error; + + *blockcount = repl.len; + return 0; +} + +/* + * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc + * reservation. + */ +STATIC int +xrep_cow_replace( + uint64_t startoff, + uint64_t blockcount, + void *priv) +{ + struct xrep_cow *xc = priv; + int error = 0; + + while (blockcount > 0) { + xfs_extlen_t len = min_t(xfs_filblks_t, blockcount, + XFS_MAX_BMBT_EXTLEN); + + error = xrep_cow_replace_range(xc, startoff, &len); + if (error) + break; + + blockcount -= len; + startoff += len; + } + + return error; +} + +/* + * Repair an inode's CoW fork. The CoW fork is an in-core structure, so + * there's no btree to rebuid. Instead, we replace any mappings that are + * cross-linked or lack ondisk CoW fork records in the refcount btree. + */ +int +xrep_bmap_cow( + struct xfs_scrub *sc) +{ + struct xrep_cow *xc; + struct xfs_iext_cursor icur; + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK); + int error; + + if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp)) + return -EOPNOTSUPP; + + if (!ifp) + return 0; + + /* realtime files aren't supported yet */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; + + /* + * If we're somehow not in extents format, then reinitialize it to + * an empty extent mapping fork and exit. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + return 0; + } + + xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS); + if (!xc) + return -ENOMEM; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + xc->sc = sc; + xoff_bitmap_init(&xc->bad_fileoffs); + xfsb_bitmap_init(&xc->old_cowfork_fsblocks); + + for_each_xfs_iext(ifp, &icur, &xc->irec) { + if (xchk_should_terminate(sc, &error)) + goto out_bitmap; + + /* + * delalloc reservations only exist incore, so there is no + * ondisk metadata that we can examine. Hence we leave them + * alone. + */ + if (isnullstartblock(xc->irec.br_startblock)) + continue; + + /* + * COW fork extents are only in the written state if writeback + * is actively writing to disk. We cannot restart the write + * at a different disk address since we've already issued the + * IO, so we leave these alone and hope for the best. + */ + if (xfs_bmap_is_written_extent(&xc->irec)) + continue; + + error = xrep_cow_find_bad(xc); + if (error) + goto out_bitmap; + } + + /* Replace any bad unwritten mappings with fresh reservations. */ + error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc); + if (error) + goto out_bitmap; + + /* + * Reap as many of the old CoW blocks as we can. They are owned ondisk + * by the refcount btree, not the inode, so it is correct to treat them + * like inode metadata. + */ + error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, + &XFS_RMAP_OINFO_COW); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); + xoff_bitmap_destroy(&xc->bad_fileoffs); + kfree(xc); + return error; +} diff --git a/fs/xfs/scrub/dab_bitmap.h b/fs/xfs/scrub/dab_bitmap.h new file mode 100644 index 000000000000..0c6e3aad4395 --- /dev/null +++ b/fs/xfs/scrub/dab_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_DAB_BITMAP_H__ +#define __XFS_SCRUB_DAB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_dablk_t */ + +struct xdab_bitmap { + struct xbitmap32 dabitmap; +}; + +static inline void xdab_bitmap_init(struct xdab_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->dabitmap); +} + +static inline void xdab_bitmap_destroy(struct xdab_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->dabitmap); +} + +static inline int xdab_bitmap_set(struct xdab_bitmap *bitmap, + xfs_dablk_t dabno, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->dabitmap, dabno, len); +} + +static inline bool xdab_bitmap_test(struct xdab_bitmap *bitmap, + xfs_dablk_t dabno, xfs_extlen_t *len) +{ + return xbitmap32_test(&bitmap->dabitmap, dabno, len); +} + +#endif /* __XFS_SCRUB_DAB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 82b150d3b8b7..056de4819f86 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -78,6 +78,22 @@ xchk_da_set_corrupt( __return_address); } +/* Flag a da btree node in need of optimization. */ +void +xchk_da_set_preen( + struct xchk_da_btree *ds, + int level) +{ + struct xfs_scrub *sc = ds->sc; + + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; + trace_xchk_fblock_preen(sc, ds->dargs.whichfork, + xfs_dir2_da_to_db(ds->dargs.geo, + ds->state->path.blk[level].blkno), + __return_address); +} + +/* Find an entry at a certain level in a da btree. */ static struct xfs_da_node_entry * xchk_da_btree_node_entry( struct xchk_da_btree *ds, @@ -320,6 +336,7 @@ xchk_da_btree_block( struct xfs_da3_blkinfo *hdr3; struct xfs_da_args *dargs = &ds->dargs; struct xfs_inode *ip = ds->dargs.dp; + xfs_failaddr_t fa; xfs_ino_t owner; int *pmaxrecs; struct xfs_da3_icnode_hdr nodehdr; @@ -442,6 +459,12 @@ xchk_da_btree_block( goto out_freebp; } + fa = xfs_da3_header_check(blk->bp, dargs->owner); + if (fa) { + xchk_da_set_corrupt(ds, level); + goto out_freebp; + } + /* * If we've been handed a block that is below the dabtree root, does * its hashval match what the parent block expected to see? @@ -494,6 +517,7 @@ xchk_da_btree( ds->dargs.whichfork = whichfork; ds->dargs.trans = sc->tp; ds->dargs.op_flags = XFS_DA_OP_OKNOENT; + ds->dargs.owner = sc->ip->i_ino; ds->state = xfs_da_state_alloc(&ds->dargs); ds->sc = sc; ds->private = private; diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h index 4f8c2138a1ec..de291e3b77dd 100644 --- a/fs/xfs/scrub/dabtree.h +++ b/fs/xfs/scrub/dabtree.h @@ -35,6 +35,9 @@ bool xchk_da_process_error(struct xchk_da_btree *ds, int level, int *error); /* Check for da btree corruption. */ void xchk_da_set_corrupt(struct xchk_da_btree *ds, int level); +void xchk_da_set_preen(struct xchk_da_btree *ds, int level); + +void xchk_da_set_preen(struct xchk_da_btree *ds, int level); int xchk_da_btree_hash(struct xchk_da_btree *ds, int level, __be32 *hashp); int xchk_da_btree(struct xfs_scrub *sc, int whichfork, diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 0b491784b759..bf9199e8df63 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -15,21 +15,71 @@ #include "xfs_icache.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" +#include "xfs_health.h" +#include "xfs_attr.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/readdir.h" +#include "scrub/health.h" +#include "scrub/repair.h" +#include "scrub/trace.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" /* Set us up to scrub directories. */ int xchk_setup_directory( struct xfs_scrub *sc) { + int error; + + if (xchk_could_repair(sc)) { + error = xrep_setup_directory(sc); + if (error) + return error; + } + return xchk_setup_inode_contents(sc, 0); } /* Directories */ +/* Deferred directory entry that we saved for later. */ +struct xchk_dirent { + /* Cookie for retrieval of the dirent name. */ + xfblob_cookie name_cookie; + + /* Child inode number. */ + xfs_ino_t ino; + + /* Length of the pptr name. */ + uint8_t namelen; +}; + +struct xchk_dir { + struct xfs_scrub *sc; + + /* information for parent pointer validation. */ + struct xfs_parent_rec pptr_rec; + struct xfs_da_args pptr_args; + + /* Fixed-size array of xchk_dirent structures. */ + struct xfarray *dir_entries; + + /* Blobs containing dirent names. */ + struct xfblob *dir_names; + + /* If we've cycled the ILOCK, we must revalidate deferred dirents. */ + bool need_revalidate; + + /* Name buffer for dirent revalidation. */ + struct xfs_name xname; + uint8_t namebuf[MAXNAMELEN]; +}; + /* Scrub a directory entry. */ /* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */ @@ -53,6 +103,108 @@ xchk_dir_check_ftype( } /* + * Try to lock a child file for checking parent pointers. Returns the inode + * flags for the locks we now hold, or zero if we failed. + */ +STATIC unsigned int +xchk_dir_lock_child( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + return 0; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; + } + + if (!xfs_inode_has_attr_fork(ip) || !xfs_need_iread_extents(&ip->i_af)) + return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED; + + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; + } + + return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL; +} + +/* Check the backwards link (parent pointer) associated with this dirent. */ +STATIC int +xchk_dir_parent_pointer( + struct xchk_dir *sd, + const struct xfs_name *name, + struct xfs_inode *ip) +{ + struct xfs_scrub *sc = sd->sc; + int error; + + xfs_inode_to_parent_rec(&sd->pptr_rec, sc->ip); + error = xfs_parent_lookup(sc->tp, ip, name, &sd->pptr_rec, + &sd->pptr_args); + if (error == -ENOATTR) + xchk_fblock_xref_set_corrupt(sc, XFS_DATA_FORK, 0); + + return 0; +} + +/* Look for a parent pointer matching this dirent, if the child isn't busy. */ +STATIC int +xchk_dir_check_pptr_fast( + struct xchk_dir *sd, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + struct xfs_inode *ip) +{ + struct xfs_scrub *sc = sd->sc; + unsigned int lockmode; + int error; + + /* dot and dotdot entries do not have parent pointers */ + if (xfs_dir2_samename(name, &xfs_name_dot) || + xfs_dir2_samename(name, &xfs_name_dotdot)) + return 0; + + /* No self-referential non-dot or dotdot dirents. */ + if (ip == sc->ip) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return -ECANCELED; + } + + /* Try to lock the inode. */ + lockmode = xchk_dir_lock_child(sc, ip); + if (!lockmode) { + struct xchk_dirent save_de = { + .namelen = name->len, + .ino = ip->i_ino, + }; + + /* Couldn't lock the inode, so save the dirent for later. */ + trace_xchk_dir_defer(sc->ip, name, ip->i_ino); + + error = xfblob_storename(sd->dir_names, &save_de.name_cookie, + name); + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + + error = xfarray_append(sd->dir_entries, &save_de); + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + + return 0; + } + + error = xchk_dir_parent_pointer(sd, name, ip); + xfs_iunlock(ip, lockmode); + return error; +} + +/* * Scrub a single directory entry. * * Check the inode number to make sure it's sane, then we check that we can @@ -69,6 +221,7 @@ xchk_dir_actor( { struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip; + struct xchk_dir *sd = priv; xfs_ino_t lookup_ino; xfs_dablk_t offset; int error = 0; @@ -91,11 +244,11 @@ xchk_dir_actor( return -ECANCELED; } - if (!strncmp(".", name->name, name->len)) { + if (xfs_dir2_samename(name, &xfs_name_dot)) { /* If this is "." then check that the inum matches the dir. */ if (ino != dp->i_ino) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - } else if (!strncmp("..", name->name, name->len)) { + } else if (xfs_dir2_samename(name, &xfs_name_dotdot)) { /* * If this is ".." in the root inode, check that the inum * matches this dir. @@ -135,6 +288,14 @@ xchk_dir_actor( goto out; xchk_dir_check_ftype(sc, offset, ip, name->type); + + if (xfs_has_parent(mp)) { + error = xchk_dir_check_pptr_fast(sd, dapos, name, ip); + if (error) + goto out_rele; + } + +out_rele: xchk_irele(sc, ip); out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) @@ -194,8 +355,8 @@ xchk_dir_rec( xchk_da_set_corrupt(ds, level); goto out; } - error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, - XFS_DABUF_MAP_HOLE_OK, &bp); + error = xfs_dir3_data_read(ds->dargs.trans, dp, ds->dargs.owner, + rec_bno, XFS_DABUF_MAP_HOLE_OK, &bp); if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno, &error)) goto out; @@ -313,10 +474,11 @@ xchk_directory_data_bestfree( /* dir block format */ if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); - error = xfs_dir3_block_read(sc->tp, sc->ip, &bp); + error = xfs_dir3_block_read(sc->tp, sc->ip, sc->ip->i_ino, &bp); } else { /* dir data format */ - error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, 0, &bp); + error = xfs_dir3_data_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, + 0, &bp); } if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; @@ -468,7 +630,7 @@ xchk_directory_leaf1_bestfree( int error; /* Read the free space block. */ - error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp); + error = xfs_dir3_leaf_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) return error; xchk_buffer_recheck(sc, bp); @@ -529,10 +691,9 @@ xchk_directory_leaf1_bestfree( /* Check all the bestfree entries. */ for (i = 0; i < bestcount; i++, bestp++) { best = be16_to_cpu(*bestp); - error = xfs_dir3_data_read(sc->tp, sc->ip, + error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner, xfs_dir2_db_to_da(args->geo, i), - XFS_DABUF_MAP_HOLE_OK, - &dbp); + XFS_DABUF_MAP_HOLE_OK, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) break; @@ -575,7 +736,7 @@ xchk_directory_free_bestfree( int error; /* Read the free space block */ - error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp); + error = xfs_dir2_free_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) return error; xchk_buffer_recheck(sc, bp); @@ -595,7 +756,7 @@ xchk_directory_free_bestfree( stale++; continue; } - error = xfs_dir3_data_read(sc->tp, sc->ip, + error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner, (freehdr.firstdb + i) * args->geo->fsbcount, 0, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, @@ -619,10 +780,11 @@ xchk_directory_blocks( { struct xfs_bmbt_irec got; struct xfs_da_args args = { - .dp = sc ->ip, + .dp = sc->ip, .whichfork = XFS_DATA_FORK, .geo = sc->mp->m_dir_geo, .trans = sc->tp, + .owner = sc->ip->i_ino, }; struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; @@ -646,7 +808,8 @@ xchk_directory_blocks( free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); /* Is this a block dir? */ - error = xfs_dir2_isblock(&args, &is_block); + if (xfs_dir2_format(&args, &error) == XFS_DIR2_FMT_BLOCK) + is_block = true; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; @@ -750,16 +913,158 @@ out: return error; } +/* + * Revalidate a dirent that we collected in the past but couldn't check because + * of lock contention. Returns 0 if the dirent is still valid, -ENOENT if it + * has gone away on us, or a negative errno. + */ +STATIC int +xchk_dir_revalidate_dirent( + struct xchk_dir *sd, + const struct xfs_name *xname, + xfs_ino_t ino) +{ + struct xfs_scrub *sc = sd->sc; + xfs_ino_t child_ino; + int error; + + /* + * Look up the directory entry. If we get -ENOENT, the directory entry + * went away and there's nothing to revalidate. Return any other + * error. + */ + error = xchk_dir_lookup(sc, sc->ip, xname, &child_ino); + if (error) + return error; + + /* The inode number changed, nothing to revalidate. */ + if (ino != child_ino) + return -ENOENT; + + return 0; +} + +/* + * Check a directory entry's parent pointers the slow way, which means we cycle + * locks a bunch and put up with revalidation until we get it done. + */ +STATIC int +xchk_dir_slow_dirent( + struct xchk_dir *sd, + struct xchk_dirent *dirent, + const struct xfs_name *xname) +{ + struct xfs_scrub *sc = sd->sc; + struct xfs_inode *ip; + unsigned int lockmode; + int error; + + /* Check that the deferred dirent still exists. */ + if (sd->need_revalidate) { + error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino); + if (error == -ENOENT) + return 0; + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + } + + error = xchk_iget(sc, dirent->ino, &ip); + if (error == -EINVAL || error == -ENOENT) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + + /* + * If we can grab both IOLOCK and ILOCK of the alleged child, we can + * proceed with the validation. + */ + lockmode = xchk_dir_lock_child(sc, ip); + if (lockmode) { + trace_xchk_dir_slowpath(sc->ip, xname, ip->i_ino); + goto check_pptr; + } + + /* + * We couldn't lock the child file. Drop all the locks and try to + * get them again, one at a time. + */ + xchk_iunlock(sc, sc->ilock_flags); + sd->need_revalidate = true; + + trace_xchk_dir_ultraslowpath(sc->ip, xname, ip->i_ino); + + error = xchk_dir_trylock_for_pptrs(sc, ip, &lockmode); + if (error) + goto out_rele; + + /* Revalidate, since we just cycled the locks. */ + error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino); + if (error == -ENOENT) { + error = 0; + goto out_unlock; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out_unlock; + +check_pptr: + error = xchk_dir_parent_pointer(sd, xname, ip); +out_unlock: + xfs_iunlock(ip, lockmode); +out_rele: + xchk_irele(sc, ip); + return error; +} + +/* Check all the dirents that we deferred the first time around. */ +STATIC int +xchk_dir_finish_slow_dirents( + struct xchk_dir *sd) +{ + xfarray_idx_t array_cur; + int error; + + foreach_xfarray_idx(sd->dir_entries, array_cur) { + struct xchk_dirent dirent; + + if (sd->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + error = xfarray_load(sd->dir_entries, array_cur, &dirent); + if (error) + return error; + + error = xfblob_loadname(sd->dir_names, dirent.name_cookie, + &sd->xname, dirent.namelen); + if (error) + return error; + + error = xchk_dir_slow_dirent(sd, &dirent, &sd->xname); + if (error) + return error; + } + + return 0; +} + /* Scrub a whole directory. */ int xchk_directory( struct xfs_scrub *sc) { + struct xchk_dir *sd; int error; if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) return -ENOENT; + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_DIR_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + /* Plausible size? */ if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -782,9 +1087,89 @@ xchk_directory( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return 0; + sd = kvzalloc(sizeof(struct xchk_dir), XCHK_GFP_FLAGS); + if (!sd) + return -ENOMEM; + sd->sc = sc; + sd->xname.name = sd->namebuf; + + if (xfs_has_parent(sc->mp)) { + char *descr; + + /* + * Set up some staging memory for dirents that we can't check + * due to locking contention. + */ + descr = xchk_xfile_ino_descr(sc, "slow directory entries"); + error = xfarray_create(descr, 0, sizeof(struct xchk_dirent), + &sd->dir_entries); + kfree(descr); + if (error) + goto out_sd; + + descr = xchk_xfile_ino_descr(sc, "slow directory entry names"); + error = xfblob_create(descr, &sd->dir_names); + kfree(descr); + if (error) + goto out_entries; + } + /* Look up every name in this directory by hash. */ - error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); + error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, sd); if (error == -ECANCELED) error = 0; - return error; + if (error) + goto out_names; + + if (xfs_has_parent(sc->mp)) { + error = xchk_dir_finish_slow_dirents(sd); + if (error == -ETIMEDOUT) { + /* Couldn't grab a lock, scrub was marked incomplete */ + error = 0; + goto out_names; + } + if (error) + goto out_names; + } + +out_names: + if (sd->dir_names) + xfblob_destroy(sd->dir_names); +out_entries: + if (sd->dir_entries) + xfarray_destroy(sd->dir_entries); +out_sd: + kvfree(sd); + if (error) + return error; + + /* If the dir is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_DIR_ZAPPED); + return 0; +} + +/* + * Decide if this directory has been zapped to satisfy the inode and ifork + * verifiers. Checking and repairing should be postponed until the directory + * is fixed. + */ +bool +xchk_dir_looks_zapped( + struct xfs_inode *dp) +{ + /* Repair zapped this dir's data fork a short time ago */ + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return true; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for the bmapbtd + * scrubber to reconstruct the block mappings. Directories always + * contain some content, so this is a clear sign of a zapped directory. + * The state checked by xfs_ifork_zapped is not persisted, so this is + * the secondary strategy if repairs are interrupted by a crash or an + * unmount. + */ + return dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS && + dp->i_df.if_nextents == 0; } diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c new file mode 100644 index 000000000000..64679fe08446 --- /dev/null +++ b/fs/xfs/scrub/dir_repair.c @@ -0,0 +1,1958 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_bmap_util.h" +#include "xfs_exchmaps.h" +#include "xfs_exchrange.h" +#include "xfs_ag.h" +#include "xfs_parent.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/iscan.h" +#include "scrub/readdir.h" +#include "scrub/reap.h" +#include "scrub/findparent.h" +#include "scrub/orphanage.h" +#include "scrub/listxattr.h" + +/* + * Directory Repair + * ================ + * + * We repair directories by reading the directory data blocks looking for + * directory entries that look salvageable (name passes verifiers, entry points + * to a valid allocated inode, etc). Each entry worth salvaging is stashed in + * memory, and the stashed entries are periodically replayed into a temporary + * directory to constrain memory use. Batching the construction of the + * temporary directory in this fashion reduces lock cycling of the directory + * being repaired and the temporary directory, and will later become important + * for parent pointer scanning. + * + * If parent pointers are enabled on this filesystem, we instead reconstruct + * the directory by visiting each parent pointer of each file in the filesystem + * and translating the relevant parent pointer records into dirents. In this + * case, it is advantageous to stash all directory entries created from parent + * pointers for a single child file before replaying them into the temporary + * directory. To save memory, the live filesystem scan reuses the findparent + * fields. Directory repair chooses either parent pointer scanning or + * directory entry salvaging, but not both. + * + * Directory entries added to the temporary directory do not elevate the link + * counts of the inodes found. When salvaging completes, the remaining stashed + * entries are replayed to the temporary directory. An atomic mapping exchange + * is used to commit the new directory blocks to the directory being repaired. + * This will disrupt readdir cursors. + * + * Locking Issues + * -------------- + * + * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on + * /a/b for a "mv /a/b /c/" operation. This means that only b's ILOCK protects + * b's dotdot update. This is in contrast to every other dotdot update (link, + * remove, mkdir). If the repair code drops the ILOCK, it must either + * revalidate the dotdot entry or use dirent hooks to capture updates from + * other threads. + */ + +/* Create a dirent in the tempdir. */ +#define XREP_DIRENT_ADD (1) + +/* Remove a dirent from the tempdir. */ +#define XREP_DIRENT_REMOVE (2) + +/* Directory entry to be restored in the new directory. */ +struct xrep_dirent { + /* Cookie for retrieval of the dirent name. */ + xfblob_cookie name_cookie; + + /* Target inode number. */ + xfs_ino_t ino; + + /* Length of the dirent name. */ + uint8_t namelen; + + /* File type of the dirent. */ + uint8_t ftype; + + /* XREP_DIRENT_{ADD,REMOVE} */ + uint8_t action; +}; + +/* + * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names + * before we write them to the temp dir. + */ +#define XREP_DIR_MAX_STASH_BYTES (PAGE_SIZE * 8) + +struct xrep_dir { + struct xfs_scrub *sc; + + /* Fixed-size array of xrep_dirent structures. */ + struct xfarray *dir_entries; + + /* Blobs containing directory entry names. */ + struct xfblob *dir_names; + + /* Information for exchanging data forks at the end. */ + struct xrep_tempexch tx; + + /* Preallocated args struct for performing dir operations */ + struct xfs_da_args args; + + /* + * Information used to scan the filesystem to find the inumber of the + * dotdot entry for this directory. For directory salvaging when + * parent pointers are not enabled, we use the findparent_* functions + * on this object and access only the parent_ino field directly. + * + * When parent pointers are enabled, however, the pptr scanner uses the + * iscan, hooks, lock, and parent_ino fields of this object directly. + * @pscan.lock coordinates access to dir_entries, dir_names, + * parent_ino, subdirs, dirents, and args. This reduces the memory + * requirements of this structure. + */ + struct xrep_parent_scan_info pscan; + + /* + * Context information for attaching this directory to the lost+found + * if this directory does not have a parent. + */ + struct xrep_adoption adoption; + + /* How many subdirectories did we find? */ + uint64_t subdirs; + + /* How many dirents did we find? */ + unsigned int dirents; + + /* Should we move this directory to the orphanage? */ + bool needs_adoption; + + /* Directory entry name, plus the trailing null. */ + struct xfs_name xname; + unsigned char namebuf[MAXNAMELEN]; +}; + +/* Tear down all the incore stuff we created. */ +static void +xrep_dir_teardown( + struct xfs_scrub *sc) +{ + struct xrep_dir *rd = sc->buf; + + xrep_findparent_scan_teardown(&rd->pscan); + xfblob_destroy(rd->dir_names); + xfarray_destroy(rd->dir_entries); +} + +/* Set up for a directory repair. */ +int +xrep_setup_directory( + struct xfs_scrub *sc) +{ + struct xrep_dir *rd; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + error = xrep_orphanage_try_create(sc); + if (error) + return error; + + error = xrep_tempfile_create(sc, S_IFDIR); + if (error) + return error; + + rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS); + if (!rd) + return -ENOMEM; + rd->sc = sc; + rd->xname.name = rd->namebuf; + sc->buf = rd; + + return 0; +} + +/* + * Look up the dotdot entry and confirm that it's really the parent. + * Returns NULLFSINO if we don't know what to do. + */ +static inline xfs_ino_t +xrep_dir_lookup_parent( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + xfs_ino_t ino; + int error; + + error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL); + if (error) + return NULLFSINO; + if (!xfs_verify_dir_ino(sc->mp, ino)) + return NULLFSINO; + + error = xrep_findparent_confirm(sc, &ino); + if (error) + return NULLFSINO; + + return ino; +} + +/* + * Look up '..' in the dentry cache and confirm that it's really the parent. + * Returns NULLFSINO if the dcache misses or if the hit is implausible. + */ +static inline xfs_ino_t +xrep_dir_dcache_parent( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + xfs_ino_t parent_ino; + int error; + + parent_ino = xrep_findparent_from_dcache(sc); + if (parent_ino == NULLFSINO) + return parent_ino; + + error = xrep_findparent_confirm(sc, &parent_ino); + if (error) + return NULLFSINO; + + return parent_ino; +} + +/* Try to find the parent of the directory being repaired. */ +STATIC int +xrep_dir_find_parent( + struct xrep_dir *rd) +{ + xfs_ino_t ino; + + ino = xrep_findparent_self_reference(rd->sc); + if (ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rd->pscan, ino); + return 0; + } + + ino = xrep_dir_dcache_parent(rd); + if (ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rd->pscan, ino); + return 0; + } + + ino = xrep_dir_lookup_parent(rd); + if (ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rd->pscan, ino); + return 0; + } + + /* + * A full filesystem scan is the last resort. On a busy filesystem, + * the scan can fail with -EBUSY if we cannot grab IOLOCKs. That means + * that we don't know what who the parent is, so we should return to + * userspace. + */ + return xrep_findparent_scan(&rd->pscan); +} + +/* + * Decide if we want to salvage this entry. We don't bother with oversized + * names or the dot entry. + */ +STATIC int +xrep_dir_want_salvage( + struct xrep_dir *rd, + const char *name, + int namelen, + xfs_ino_t ino) +{ + struct xfs_mount *mp = rd->sc->mp; + + /* No pointers to ourselves or to garbage. */ + if (ino == rd->sc->ip->i_ino) + return false; + if (!xfs_verify_dir_ino(mp, ino)) + return false; + + /* No weird looking names or dot entries. */ + if (namelen >= MAXNAMELEN || namelen <= 0) + return false; + if (namelen == 1 && name[0] == '.') + return false; + if (!xfs_dir2_namecheck(name, namelen)) + return false; + + return true; +} + +/* + * Remember that we want to create a dirent in the tempdir. These stashed + * actions will be replayed later. + */ +STATIC int +xrep_dir_stash_createname( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_ino_t ino) +{ + struct xrep_dirent dirent = { + .action = XREP_DIRENT_ADD, + .ino = ino, + .namelen = name->len, + .ftype = name->type, + }; + int error; + + trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino); + + error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); + if (error) + return error; + + return xfarray_append(rd->dir_entries, &dirent); +} + +/* + * Remember that we want to remove a dirent from the tempdir. These stashed + * actions will be replayed later. + */ +STATIC int +xrep_dir_stash_removename( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_ino_t ino) +{ + struct xrep_dirent dirent = { + .action = XREP_DIRENT_REMOVE, + .ino = ino, + .namelen = name->len, + .ftype = name->type, + }; + int error; + + trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino); + + error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); + if (error) + return error; + + return xfarray_append(rd->dir_entries, &dirent); +} + +/* Allocate an in-core record to hold entries while we rebuild the dir data. */ +STATIC int +xrep_dir_salvage_entry( + struct xrep_dir *rd, + unsigned char *name, + unsigned int namelen, + xfs_ino_t ino) +{ + struct xfs_name xname = { + .name = name, + }; + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *ip; + unsigned int i = 0; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Truncate the name to the first character that would trip namecheck. + * If we no longer have a name after that, ignore this entry. + */ + while (i < namelen && name[i] != 0 && name[i] != '/') + i++; + if (i == 0) + return 0; + xname.len = i; + + /* Ignore '..' entries; we already picked the new parent. */ + if (xname.len == 2 && name[0] == '.' && name[1] == '.') { + trace_xrep_dir_salvaged_parent(sc->ip, ino); + return 0; + } + + trace_xrep_dir_salvage_entry(sc->ip, &xname, ino); + + /* + * Compute the ftype or dump the entry if we can't. We don't lock the + * inode because inodes can't change type while we have a reference. + */ + error = xchk_iget(sc, ino, &ip); + if (error) + return 0; + + xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode); + xchk_irele(sc, ip); + + return xrep_dir_stash_createname(rd, &xname, ino); +} + +/* Record a shortform directory entry for later reinsertion. */ +STATIC int +xrep_dir_salvage_sf_entry( + struct xrep_dir *rd, + struct xfs_dir2_sf_hdr *sfp, + struct xfs_dir2_sf_entry *sfep) +{ + xfs_ino_t ino; + + ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep); + if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino)) + return 0; + + return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino); +} + +/* Record a regular directory entry for later reinsertion. */ +STATIC int +xrep_dir_salvage_data_entry( + struct xrep_dir *rd, + struct xfs_dir2_data_entry *dep) +{ + xfs_ino_t ino; + + ino = be64_to_cpu(dep->inumber); + if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino)) + return 0; + + return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino); +} + +/* Try to recover block/data format directory entries. */ +STATIC int +xrep_dir_recover_data( + struct xrep_dir *rd, + struct xfs_buf *bp) +{ + struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo; + unsigned int offset; + unsigned int end; + int error = 0; + + /* + * Loop over the data portion of the block. + * Each object is a real entry (dep) or an unused one (dup). + */ + offset = geo->data_entry_offset; + end = min_t(unsigned int, BBTOB(bp->b_length), + xfs_dir3_data_end_offset(geo, bp->b_addr)); + + while (offset < end) { + struct xfs_dir2_data_unused *dup = bp->b_addr + offset; + struct xfs_dir2_data_entry *dep = bp->b_addr + offset; + + if (xchk_should_terminate(rd->sc, &error)) + return error; + + /* Skip unused entries. */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + offset += be16_to_cpu(dup->length); + continue; + } + + /* Don't walk off the end of the block. */ + offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen); + if (offset > end) + break; + + /* Ok, let's save this entry. */ + error = xrep_dir_salvage_data_entry(rd, dep); + if (error) + return error; + + } + + return 0; +} + +/* Try to recover shortform directory entries. */ +STATIC int +xrep_dir_recover_sf( + struct xrep_dir *rd) +{ + struct xfs_dir2_sf_hdr *hdr; + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_entry *next; + struct xfs_ifork *ifp; + xfs_ino_t ino; + unsigned char *end; + int error = 0; + + ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK); + hdr = ifp->if_data; + end = (unsigned char *)ifp->if_data + ifp->if_bytes; + + ino = xfs_dir2_sf_get_parent_ino(hdr); + trace_xrep_dir_salvaged_parent(rd->sc->ip, ino); + + sfep = xfs_dir2_sf_firstentry(hdr); + while ((unsigned char *)sfep < end) { + if (xchk_should_terminate(rd->sc, &error)) + return error; + + next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep); + if ((unsigned char *)next > end) + break; + + /* Ok, let's save this entry. */ + error = xrep_dir_salvage_sf_entry(rd, hdr, sfep); + if (error) + return error; + + sfep = next; + } + + return 0; +} + +/* + * Try to figure out the format of this directory from the data fork mappings + * and the directory size. If we can be reasonably sure of format, we can be + * more aggressive in salvaging directory entries. On return, @magic_guess + * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format" + * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory, + * and 0 if we can't tell. + */ +STATIC void +xrep_dir_guess_format( + struct xrep_dir *rd, + __be32 *magic_guess) +{ + struct xfs_inode *dp = rd->sc->ip; + struct xfs_mount *mp = rd->sc->mp; + struct xfs_da_geometry *geo = mp->m_dir_geo; + xfs_fileoff_t last; + int error; + + ASSERT(xfs_has_crc(mp)); + + *magic_guess = 0; + + /* + * If there's a single directory block and the directory size is + * exactly one block, this has to be a single block format directory. + */ + error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK); + if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize && + dp->i_disk_size == geo->blksize) { + *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + return; + } + + /* + * If the last extent before the leaf offset matches the directory + * size and the directory size is larger than 1 block, this is a + * data format directory. + */ + last = geo->leafblk; + error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK); + if (!error && + XFS_FSB_TO_B(mp, last) > geo->blksize && + XFS_FSB_TO_B(mp, last) == dp->i_disk_size) { + *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + return; + } +} + +/* Recover directory entries from a specific directory block. */ +STATIC int +xrep_dir_recover_dirblock( + struct xrep_dir *rd, + __be32 magic_guess, + xfs_dablk_t dabno) +{ + struct xfs_dir2_data_hdr *hdr; + struct xfs_buf *bp; + __be32 oldmagic; + int error; + + /* + * Try to read buffer. We invalidate them in the next step so we don't + * bother to set a buffer type or ops. + */ + error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno, + XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL); + if (error || !bp) + return error; + + hdr = bp->b_addr; + oldmagic = hdr->magic; + + trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno, + be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess)); + + /* + * If we're sure of the block's format, proceed with the salvage + * operation using the specified magic number. + */ + if (magic_guess) { + hdr->magic = magic_guess; + goto recover; + } + + /* + * If we couldn't guess what type of directory this is, then we will + * only salvage entries from directory blocks that match the magic + * number and pass verifiers. + */ + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops)) + goto out; + if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL) + goto out; + break; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops)) + goto out; + if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL) + goto out; + break; + default: + goto out; + } + +recover: + error = xrep_dir_recover_data(rd, bp); + +out: + hdr->magic = oldmagic; + xfs_trans_brelse(rd->sc->tp, bp); + return error; +} + +static inline void +xrep_dir_init_args( + struct xrep_dir *rd, + struct xfs_inode *dp, + const struct xfs_name *name) +{ + memset(&rd->args, 0, sizeof(struct xfs_da_args)); + rd->args.geo = rd->sc->mp->m_dir_geo; + rd->args.whichfork = XFS_DATA_FORK; + rd->args.owner = rd->sc->ip->i_ino; + rd->args.trans = rd->sc->tp; + rd->args.dp = dp; + if (!name) + return; + rd->args.name = name->name; + rd->args.namelen = name->len; + rd->args.filetype = name->type; + rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name); +} + +/* Replay a stashed createname into the temporary directory. */ +STATIC int +xrep_dir_replay_createname( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_ino_t inum, + xfs_extlen_t total) +{ + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *dp = rd->sc->tempip; + int error; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + + error = xfs_dir_ino_validate(sc->mp, inum); + if (error) + return error; + + trace_xrep_dir_replay_createname(dp, name, inum); + + xrep_dir_init_args(rd, dp, name); + rd->args.inumber = inum; + rd->args.total = total; + rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + return xfs_dir_createname_args(&rd->args); +} + +/* Replay a stashed removename onto the temporary directory. */ +STATIC int +xrep_dir_replay_removename( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_extlen_t total) +{ + struct xfs_inode *dp = rd->args.dp; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + + xrep_dir_init_args(rd, dp, name); + rd->args.op_flags = 0; + rd->args.total = total; + + trace_xrep_dir_replay_removename(dp, name, 0); + return xfs_dir_removename_args(&rd->args); +} + +/* + * Add this stashed incore directory entry to the temporary directory. + * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and + * must not be in transaction context. + */ +STATIC int +xrep_dir_replay_update( + struct xrep_dir *rd, + const struct xfs_name *xname, + const struct xrep_dirent *dirent) +{ + struct xfs_mount *mp = rd->sc->mp; +#ifdef DEBUG + xfs_ino_t ino; +#endif + uint resblks; + int error; + + resblks = xfs_link_space_res(mp, xname->len); + error = xchk_trans_alloc(rd->sc, resblks); + if (error) + return error; + + /* Lock the temporary directory and join it to the transaction */ + xrep_tempfile_ilock(rd->sc); + xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0); + + switch (dirent->action) { + case XREP_DIRENT_ADD: + /* + * Create a replacement dirent in the temporary directory. + * Note that _createname doesn't check for existing entries. + * There shouldn't be any in the temporary dir, but we'll + * verify this in debug mode. + */ +#ifdef DEBUG + error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino); + if (error != -ENOENT) { + ASSERT(error != -ENOENT); + goto out_cancel; + } +#endif + + error = xrep_dir_replay_createname(rd, xname, dirent->ino, + resblks); + if (error) + goto out_cancel; + + if (xname->type == XFS_DIR3_FT_DIR) + rd->subdirs++; + rd->dirents++; + break; + case XREP_DIRENT_REMOVE: + /* + * Remove a dirent from the temporary directory. Note that + * _removename doesn't check the inode target of the exist + * entry. There should be a perfect match in the temporary + * dir, but we'll verify this in debug mode. + */ +#ifdef DEBUG + error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino); + if (error) { + ASSERT(error != 0); + goto out_cancel; + } + if (ino != dirent->ino) { + ASSERT(ino == dirent->ino); + error = -EIO; + goto out_cancel; + } +#endif + + error = xrep_dir_replay_removename(rd, xname, resblks); + if (error) + goto out_cancel; + + if (xname->type == XFS_DIR3_FT_DIR) + rd->subdirs--; + rd->dirents--; + break; + default: + ASSERT(0); + error = -EIO; + goto out_cancel; + } + + /* Commit and unlock. */ + error = xrep_trans_commit(rd->sc); + if (error) + return error; + + xrep_tempfile_iunlock(rd->sc); + return 0; +out_cancel: + xchk_trans_cancel(rd->sc); + xrep_tempfile_iunlock(rd->sc); + return error; +} + +/* + * Flush stashed incore dirent updates that have been recorded by the scanner. + * This is done to reduce the memory requirements of the directory rebuild, + * since directories can contain up to 32GB of directory data. + * + * Caller must not hold transactions or ILOCKs. Caller must hold the tempdir + * IOLOCK. + */ +STATIC int +xrep_dir_replay_updates( + struct xrep_dir *rd) +{ + xfarray_idx_t array_cur; + int error; + + /* Add all the salvaged dirents to the temporary directory. */ + mutex_lock(&rd->pscan.lock); + foreach_xfarray_idx(rd->dir_entries, array_cur) { + struct xrep_dirent dirent; + + error = xfarray_load(rd->dir_entries, array_cur, &dirent); + if (error) + goto out_unlock; + + error = xfblob_loadname(rd->dir_names, dirent.name_cookie, + &rd->xname, dirent.namelen); + if (error) + goto out_unlock; + rd->xname.type = dirent.ftype; + mutex_unlock(&rd->pscan.lock); + + error = xrep_dir_replay_update(rd, &rd->xname, &dirent); + if (error) + return error; + mutex_lock(&rd->pscan.lock); + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rd->dir_entries); + xfblob_truncate(rd->dir_names); + mutex_unlock(&rd->pscan.lock); + return 0; +out_unlock: + mutex_unlock(&rd->pscan.lock); + return error; +} + +/* + * Periodically flush stashed directory entries to the temporary dir. This + * is done to reduce the memory requirements of the directory rebuild, since + * directories can contain up to 32GB of directory data. + */ +STATIC int +xrep_dir_flush_stashed( + struct xrep_dir *rd) +{ + int error; + + /* + * Entering this function, the scrub context has a reference to the + * inode being repaired, the temporary file, and a scrub transaction + * that we use during dirent salvaging to avoid livelocking if there + * are cycles in the directory structures. We hold ILOCK_EXCL on both + * the inode being repaired and the temporary file, though they are + * not ijoined to the scrub transaction. + * + * To constrain kernel memory use, we occasionally write salvaged + * dirents from the xfarray and xfblob structures into the temporary + * directory in preparation for exchanging the directory structures at + * the end. Updating the temporary file requires a transaction, so we + * commit the scrub transaction and drop the two ILOCKs so that + * we can allocate whatever transaction we want. + * + * We still hold IOLOCK_EXCL on the inode being repaired, which + * prevents anyone from accessing the damaged directory data while we + * repair it. + */ + error = xrep_trans_commit(rd->sc); + if (error) + return error; + xchk_iunlock(rd->sc, XFS_ILOCK_EXCL); + + /* + * Take the IOLOCK of the temporary file while we modify dirents. This + * isn't strictly required because the temporary file is never revealed + * to userspace, but we follow the same locking rules. We still hold + * sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rd->sc); + if (error) + return error; + + /* Write to the tempdir all the updates that we've stashed. */ + error = xrep_dir_replay_updates(rd); + xrep_tempfile_iounlock(rd->sc); + if (error) + return error; + + /* + * Recreate the salvage transaction and relock the dir we're salvaging. + */ + error = xchk_trans_alloc(rd->sc, 0); + if (error) + return error; + xchk_ilock(rd->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* Decide if we've stashed too much dirent data in memory. */ +static inline bool +xrep_dir_want_flush_stashed( + struct xrep_dir *rd) +{ + unsigned long long bytes; + + bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names); + return bytes > XREP_DIR_MAX_STASH_BYTES; +} + +/* Extract as many directory entries as we can. */ +STATIC int +xrep_dir_recover( + struct xrep_dir *rd) +{ + struct xfs_bmbt_irec got; + struct xfs_scrub *sc = rd->sc; + struct xfs_da_geometry *geo = sc->mp->m_dir_geo; + xfs_fileoff_t offset; + xfs_dablk_t dabno; + __be32 magic_guess; + int nmap; + int error; + + xrep_dir_guess_format(rd, &magic_guess); + + /* Iterate each directory data block in the data fork. */ + for (offset = 0; + offset < geo->leafblk; + offset = got.br_startoff + got.br_blockcount) { + nmap = 1; + error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset, + &got, &nmap, 0); + if (error) + return error; + if (nmap != 1) + return -EFSCORRUPTED; + if (!xfs_bmap_is_written_extent(&got)) + continue; + + for (dabno = round_up(got.br_startoff, geo->fsbcount); + dabno < got.br_startoff + got.br_blockcount; + dabno += geo->fsbcount) { + if (xchk_should_terminate(rd->sc, &error)) + return error; + + error = xrep_dir_recover_dirblock(rd, + magic_guess, dabno); + if (error) + return error; + + /* Flush dirents to constrain memory usage. */ + if (xrep_dir_want_flush_stashed(rd)) { + error = xrep_dir_flush_stashed(rd); + if (error) + return error; + } + } + } + + return 0; +} + +/* + * Find all the directory entries for this inode by scraping them out of the + * directory leaf blocks by hand, and flushing them into the temp dir. + */ +STATIC int +xrep_dir_find_entries( + struct xrep_dir *rd) +{ + struct xfs_inode *dp = rd->sc->ip; + int error; + + /* + * Salvage directory entries from the old directory, and write them to + * the temporary directory. + */ + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + error = xrep_dir_recover_sf(rd); + } else { + error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK); + if (error) + return error; + + error = xrep_dir_recover(rd); + } + if (error) + return error; + + return xrep_dir_flush_stashed(rd); +} + +/* Scan all files in the filesystem for dirents. */ +STATIC int +xrep_dir_salvage_entries( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + /* + * Drop the ILOCK on this directory so that we can scan for this + * directory's parent. Figure out who is going to be the parent of + * this directory, then retake the ILOCK so that we can salvage + * directory entries. + */ + xchk_iunlock(sc, XFS_ILOCK_EXCL); + error = xrep_dir_find_parent(rd); + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (error) + return error; + + /* + * Collect directory entries by parsing raw leaf blocks to salvage + * whatever we can. When we're done, free the staging memory before + * exchanging the directories to reduce memory usage. + */ + error = xrep_dir_find_entries(rd); + if (error) + return error; + + /* + * Cancel the repair transaction and drop the ILOCK so that we can + * (later) use the atomic mapping exchange functions to compute the + * correct block reservations and re-lock the inodes. + * + * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory + * modifications, but there's nothing to prevent userspace from reading + * the directory until we're ready for the exchange operation. Reads + * will return -EIO without shutting down the fs, so we're ok with + * that. + * + * The VFS can change dotdot on us, but the findparent scan will keep + * our incore parent inode up to date. See the note on locking issues + * for more details. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + + xchk_iunlock(sc, XFS_ILOCK_EXCL); + return 0; +} + + +/* + * Examine a parent pointer of a file. If it leads us back to the directory + * that we're rebuilding, create an incore dirent from the parent pointer and + * stash it. + */ +STATIC int +xrep_dir_scan_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + .type = xfs_mode_to_ftype(VFS_I(ip)->i_mode), + }; + xfs_ino_t parent_ino; + uint32_t parent_gen; + struct xrep_dir *rd = priv; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + /* + * Ignore parent pointers that point back to a different dir, list the + * wrong generation number, or are invalid. + */ + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, &parent_gen); + if (error) + return error; + + if (parent_ino != sc->ip->i_ino || + parent_gen != VFS_I(sc->ip)->i_generation) + return 0; + + mutex_lock(&rd->pscan.lock); + error = xrep_dir_stash_createname(rd, &xname, ip->i_ino); + mutex_unlock(&rd->pscan.lock); + return error; +} + +/* + * If this child dirent points to the directory being repaired, remember that + * fact so that we can reset the dotdot entry if necessary. + */ +STATIC int +xrep_dir_scan_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_dir *rd = priv; + + /* Dirent doesn't point to this directory. */ + if (ino != rd->sc->ip->i_ino) + return 0; + + /* Ignore garbage inum. */ + if (!xfs_verify_dir_ino(rd->sc->mp, ino)) + return 0; + + /* No weird looking names. */ + if (name->len >= MAXNAMELEN || name->len <= 0) + return 0; + + /* Don't pick up dot or dotdot entries; we only want child dirents. */ + if (xfs_dir2_samename(name, &xfs_name_dotdot) || + xfs_dir2_samename(name, &xfs_name_dot)) + return 0; + + trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot, + dp->i_ino); + + xrep_findparent_scan_found(&rd->pscan, dp->i_ino); + return 0; +} + +/* + * Decide if we want to look for child dirents or parent pointers in this file. + * Skip the dir being repaired and any files being used to stage repairs. + */ +static inline bool +xrep_dir_want_scan( + struct xrep_dir *rd, + const struct xfs_inode *ip) +{ + return ip != rd->sc->ip && !xrep_is_tempfile(ip); +} + +/* + * Take ILOCK on a file that we want to scan. + * + * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or + * has an unloaded attr bmbt. Otherwise, take ILOCK_SHARED. + */ +static inline unsigned int +xrep_dir_scan_ilock( + struct xrep_dir *rd, + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + /* Need to take the shared ILOCK to advance the iscan cursor. */ + if (!xrep_dir_want_scan(rd, ip)) + goto lock; + + if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) { + lock_mode = XFS_ILOCK_EXCL; + goto lock; + } + + if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) + lock_mode = XFS_ILOCK_EXCL; + +lock: + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +/* + * Scan this file for relevant child dirents or parent pointers that point to + * the directory we're rebuilding. + */ +STATIC int +xrep_dir_scan_file( + struct xrep_dir *rd, + struct xfs_inode *ip) +{ + unsigned int lock_mode; + int error = 0; + + lock_mode = xrep_dir_scan_ilock(rd, ip); + + if (!xrep_dir_want_scan(rd, ip)) + goto scan_done; + + /* + * If the extended attributes look as though they has been zapped by + * the inode record repair code, we cannot scan for parent pointers. + */ + if (xchk_pptr_looks_zapped(ip)) { + error = -EBUSY; + goto scan_done; + } + + error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd); + if (error) + goto scan_done; + + if (S_ISDIR(VFS_I(ip)->i_mode)) { + /* + * If the directory looks as though it has been zapped by the + * inode record repair code, we cannot scan for child dirents. + */ + if (xchk_dir_looks_zapped(ip)) { + error = -EBUSY; + goto scan_done; + } + + error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd); + if (error) + goto scan_done; + } + +scan_done: + xchk_iscan_mark_visited(&rd->pscan.iscan, ip); + xfs_iunlock(ip, lock_mode); + return error; +} + +/* + * Scan all files in the filesystem for parent pointers that we can turn into + * replacement dirents, and a dirent that we can use to set the dotdot pointer. + */ +STATIC int +xrep_dir_scan_dirtree( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *ip; + int error; + + /* Roots of directory trees are their own parents. */ + if (sc->ip == sc->mp->m_rootip) + xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino); + + /* + * Filesystem scans are time consuming. Drop the directory ILOCK and + * all other resources for the duration of the scan and hope for the + * best. The live update hooks will keep our scan information up to + * date even though we've dropped the locks. + */ + xchk_trans_cancel(sc); + if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) + xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | + XFS_ILOCK_EXCL)); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) { + bool flush; + + error = xrep_dir_scan_file(rd, ip); + xchk_irele(sc, ip); + if (error) + break; + + /* Flush stashed dirent updates to constrain memory usage. */ + mutex_lock(&rd->pscan.lock); + flush = xrep_dir_want_flush_stashed(rd); + mutex_unlock(&rd->pscan.lock); + if (flush) { + xchk_trans_cancel(sc); + + error = xrep_tempfile_iolock_polled(sc); + if (error) + break; + + error = xrep_dir_replay_updates(rd); + xrep_tempfile_iounlock(sc); + if (error) + break; + + error = xchk_trans_alloc_empty(sc); + if (error) + break; + } + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&rd->pscan.iscan); + if (error) { + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Cancel the empty transaction so that we can (later) use the atomic + * file mapping exchange functions to lock files and commit the new + * directory. + */ + xchk_trans_cancel(rd->sc); + return 0; +} + +/* + * Capture dirent updates being made by other threads which are relevant to the + * directory being repaired. + */ +STATIC int +xrep_dir_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xrep_dir *rd; + struct xfs_scrub *sc; + int error = 0; + + rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb); + sc = rd->sc; + + /* + * This thread updated a child dirent in the directory that we're + * rebuilding. Stash the update for replay against the temporary + * directory. + */ + if (p->dp->i_ino == sc->ip->i_ino && + xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) { + mutex_lock(&rd->pscan.lock); + if (p->delta > 0) + error = xrep_dir_stash_createname(rd, p->name, + p->ip->i_ino); + else + error = xrep_dir_stash_removename(rd, p->name, + p->ip->i_ino); + mutex_unlock(&rd->pscan.lock); + if (error) + goto out_abort; + } + + /* + * This thread updated another directory's child dirent that points to + * the directory that we're rebuilding, so remember the new dotdot + * target. + */ + if (p->ip->i_ino == sc->ip->i_ino && + xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) { + if (p->delta > 0) { + trace_xrep_dir_stash_createname(sc->tempip, + &xfs_name_dotdot, + p->dp->i_ino); + + xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino); + } else { + trace_xrep_dir_stash_removename(sc->tempip, + &xfs_name_dotdot, + rd->pscan.parent_ino); + + xrep_findparent_scan_found(&rd->pscan, NULLFSINO); + } + } + + return NOTIFY_DONE; +out_abort: + xchk_iscan_abort(&rd->pscan.iscan); + return NOTIFY_DONE; +} + +/* + * Free all the directory blocks and reset the data fork. The caller must + * join the inode to the transaction. This function returns with the inode + * joined to a clean scrub transaction. + */ +STATIC int +xrep_dir_reset_fork( + struct xrep_dir *rd, + xfs_ino_t parent_ino) +{ + struct xfs_scrub *sc = rd->sc; + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK); + int error; + + /* Unmap all the directory buffers. */ + if (xfs_ifork_has_extents(ifp)) { + error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); + if (error) + return error; + } + + trace_xrep_dir_reset_fork(sc->tempip, parent_ino); + + /* Reset the data fork to an empty data fork. */ + xfs_idestroy_fork(ifp); + ifp->if_bytes = 0; + sc->tempip->i_disk_size = 0; + + /* Reinitialize the short form directory. */ + xrep_dir_init_args(rd, sc->tempip, NULL); + return xfs_dir2_sf_create(&rd->args, parent_ino); +} + +/* + * Prepare both inodes' directory forks for exchanging mappings. Promote the + * tempfile from short format to leaf format, and if the file being repaired + * has a short format data fork, turn it into an empty extent list. + */ +STATIC int +xrep_dir_swap_prep( + struct xfs_scrub *sc, + bool temp_local, + bool ip_local) +{ + int error; + + /* + * If the tempfile's directory is in shortform format, convert that to + * a single leaf extent so that we can use the atomic mapping exchange. + */ + if (temp_local) { + struct xfs_da_args args = { + .dp = sc->tempip, + .geo = sc->mp->m_dir_geo, + .whichfork = XFS_DATA_FORK, + .trans = sc->tp, + .total = 1, + .owner = sc->ip->i_ino, + }; + + error = xfs_dir2_sf_to_block(&args); + if (error) + return error; + + /* + * Roll the deferred log items to get us back to a clean + * transaction. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + /* + * If the file being repaired had a shortform data fork, convert that + * to an empty extent list in preparation for the atomic mapping + * exchange. + */ + if (ip_local) { + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + ifp->if_bytes = 0; + ifp->if_data = NULL; + ifp->if_height = 0; + + xfs_trans_log_inode(sc->tp, sc->ip, + XFS_ILOG_CORE | XFS_ILOG_DDATA); + } + + return 0; +} + +/* + * Replace the inode number of a directory entry. + */ +static int +xrep_dir_replace( + struct xrep_dir *rd, + struct xfs_inode *dp, + const struct xfs_name *name, + xfs_ino_t inum, + xfs_extlen_t total) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + + error = xfs_dir_ino_validate(sc->mp, inum); + if (error) + return error; + + xrep_dir_init_args(rd, dp, name); + rd->args.inumber = inum; + rd->args.total = total; + return xfs_dir_replace_args(&rd->args); +} + +/* + * Reset the link count of this directory and adjust the unlinked list pointers + * as needed. + */ +STATIC int +xrep_dir_set_nlink( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *dp = sc->ip; + struct xfs_perag *pag; + unsigned int new_nlink = min_t(unsigned long long, + rd->subdirs + 2, + XFS_NLINK_PINNED); + int error; + + /* + * The directory is not on the incore unlinked list, which means that + * it needs to be reachable via the directory tree. Update the nlink + * with our observed link count. If the directory has no parent, it + * will be moved to the orphanage. + */ + if (!xfs_inode_on_unlinked_list(dp)) + goto reset_nlink; + + /* + * The directory is on the unlinked list and we did not find any + * dirents. Set the link count to zero and let the directory + * inactivate when the last reference drops. + */ + if (rd->dirents == 0) { + rd->needs_adoption = false; + new_nlink = 0; + goto reset_nlink; + } + + /* + * The directory is on the unlinked list and we found dirents. This + * directory needs to be reachable via the directory tree. Remove the + * dir from the unlinked list and update nlink with the observed link + * count. If the directory has no parent, it will be moved to the + * orphanage. + */ + pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino)); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xfs_iunlink_remove(sc->tp, pag, dp); + xfs_perag_put(pag); + if (error) + return error; + +reset_nlink: + if (VFS_I(dp)->i_nlink != new_nlink) + set_nlink(VFS_I(dp), new_nlink); + return 0; +} + +/* + * Finish replaying stashed dirent updates, allocate a transaction for + * exchanging data fork mappings, and take the ILOCKs of both directories + * before we commit the new directory structure. + */ +STATIC int +xrep_dir_finalize_tempdir( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + if (!xfs_has_parent(sc->mp)) + return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); + + /* + * Repair relies on the ILOCK to quiesce all possible dirent updates. + * Replay all queued dirent updates into the tempdir before exchanging + * the contents, even if that means dropping the ILOCKs and the + * transaction. + */ + do { + error = xrep_dir_replay_updates(rd); + if (error) + return error; + + error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); + if (error) + return error; + + if (xfarray_length(rd->dir_entries) == 0) + break; + + xchk_trans_cancel(sc); + xrep_tempfile_iunlock_both(sc); + } while (!xchk_should_terminate(sc, &error)); + return error; +} + +/* Exchange the temporary directory's data fork with the one being repaired. */ +STATIC int +xrep_dir_swap( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + bool ip_local, temp_local; + int error = 0; + + /* + * If we never found the parent for this directory, temporarily assign + * the root dir as the parent; we'll move this to the orphanage after + * exchanging the dir contents. We hold the ILOCK of the dir being + * repaired, so we're not worried about racy updates of dotdot. + */ + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + if (rd->pscan.parent_ino == NULLFSINO) { + rd->needs_adoption = true; + rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino; + } + + /* + * Reset the temporary directory's '..' entry to point to the parent + * that we found. The temporary directory was created with the root + * directory as the parent, so we can skip this if repairing a + * subdirectory of the root. + * + * It's also possible that this replacement could also expand a sf + * tempdir into block format. + */ + if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) { + error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot, + rd->pscan.parent_ino, rd->tx.req.resblks); + if (error) + return error; + } + + /* + * Changing the dot and dotdot entries could have changed the shape of + * the directory, so we recompute these. + */ + ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + + /* + * If the both files have a local format data fork and the rebuilt + * directory data would fit in the repaired file's data fork, copy + * the contents from the tempfile and update the directory link count. + * We're done now. + */ + if (ip_local && temp_local && + sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) { + xrep_tempfile_copyout_local(sc, XFS_DATA_FORK); + return xrep_dir_set_nlink(rd); + } + + /* + * Clean the transaction before we start working on exchanging + * directory contents. + */ + error = xrep_tempfile_roll_trans(rd->sc); + if (error) + return error; + + /* Otherwise, make sure both data forks are in block-mapping mode. */ + error = xrep_dir_swap_prep(sc, temp_local, ip_local); + if (error) + return error; + + /* + * Set nlink of the directory in the same transaction sequence that + * (atomically) commits the new directory data. + */ + error = xrep_dir_set_nlink(rd); + if (error) + return error; + + return xrep_tempexch_contents(sc, &rd->tx); +} + +/* + * Exchange the new directory contents (which we created in the tempfile) with + * the directory being repaired. + */ +STATIC int +xrep_dir_rebuild_tree( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino); + + /* + * Take the IOLOCK on the temporary file so that we can run dir + * operations with the same locks held as we would for a normal file. + * We still hold sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rd->sc); + if (error) + return error; + + /* + * Allocate transaction, lock inodes, and make sure that we've replayed + * all the stashed dirent updates to the tempdir. After this point, + * we're ready to exchange data fork mappings. + */ + error = xrep_dir_finalize_tempdir(rd); + if (error) + return error; + + if (xchk_iscan_aborted(&rd->pscan.iscan)) + return -ECANCELED; + + /* + * Exchange the tempdir's data fork with the file being repaired. This + * recreates the transaction and re-takes the ILOCK in the scrub + * context. + */ + error = xrep_dir_swap(rd); + if (error) + return error; + + /* + * Release the old directory blocks and reset the data fork of the temp + * directory to an empty shortform directory because inactivation does + * nothing for directories. + */ + error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino); + if (error) + return error; + + /* + * Roll to get a transaction without any inodes joined to it. Then we + * can drop the tempfile's ILOCK and IOLOCK before doing more work on + * the scrub target directory. + */ + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + + xrep_tempfile_iunlock(sc); + xrep_tempfile_iounlock(sc); + return 0; +} + +/* Set up the filesystem scan so we can regenerate directory entries. */ +STATIC int +xrep_dir_setup_scan( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + char *descr; + int error; + + /* Set up some staging memory for salvaging dirents. */ + descr = xchk_xfile_ino_descr(sc, "directory entries"); + error = xfarray_create(descr, 0, sizeof(struct xrep_dirent), + &rd->dir_entries); + kfree(descr); + if (error) + return error; + + descr = xchk_xfile_ino_descr(sc, "directory entry names"); + error = xfblob_create(descr, &rd->dir_names); + kfree(descr); + if (error) + goto out_xfarray; + + if (xfs_has_parent(sc->mp)) + error = __xrep_findparent_scan_start(sc, &rd->pscan, + xrep_dir_live_update); + else + error = xrep_findparent_scan_start(sc, &rd->pscan); + if (error) + goto out_xfblob; + + return 0; + +out_xfblob: + xfblob_destroy(rd->dir_names); + rd->dir_names = NULL; +out_xfarray: + xfarray_destroy(rd->dir_entries); + rd->dir_entries = NULL; + return error; +} + +/* + * Move the current file to the orphanage. + * + * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon + * successful return, the scrub transaction will have enough extra reservation + * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the + * orphanage; and both inodes will be ijoined. + */ +STATIC int +xrep_dir_move_to_orphanage( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + xfs_ino_t orig_parent, new_parent; + int error; + + /* + * We are about to drop the ILOCK on sc->ip to lock the orphanage and + * prepare for the adoption. Therefore, look up the old dotdot entry + * for sc->ip so that we can compare it after we re-lock sc->ip. + */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent); + if (error) + return error; + + /* + * Drop the ILOCK on the scrub target and commit the transaction. + * Adoption computes its own resource requirements and gathers the + * necessary components. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* If we can take the orphanage's iolock then we're ready to move. */ + if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) { + xchk_iunlock(sc, sc->ilock_flags); + error = xrep_orphanage_iolock_two(sc); + if (error) + return error; + } + + /* Grab transaction and ILOCK the two files. */ + error = xrep_adoption_trans_alloc(sc, &rd->adoption); + if (error) + return error; + + error = xrep_adoption_compute_name(&rd->adoption, &rd->xname); + if (error) + return error; + + /* + * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot + * entry again. If the parent changed or the child was unlinked while + * the child directory was unlocked, we don't need to move the child to + * the orphanage after all. + */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent); + if (error) + return error; + + /* + * Attach to the orphanage if we still have a linked directory and it + * hasn't been moved. + */ + if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) { + error = xrep_adoption_move(&rd->adoption); + if (error) + return error; + } + + /* + * Launder the scrub transaction so we can drop the orphanage ILOCK + * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK. + */ + error = xrep_adoption_trans_roll(&rd->adoption); + if (error) + return error; + + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + return 0; +} + +/* + * Repair the directory metadata. + * + * XXX: Directory entry buffers can be multiple fsblocks in size. The buffer + * cache in XFS can't handle aliased multiblock buffers, so this might + * misbehave if the directory blocks are crosslinked with other filesystem + * metadata. + * + * XXX: Is it necessary to check the dcache for this directory to make sure + * that we always recreate every cached entry? + */ +int +xrep_directory( + struct xfs_scrub *sc) +{ + struct xrep_dir *rd = sc->buf; + int error; + + /* The rmapbt is required to reap the old data fork. */ + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + + error = xrep_dir_setup_scan(rd); + if (error) + return error; + + if (xfs_has_parent(sc->mp)) + error = xrep_dir_scan_dirtree(rd); + else + error = xrep_dir_salvage_entries(rd); + if (error) + goto out_teardown; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto out_teardown; + + error = xrep_dir_rebuild_tree(rd); + if (error) + goto out_teardown; + + if (rd->needs_adoption) { + if (!xrep_orphanage_can_adopt(rd->sc)) + error = -EFSCORRUPTED; + else + error = xrep_dir_move_to_orphanage(rd); + if (error) + goto out_teardown; + } + +out_teardown: + xrep_dir_teardown(sc); + return error; +} diff --git a/fs/xfs/scrub/dirtree.c b/fs/xfs/scrub/dirtree.c new file mode 100644 index 000000000000..bde58fb561ea --- /dev/null +++ b/fs/xfs/scrub/dirtree.c @@ -0,0 +1,985 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr.h" +#include "xfs_parent.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/bitmap.h" +#include "scrub/ino_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/listxattr.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/orphanage.h" +#include "scrub/dirtree.h" + +/* + * Directory Tree Structure Validation + * =================================== + * + * Validating the tree qualities of the directory tree structure can be + * difficult. If the tree is frozen, running a depth (or breadth) first search + * and marking a bitmap suffices to determine if there is a cycle. XORing the + * mark bitmap with the inode bitmap afterwards tells us if there are + * disconnected cycles. If the tree is not frozen, directory updates can move + * subtrees across the scanner wavefront, which complicates the design greatly. + * + * Directory parent pointers change that by enabling an incremental approach to + * validation of the tree structure. Instead of using one thread to scan the + * entire filesystem, we instead can have multiple threads walking individual + * subdirectories upwards to the root. In a perfect world, the IOLOCK would + * suffice to stabilize two directories in a parent -> child relationship. + * Unfortunately, the VFS does not take the IOLOCK when moving a child + * subdirectory, so we instead synchronize on ILOCK and use dirent update hooks + * to detect a race. If a race occurs in a path, we restart the scan. + * + * If the walk terminates without reaching the root, we know the path is + * disconnected and ought to be attached to the lost and found. If on the walk + * we find the same subdir that we're scanning, we know this is a cycle and + * should delete an incoming edge. If we find multiple paths to the root, we + * know to delete an incoming edge. + * + * There are two big hitches with this approach: first, all file link counts + * must be correct to prevent other writers from doing the wrong thing with the + * directory tree structure. Second, because we're walking upwards in a tree + * of arbitrary depth, we cannot hold all the ILOCKs. Instead, we will use a + * directory update hook to invalidate the scan results if one of the paths + * we've scanned has changed. + */ + +/* Clean up the dirtree checking resources. */ +STATIC void +xchk_dirtree_buf_cleanup( + void *buf) +{ + struct xchk_dirtree *dl = buf; + struct xchk_dirpath *path, *n; + + if (dl->scan_ino != NULLFSINO) + xfs_dir_hook_del(dl->sc->mp, &dl->dhook); + + xchk_dirtree_for_each_path_safe(dl, path, n) { + list_del_init(&path->list); + xino_bitmap_destroy(&path->seen_inodes); + kfree(path); + } + + xfblob_destroy(dl->path_names); + xfarray_destroy(dl->path_steps); + mutex_destroy(&dl->lock); +} + +/* Set us up to look for directory loops. */ +int +xchk_setup_dirtree( + struct xfs_scrub *sc) +{ + struct xchk_dirtree *dl; + char *descr; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + if (xchk_could_repair(sc)) { + error = xrep_setup_dirtree(sc); + if (error) + return error; + } + + dl = kvzalloc(sizeof(struct xchk_dirtree), XCHK_GFP_FLAGS); + if (!dl) + return -ENOMEM; + dl->sc = sc; + dl->xname.name = dl->namebuf; + dl->hook_xname.name = dl->hook_namebuf; + INIT_LIST_HEAD(&dl->path_list); + dl->root_ino = NULLFSINO; + dl->scan_ino = NULLFSINO; + dl->parent_ino = NULLFSINO; + + mutex_init(&dl->lock); + + descr = xchk_xfile_ino_descr(sc, "dirtree path steps"); + error = xfarray_create(descr, 0, sizeof(struct xchk_dirpath_step), + &dl->path_steps); + kfree(descr); + if (error) + goto out_dl; + + descr = xchk_xfile_ino_descr(sc, "dirtree path names"); + error = xfblob_create(descr, &dl->path_names); + kfree(descr); + if (error) + goto out_steps; + + error = xchk_setup_inode_contents(sc, 0); + if (error) + goto out_names; + + sc->buf = dl; + sc->buf_cleanup = xchk_dirtree_buf_cleanup; + return 0; + +out_names: + xfblob_destroy(dl->path_names); +out_steps: + xfarray_destroy(dl->path_steps); +out_dl: + mutex_destroy(&dl->lock); + kvfree(dl); + return error; +} + +/* + * Add the parent pointer described by @dl->pptr to the given path as a new + * step. Returns -ELNRNG if the path is too deep. + */ +int +xchk_dirpath_append( + struct xchk_dirtree *dl, + struct xfs_inode *ip, + struct xchk_dirpath *path, + const struct xfs_name *name, + const struct xfs_parent_rec *pptr) +{ + struct xchk_dirpath_step step = { + .pptr_rec = *pptr, /* struct copy */ + .name_len = name->len, + }; + int error; + + /* + * If this path is more than 2 billion steps long, this directory tree + * is too far gone to fix. + */ + if (path->nr_steps >= XFS_MAXLINK) + return -ELNRNG; + + error = xfblob_storename(dl->path_names, &step.name_cookie, name); + if (error) + return error; + + error = xino_bitmap_set(&path->seen_inodes, ip->i_ino); + if (error) + return error; + + error = xfarray_append(dl->path_steps, &step); + if (error) + return error; + + path->nr_steps++; + return 0; +} + +/* + * Create an xchk_path for each parent pointer of the directory that we're + * scanning. For each path created, we will eventually try to walk towards the + * root with the goal of deleting all parents except for one that leads to the + * root. + * + * Returns -EFSCORRUPTED to signal that the inode being scanned has a corrupt + * parent pointer and hence there's no point in continuing; or -ENOSR if there + * are too many parent pointers for this directory. + */ +STATIC int +xchk_dirtree_create_path( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + struct xchk_dirtree *dl = priv; + struct xchk_dirpath *path; + const struct xfs_parent_rec *rec = value; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, NULL, NULL); + if (error) + return error; + + /* + * If there are more than 2 billion actual parent pointers for this + * subdirectory, this fs is too far gone to fix. + */ + if (dl->nr_paths >= XFS_MAXLINK) + return -ENOSR; + + trace_xchk_dirtree_create_path(sc, ip, dl->nr_paths, &xname, rec); + + /* + * Create a new xchk_path structure to remember this parent pointer + * and record the first name step. + */ + path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS); + if (!path) + return -ENOMEM; + + INIT_LIST_HEAD(&path->list); + xino_bitmap_init(&path->seen_inodes); + path->nr_steps = 0; + path->outcome = XCHK_DIRPATH_SCANNING; + + error = xchk_dirpath_append(dl, sc->ip, path, &xname, rec); + if (error) + goto out_path; + + path->first_step = xfarray_length(dl->path_steps) - 1; + path->second_step = XFARRAY_NULLIDX; + path->path_nr = dl->nr_paths; + + list_add_tail(&path->list, &dl->path_list); + dl->nr_paths++; + return 0; +out_path: + kfree(path); + return error; +} + +/* + * Validate that the first step of this path still has a corresponding + * parent pointer in @sc->ip. We probably dropped @sc->ip's ILOCK while + * walking towards the roots, which is why this is necessary. + * + * This function has a side effect of loading the first parent pointer of this + * path into the parent pointer scratch pad. This prepares us to walk up the + * directory tree towards the root. Returns -ESTALE if the scan data is now + * out of date. + */ +STATIC int +xchk_dirpath_revalidate( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + /* + * Look up the parent pointer that corresponds to the start of this + * path. If the parent pointer has disappeared on us, dump all the + * scan results and try again. + */ + error = xfs_parent_lookup(sc->tp, sc->ip, &dl->xname, &dl->pptr_rec, + &dl->pptr_args); + if (error == -ENOATTR) { + trace_xchk_dirpath_disappeared(dl->sc, sc->ip, path->path_nr, + path->first_step, &dl->xname, &dl->pptr_rec); + dl->stale = true; + return -ESTALE; + } + + return error; +} + +/* + * Walk the parent pointers of a directory at the end of a path and record + * the parent that we find in @dl->xname/pptr_rec. + */ +STATIC int +xchk_dirpath_find_next_step( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xchk_dirtree *dl = priv; + const struct xfs_parent_rec *rec = value; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, NULL, NULL); + if (error) + return error; + + /* + * If we've already set @dl->pptr_rec, then this directory has multiple + * parents. Signal this back to the caller via -EMLINK. + */ + if (dl->parents_found > 0) + return -EMLINK; + + dl->parents_found++; + memcpy(dl->namebuf, name, namelen); + dl->xname.len = namelen; + dl->pptr_rec = *rec; /* struct copy */ + return 0; +} + +/* Set and log the outcome of a path walk. */ +static inline void +xchk_dirpath_set_outcome( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + enum xchk_dirpath_outcome outcome) +{ + trace_xchk_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps, + outcome); + + path->outcome = outcome; +} + +/* + * Scan the directory at the end of this path for its parent directory link. + * If we find one, extend the path. Returns -ESTALE if the scan data out of + * date. Returns -EFSCORRUPTED if the parent pointer is bad; or -ELNRNG if + * the path got too deep. + */ +STATIC int +xchk_dirpath_step_up( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xfs_scrub *sc = dl->sc; + struct xfs_inode *dp; + xfs_ino_t parent_ino = be64_to_cpu(dl->pptr_rec.p_ino); + unsigned int lock_mode; + int error; + + /* Grab and lock the parent directory. */ + error = xchk_iget(sc, parent_ino, &dp); + if (error) + return error; + + lock_mode = xfs_ilock_attr_map_shared(dp); + mutex_lock(&dl->lock); + + if (dl->stale) { + error = -ESTALE; + goto out_scanlock; + } + + /* We've reached the root directory; the path is ok. */ + if (parent_ino == dl->root_ino) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_OK); + error = 0; + goto out_scanlock; + } + + /* + * The inode being scanned is its own distant ancestor! Get rid of + * this path. + */ + if (parent_ino == sc->ip->i_ino) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + error = 0; + goto out_scanlock; + } + + /* + * We've seen this inode before during the path walk. There's a loop + * above us in the directory tree. This probably means that we cannot + * continue, but let's keep walking paths to get a full picture. + */ + if (xino_bitmap_test(&path->seen_inodes, parent_ino)) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_LOOP); + error = 0; + goto out_scanlock; + } + + /* The handle encoded in the parent pointer must match. */ + if (VFS_I(dp)->i_generation != be32_to_cpu(dl->pptr_rec.p_gen)) { + trace_xchk_dirpath_badgen(dl->sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + error = -EFSCORRUPTED; + goto out_scanlock; + } + + /* Parent pointer must point up to a directory. */ + if (!S_ISDIR(VFS_I(dp)->i_mode)) { + trace_xchk_dirpath_nondir_parent(dl->sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + error = -EFSCORRUPTED; + goto out_scanlock; + } + + /* Parent cannot be an unlinked directory. */ + if (VFS_I(dp)->i_nlink == 0) { + trace_xchk_dirpath_unlinked_parent(dl->sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + error = -EFSCORRUPTED; + goto out_scanlock; + } + + /* + * If the extended attributes look as though they has been zapped by + * the inode record repair code, we cannot scan for parent pointers. + */ + if (xchk_pptr_looks_zapped(dp)) { + error = -EBUSY; + xchk_set_incomplete(sc); + goto out_scanlock; + } + + /* + * Walk the parent pointers of @dp to find the parent of this directory + * to find the next step in our walk. If we find that @dp has exactly + * one parent, the parent pointer information will be stored in + * @dl->pptr_rec. This prepares us for the next step of the walk. + */ + mutex_unlock(&dl->lock); + dl->parents_found = 0; + error = xchk_xattr_walk(sc, dp, xchk_dirpath_find_next_step, NULL, dl); + mutex_lock(&dl->lock); + if (error == -EFSCORRUPTED || error == -EMLINK || + (!error && dl->parents_found == 0)) { + /* + * Further up the directory tree from @sc->ip, we found a + * corrupt parent pointer, multiple parent pointers while + * finding this directory's parent, or zero parents despite + * having a nonzero link count. Keep looking for other paths. + */ + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT); + error = 0; + goto out_scanlock; + } + if (error) + goto out_scanlock; + + if (dl->stale) { + error = -ESTALE; + goto out_scanlock; + } + + trace_xchk_dirpath_found_next_step(sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + + /* Append to the path steps */ + error = xchk_dirpath_append(dl, dp, path, &dl->xname, &dl->pptr_rec); + if (error) + goto out_scanlock; + + if (path->second_step == XFARRAY_NULLIDX) + path->second_step = xfarray_length(dl->path_steps) - 1; + +out_scanlock: + mutex_unlock(&dl->lock); + xfs_iunlock(dp, lock_mode); + xchk_irele(sc, dp); + return error; +} + +/* + * Walk the directory tree upwards towards what is hopefully the root + * directory, recording path steps as we go. The current path components are + * stored in dl->pptr_rec and dl->xname. + * + * Returns -ESTALE if the scan data are out of date. Returns -EFSCORRUPTED + * only if the direct parent pointer of @sc->ip associated with this path is + * corrupt. + */ +STATIC int +xchk_dirpath_walk_upwards( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + /* Reload the start of this path and make sure it's still there. */ + error = xchk_dirpath_revalidate(dl, path); + if (error) + return error; + + trace_xchk_dirpath_walk_upwards(sc, sc->ip, path->path_nr, &dl->xname, + &dl->pptr_rec); + + /* + * The inode being scanned is its own direct ancestor! + * Get rid of this path. + */ + if (be64_to_cpu(dl->pptr_rec.p_ino) == sc->ip->i_ino) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + return 0; + } + + /* + * Drop ILOCK_EXCL on the inode being scanned. We still hold + * IOLOCK_EXCL on it, so it cannot move around or be renamed. + * + * Beyond this point we're walking up the directory tree, which means + * that we can acquire and drop the ILOCK on an alias of sc->ip. The + * ILOCK state is no longer tracked in the scrub context. Hence we + * must drop @sc->ip's ILOCK during the walk. + */ + mutex_unlock(&dl->lock); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* + * Take the first step in the walk towards the root by checking the + * start of this path, which is a direct parent pointer of @sc->ip. + * If we see any kind of error here (including corruptions), the parent + * pointer of @sc->ip is corrupt. Stop the whole scan. + */ + error = xchk_dirpath_step_up(dl, path); + if (error) { + xchk_ilock(sc, XFS_ILOCK_EXCL); + mutex_lock(&dl->lock); + return error; + } + + /* + * Take steps upward from the second step in this path towards the + * root. If we hit corruption errors here, there's a problem + * *somewhere* in the path, but we don't need to stop scanning. + */ + while (!error && path->outcome == XCHK_DIRPATH_SCANNING) + error = xchk_dirpath_step_up(dl, path); + + /* Retake the locks we had, mark paths, etc. */ + xchk_ilock(sc, XFS_ILOCK_EXCL); + mutex_lock(&dl->lock); + if (error == -EFSCORRUPTED) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT); + error = 0; + } + if (!error && dl->stale) + return -ESTALE; + return error; +} + +/* + * Decide if this path step has been touched by this live update. Returns + * 1 for yes, 0 for no, or a negative errno. + */ +STATIC int +xchk_dirpath_step_is_stale( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + unsigned int step_nr, + xfarray_idx_t step_idx, + struct xfs_dir_update_params *p, + xfs_ino_t *cursor) +{ + struct xchk_dirpath_step step; + xfs_ino_t child_ino = *cursor; + int error; + + error = xfarray_load(dl->path_steps, step_idx, &step); + if (error) + return error; + *cursor = be64_to_cpu(step.pptr_rec.p_ino); + + /* + * If the parent and child being updated are not the ones mentioned in + * this path step, the scan data is still ok. + */ + if (p->ip->i_ino != child_ino || p->dp->i_ino != *cursor) + return 0; + + /* + * If the dirent name lengths or byte sequences are different, the scan + * data is still ok. + */ + if (p->name->len != step.name_len) + return 0; + + error = xfblob_loadname(dl->path_names, step.name_cookie, + &dl->hook_xname, step.name_len); + if (error) + return error; + + if (memcmp(dl->hook_xname.name, p->name->name, p->name->len) != 0) + return 0; + + /* + * If the update comes from the repair code itself, walk the state + * machine forward. + */ + if (p->ip->i_ino == dl->scan_ino && + path->outcome == XREP_DIRPATH_ADOPTING) { + xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_ADOPTED); + return 0; + } + + if (p->ip->i_ino == dl->scan_ino && + path->outcome == XREP_DIRPATH_DELETING) { + xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETED); + return 0; + } + + /* Exact match, scan data is out of date. */ + trace_xchk_dirpath_changed(dl->sc, path->path_nr, step_nr, p->dp, + p->ip, p->name); + return 1; +} + +/* + * Decide if this path has been touched by this live update. Returns 1 for + * yes, 0 for no, or a negative errno. + */ +STATIC int +xchk_dirpath_is_stale( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + struct xfs_dir_update_params *p) +{ + xfs_ino_t cursor = dl->scan_ino; + xfarray_idx_t idx = path->first_step; + unsigned int i; + int ret; + + /* + * The child being updated has not been seen by this path at all; this + * path cannot be stale. + */ + if (!xino_bitmap_test(&path->seen_inodes, p->ip->i_ino)) + return 0; + + ret = xchk_dirpath_step_is_stale(dl, path, 0, idx, p, &cursor); + if (ret != 0) + return ret; + + for (i = 1, idx = path->second_step; i < path->nr_steps; i++, idx++) { + ret = xchk_dirpath_step_is_stale(dl, path, i, idx, p, &cursor); + if (ret != 0) + return ret; + } + + return 0; +} + +/* + * Decide if a directory update from the regular filesystem touches any of the + * paths we've scanned, and invalidate the scan data if true. + */ +STATIC int +xchk_dirtree_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xchk_dirtree *dl; + struct xchk_dirpath *path; + int ret; + + dl = container_of(nb, struct xchk_dirtree, dhook.dirent_hook.nb); + + trace_xchk_dirtree_live_update(dl->sc, p->dp, action, p->ip, p->delta, + p->name); + + mutex_lock(&dl->lock); + + if (dl->stale || dl->aborted) + goto out_unlock; + + xchk_dirtree_for_each_path(dl, path) { + ret = xchk_dirpath_is_stale(dl, path, p); + if (ret < 0) { + dl->aborted = true; + break; + } + if (ret == 1) { + dl->stale = true; + break; + } + } + +out_unlock: + mutex_unlock(&dl->lock); + return NOTIFY_DONE; +} + +/* Delete all the collected path information. */ +STATIC void +xchk_dirtree_reset( + void *buf) +{ + struct xchk_dirtree *dl = buf; + struct xchk_dirpath *path, *n; + + ASSERT(dl->sc->ilock_flags & XFS_ILOCK_EXCL); + + xchk_dirtree_for_each_path_safe(dl, path, n) { + list_del_init(&path->list); + xino_bitmap_destroy(&path->seen_inodes); + kfree(path); + } + dl->nr_paths = 0; + + xfarray_truncate(dl->path_steps); + xfblob_truncate(dl->path_names); + + dl->stale = false; +} + +/* + * Load the name/pptr from the first step in this path into @dl->pptr_rec and + * @dl->xname. + */ +STATIC int +xchk_dirtree_load_path( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xchk_dirpath_step step; + int error; + + error = xfarray_load(dl->path_steps, path->first_step, &step); + if (error) + return error; + + error = xfblob_loadname(dl->path_names, step.name_cookie, &dl->xname, + step.name_len); + if (error) + return error; + + dl->pptr_rec = step.pptr_rec; /* struct copy */ + return 0; +} + +/* + * For each parent pointer of this subdir, trace a path upwards towards the + * root directory and record what we find. Returns 0 for success; + * -EFSCORRUPTED if walking the parent pointers of @sc->ip failed, -ELNRNG if a + * path was too deep; -ENOSR if there were too many parent pointers; or + * a negative errno. + */ +int +xchk_dirtree_find_paths_to_root( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + struct xchk_dirpath *path; + int error = 0; + + do { + if (xchk_should_terminate(sc, &error)) + return error; + + xchk_dirtree_reset(dl); + + /* + * If the extended attributes look as though they has been + * zapped by the inode record repair code, we cannot scan for + * parent pointers. + */ + if (xchk_pptr_looks_zapped(sc->ip)) { + xchk_set_incomplete(sc); + return -EBUSY; + } + + /* + * Create path walk contexts for each parent of the directory + * that is being scanned. Directories are supposed to have + * only one parent, but this is how we detect multiple parents. + */ + error = xchk_xattr_walk(sc, sc->ip, xchk_dirtree_create_path, + NULL, dl); + if (error) + return error; + + xchk_dirtree_for_each_path(dl, path) { + /* Load path components into dl->pptr/xname */ + error = xchk_dirtree_load_path(dl, path); + if (error) + return error; + + /* + * Try to walk up each path to the root. This enables + * us to find directory loops in ancestors, and the + * like. + */ + error = xchk_dirpath_walk_upwards(dl, path); + if (error == -EFSCORRUPTED) { + /* + * A parent pointer of @sc->ip is bad, don't + * bother continuing. + */ + break; + } + if (error == -ESTALE) { + /* This had better be an invalidation. */ + ASSERT(dl->stale); + break; + } + if (error) + return error; + if (dl->aborted) + return 0; + } + } while (dl->stale); + + return error; +} + +/* + * Figure out what to do with the paths we tried to find. Do not call this + * if the scan results are stale. + */ +void +xchk_dirtree_evaluate( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + + ASSERT(!dl->stale); + + /* Scan the paths we have to decide what to do. */ + memset(oc, 0, sizeof(struct xchk_dirtree_outcomes)); + xchk_dirtree_for_each_path(dl, path) { + trace_xchk_dirpath_evaluate_path(dl->sc, path->path_nr, + path->nr_steps, path->outcome); + + switch (path->outcome) { + case XCHK_DIRPATH_SCANNING: + /* shouldn't get here */ + ASSERT(0); + break; + case XCHK_DIRPATH_DELETE: + /* This one is already going away. */ + oc->bad++; + break; + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + /* Couldn't find the end of this path. */ + oc->suspect++; + break; + case XCHK_DIRPATH_STALE: + /* shouldn't get here either */ + ASSERT(0); + break; + case XCHK_DIRPATH_OK: + /* This path got all the way to the root. */ + oc->good++; + break; + case XREP_DIRPATH_DELETING: + case XREP_DIRPATH_DELETED: + case XREP_DIRPATH_ADOPTING: + case XREP_DIRPATH_ADOPTED: + /* These should not be in progress! */ + ASSERT(0); + break; + } + } + + trace_xchk_dirtree_evaluate(dl, oc); +} + +/* Look for directory loops. */ +int +xchk_dirtree( + struct xfs_scrub *sc) +{ + struct xchk_dirtree_outcomes oc; + struct xchk_dirtree *dl = sc->buf; + int error; + + /* + * Nondirectories do not point downwards to other files, so they cannot + * cause a cycle in the directory tree. + */ + if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) + return -ENOENT; + + ASSERT(xfs_has_parent(sc->mp)); + + /* + * Find the root of the directory tree. Remember which directory to + * scan, because the hook doesn't detach until after sc->ip gets + * released during teardown. + */ + dl->root_ino = sc->mp->m_rootip->i_ino; + dl->scan_ino = sc->ip->i_ino; + + trace_xchk_dirtree_start(sc->ip, sc->sm, 0); + + /* + * Hook into the directory entry code so that we can capture updates to + * paths that we have already scanned. The scanner thread takes each + * directory's ILOCK, which means that any in-progress directory update + * will finish before we can scan the directory. + */ + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + xfs_dir_hook_setup(&dl->dhook, xchk_dirtree_live_update); + error = xfs_dir_hook_add(sc->mp, &dl->dhook); + if (error) + goto out; + + mutex_lock(&dl->lock); + + /* Trace each parent pointer's path to the root. */ + error = xchk_dirtree_find_paths_to_root(dl); + if (error == -EFSCORRUPTED || error == -ELNRNG || error == -ENOSR) { + /* + * Don't bother walking the paths if the xattr structure or the + * parent pointers are corrupt; this scan cannot be completed + * without full information. + */ + xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); + error = 0; + goto out_scanlock; + } + if (error == -EBUSY) { + /* + * We couldn't scan some directory's parent pointers because + * the attr fork looked like it had been zapped. The + * scan was marked incomplete, so no further error code + * is necessary. + */ + error = 0; + goto out_scanlock; + } + if (error) + goto out_scanlock; + if (dl->aborted) { + xchk_set_incomplete(sc); + goto out_scanlock; + } + + /* Assess what we found in our path evaluation. */ + xchk_dirtree_evaluate(dl, &oc); + if (xchk_dirtree_parentless(dl)) { + if (oc.good || oc.bad || oc.suspect) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } else { + if (oc.bad || oc.good + oc.suspect != 1) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + if (oc.suspect) + xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); + } + +out_scanlock: + mutex_unlock(&dl->lock); +out: + trace_xchk_dirtree_done(sc->ip, sc->sm, error); + return error; +} diff --git a/fs/xfs/scrub/dirtree.h b/fs/xfs/scrub/dirtree.h new file mode 100644 index 000000000000..1e1686365c61 --- /dev/null +++ b/fs/xfs/scrub/dirtree.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_DIRTREE_H__ +#define __XFS_SCRUB_DIRTREE_H__ + +/* + * Each of these represents one parent pointer path step in a chain going + * up towards the directory tree root. These are stored inside an xfarray. + */ +struct xchk_dirpath_step { + /* Directory entry name associated with this parent link. */ + xfblob_cookie name_cookie; + unsigned int name_len; + + /* Handle of the parent directory. */ + struct xfs_parent_rec pptr_rec; +}; + +enum xchk_dirpath_outcome { + XCHK_DIRPATH_SCANNING = 0, /* still being put together */ + XCHK_DIRPATH_DELETE, /* delete this path */ + XCHK_DIRPATH_CORRUPT, /* corruption detected in path */ + XCHK_DIRPATH_LOOP, /* cycle detected further up */ + XCHK_DIRPATH_STALE, /* path is stale */ + XCHK_DIRPATH_OK, /* path reaches the root */ + + XREP_DIRPATH_DELETING, /* path is being deleted */ + XREP_DIRPATH_DELETED, /* path has been deleted */ + XREP_DIRPATH_ADOPTING, /* path is being adopted */ + XREP_DIRPATH_ADOPTED, /* path has been adopted */ +}; + +/* + * Each of these represents one parent pointer path out of the directory being + * scanned. These exist in-core, and hopefully there aren't more than a + * handful of them. + */ +struct xchk_dirpath { + struct list_head list; + + /* Index of the first step in this path. */ + xfarray_idx_t first_step; + + /* Index of the second step in this path. */ + xfarray_idx_t second_step; + + /* Inodes seen while walking this path. */ + struct xino_bitmap seen_inodes; + + /* Number of steps in this path. */ + unsigned int nr_steps; + + /* Which path is this? */ + unsigned int path_nr; + + /* What did we conclude from following this path? */ + enum xchk_dirpath_outcome outcome; +}; + +struct xchk_dirtree_outcomes { + /* Number of XCHK_DIRPATH_DELETE */ + unsigned int bad; + + /* Number of XCHK_DIRPATH_CORRUPT or XCHK_DIRPATH_LOOP */ + unsigned int suspect; + + /* Number of XCHK_DIRPATH_OK */ + unsigned int good; + + /* Directory needs to be added to lost+found */ + bool needs_adoption; +}; + +struct xchk_dirtree { + struct xfs_scrub *sc; + + /* Root inode that we're looking for. */ + xfs_ino_t root_ino; + + /* + * This is the inode that we're scanning. The live update hook can + * continue to be called after xchk_teardown drops sc->ip but before + * it calls buf_cleanup, so we keep a copy. + */ + xfs_ino_t scan_ino; + + /* + * If we start deleting redundant paths to this subdirectory, this is + * the inode number of the surviving parent and the dotdot entry will + * be set to this value. If the value is NULLFSINO, then use @root_ino + * as a stand-in until the orphanage can adopt the subdirectory. + */ + xfs_ino_t parent_ino; + + /* Scratch buffer for scanning pptr xattrs */ + struct xfs_parent_rec pptr_rec; + struct xfs_da_args pptr_args; + + /* Name buffer */ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; + + /* Information for reparenting this directory. */ + struct xrep_adoption adoption; + + /* + * Hook into directory updates so that we can receive live updates + * from other writer threads. + */ + struct xfs_dir_hook dhook; + + /* Parent pointer update arguments. */ + struct xfs_parent_args ppargs; + + /* lock for everything below here */ + struct mutex lock; + + /* buffer for the live update functions to use for dirent names */ + struct xfs_name hook_xname; + unsigned char hook_namebuf[MAXNAMELEN]; + + /* + * All path steps observed during this scan. Each of the path + * steps for a particular pathwalk are recorded in sequential + * order in the xfarray. A pathwalk ends either with a step + * pointing to the root directory (success) or pointing to NULLFSINO + * (loop detected, empty dir detected, etc). + */ + struct xfarray *path_steps; + + /* All names observed during this scan. */ + struct xfblob *path_names; + + /* All paths being tracked by this scanner. */ + struct list_head path_list; + + /* Number of paths in path_list. */ + unsigned int nr_paths; + + /* Number of parents found by a pptr scan. */ + unsigned int parents_found; + + /* Have the path data been invalidated by a concurrent update? */ + bool stale:1; + + /* Has the scan been aborted? */ + bool aborted:1; +}; + +#define xchk_dirtree_for_each_path_safe(dl, path, n) \ + list_for_each_entry_safe((path), (n), &(dl)->path_list, list) + +#define xchk_dirtree_for_each_path(dl, path) \ + list_for_each_entry((path), &(dl)->path_list, list) + +static inline bool +xchk_dirtree_parentless(const struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + + if (sc->ip == sc->mp->m_rootip) + return true; + if (VFS_I(sc->ip)->i_nlink == 0) + return true; + return false; +} + +int xchk_dirtree_find_paths_to_root(struct xchk_dirtree *dl); +int xchk_dirpath_append(struct xchk_dirtree *dl, struct xfs_inode *ip, + struct xchk_dirpath *path, const struct xfs_name *name, + const struct xfs_parent_rec *pptr); +void xchk_dirtree_evaluate(struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc); + +#endif /* __XFS_SCRUB_DIRTREE_H__ */ diff --git a/fs/xfs/scrub/dirtree_repair.c b/fs/xfs/scrub/dirtree_repair.c new file mode 100644 index 000000000000..5c04e70ba951 --- /dev/null +++ b/fs/xfs/scrub/dirtree_repair.c @@ -0,0 +1,821 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_trans_space.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr.h" +#include "xfs_parent.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/bitmap.h" +#include "scrub/ino_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/listxattr.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/orphanage.h" +#include "scrub/dirtree.h" +#include "scrub/readdir.h" + +/* + * Directory Tree Structure Repairs + * ================================ + * + * If we decide that the directory being scanned is participating in a + * directory loop, the only change we can make is to remove directory entries + * pointing down to @sc->ip. If that leaves it with no parents, the directory + * should be adopted by the orphanage. + */ + +/* Set up to repair directory loops. */ +int +xrep_setup_dirtree( + struct xfs_scrub *sc) +{ + return xrep_orphanage_try_create(sc); +} + +/* Change the outcome of this path. */ +static inline void +xrep_dirpath_set_outcome( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + enum xchk_dirpath_outcome outcome) +{ + trace_xrep_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps, + outcome); + + path->outcome = outcome; +} + +/* Delete all paths. */ +STATIC void +xrep_dirtree_delete_all_paths( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + oc->suspect--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + case XCHK_DIRPATH_OK: + oc->good--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + default: + break; + } + } + + ASSERT(oc->suspect == 0); + ASSERT(oc->good == 0); +} + +/* Since this is the surviving path, set the dotdot entry to this value. */ +STATIC void +xrep_dirpath_retain_parent( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xchk_dirpath_step step; + int error; + + error = xfarray_load(dl->path_steps, path->first_step, &step); + if (error) + return; + + dl->parent_ino = be64_to_cpu(step.pptr_rec.p_ino); +} + +/* Find the one surviving path so we know how to set dotdot. */ +STATIC void +xrep_dirtree_find_surviving_path( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + bool foundit = false; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + case XCHK_DIRPATH_OK: + if (!foundit) { + xrep_dirpath_retain_parent(dl, path); + foundit = true; + continue; + } + ASSERT(foundit == false); + break; + default: + break; + } + } + + ASSERT(oc->suspect + oc->good == 1); +} + +/* Delete all paths except for the one good one. */ +STATIC void +xrep_dirtree_keep_one_good_path( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + bool foundit = false; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + oc->suspect--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + case XCHK_DIRPATH_OK: + if (!foundit) { + xrep_dirpath_retain_parent(dl, path); + foundit = true; + continue; + } + oc->good--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + default: + break; + } + } + + ASSERT(oc->suspect == 0); + ASSERT(oc->good < 2); +} + +/* Delete all paths except for one suspect one. */ +STATIC void +xrep_dirtree_keep_one_suspect_path( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + bool foundit = false; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + if (!foundit) { + xrep_dirpath_retain_parent(dl, path); + foundit = true; + continue; + } + oc->suspect--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + case XCHK_DIRPATH_OK: + ASSERT(0); + break; + default: + break; + } + } + + ASSERT(oc->suspect == 1); + ASSERT(oc->good == 0); +} + +/* + * Figure out what to do with the paths we tried to find. Returns -EDEADLOCK + * if the scan results have become stale. + */ +STATIC void +xrep_dirtree_decide_fate( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + xchk_dirtree_evaluate(dl, oc); + + /* Parentless directories should not have any paths at all. */ + if (xchk_dirtree_parentless(dl)) { + xrep_dirtree_delete_all_paths(dl, oc); + return; + } + + /* One path is exactly the number of paths we want. */ + if (oc->good + oc->suspect == 1) { + xrep_dirtree_find_surviving_path(dl, oc); + return; + } + + /* Zero paths means we should reattach the subdir to the orphanage. */ + if (oc->good + oc->suspect == 0) { + if (dl->sc->orphanage) + oc->needs_adoption = true; + return; + } + + /* + * Otherwise, this subdirectory has too many parents. If there's at + * least one good path, keep it and delete the others. + */ + if (oc->good > 0) { + xrep_dirtree_keep_one_good_path(dl, oc); + return; + } + + /* + * There are no good paths and there are too many suspect paths. + * Keep the first suspect path and delete the rest. + */ + xrep_dirtree_keep_one_suspect_path(dl, oc); +} + +/* + * Load the first step of this path into @step and @dl->xname/pptr + * for later repair work. + */ +STATIC int +xrep_dirtree_prep_path( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + struct xchk_dirpath_step *step) +{ + int error; + + error = xfarray_load(dl->path_steps, path->first_step, step); + if (error) + return error; + + error = xfblob_loadname(dl->path_names, step->name_cookie, &dl->xname, + step->name_len); + if (error) + return error; + + dl->pptr_rec = step->pptr_rec; /* struct copy */ + return 0; +} + +/* Delete the VFS dentry for a removed child. */ +STATIC int +xrep_dirtree_purge_dentry( + struct xchk_dirtree *dl, + struct xfs_inode *dp, + const struct xfs_name *name) +{ + struct qstr qname = QSTR_INIT(name->name, name->len); + struct dentry *parent_dentry, *child_dentry; + int error = 0; + + /* + * Find the dentry for the parent directory. If there isn't one, we're + * done. Caller already holds i_rwsem for parent and child. + */ + parent_dentry = d_find_alias(VFS_I(dp)); + if (!parent_dentry) + return 0; + + /* The VFS thinks the parent is a directory, right? */ + if (!d_is_dir(parent_dentry)) { + ASSERT(d_is_dir(parent_dentry)); + error = -EFSCORRUPTED; + goto out_dput_parent; + } + + /* + * Try to find the dirent pointing to the child. If there isn't one, + * we're done. + */ + qname.hash = full_name_hash(parent_dentry, name->name, name->len); + child_dentry = d_lookup(parent_dentry, &qname); + if (!child_dentry) { + error = 0; + goto out_dput_parent; + } + + trace_xrep_dirtree_delete_child(dp->i_mount, child_dentry); + + /* Child is not a directory? We're screwed. */ + if (!d_is_dir(child_dentry)) { + ASSERT(d_is_dir(child_dentry)); + error = -EFSCORRUPTED; + goto out_dput_child; + } + + /* Replace the child dentry with a negative one. */ + d_delete(child_dentry); + +out_dput_child: + dput(child_dentry); +out_dput_parent: + dput(parent_dentry); + return error; +} + +/* + * Prepare to delete a link by taking the IOLOCK of the parent and the child + * (scrub target). Caller must hold IOLOCK_EXCL on @sc->ip. Returns 0 if we + * took both locks, or a negative errno if we couldn't lock the parent in time. + */ +static inline int +xrep_dirtree_unlink_iolock( + struct xfs_scrub *sc, + struct xfs_inode *dp) +{ + int error; + + ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL); + + if (xfs_ilock_nowait(dp, XFS_IOLOCK_EXCL)) + return 0; + + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + do { + xfs_ilock(dp, XFS_IOLOCK_EXCL); + if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + break; + xfs_iunlock(dp, XFS_IOLOCK_EXCL); + + if (xchk_should_terminate(sc, &error)) { + xchk_ilock(sc, XFS_IOLOCK_EXCL); + return error; + } + + delay(1); + } while (1); + + return 0; +} + +/* + * Remove a link from the directory tree and update the dcache. Returns + * -ESTALE if the scan data are now out of date. + */ +STATIC int +xrep_dirtree_unlink( + struct xchk_dirtree *dl, + struct xfs_inode *dp, + struct xchk_dirpath *path, + struct xchk_dirpath_step *step) +{ + struct xfs_scrub *sc = dl->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t dotdot_ino; + xfs_ino_t parent_ino = dl->parent_ino; + unsigned int resblks; + int dontcare; + int error; + + /* Take IOLOCK_EXCL of the parent and child. */ + error = xrep_dirtree_unlink_iolock(sc, dp); + if (error) + return error; + + /* + * Create the transaction that we need to sever the path. Ignore + * EDQUOT and ENOSPC being returned via nospace_error because the + * directory code can handle a reservationless update. + */ + resblks = xfs_remove_space_res(mp, step->name_len); + error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, sc->ip, + &resblks, &sc->tp, &dontcare); + if (error) + goto out_iolock; + + /* + * Cancel if someone invalidate the paths while we were trying to get + * the ILOCK. + */ + mutex_lock(&dl->lock); + if (dl->stale) { + mutex_unlock(&dl->lock); + error = -ESTALE; + goto out_trans_cancel; + } + xrep_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETING); + mutex_unlock(&dl->lock); + + trace_xrep_dirtree_delete_path(dl->sc, sc->ip, path->path_nr, + &dl->xname, &dl->pptr_rec); + + /* + * Decide if we need to reset the dotdot entry. Rules: + * + * - If there's a surviving parent, we want dotdot to point there. + * - If we don't have any surviving parents, then point dotdot at the + * root dir. + * - If dotdot is already set to the value we want, pass in NULLFSINO + * for no change necessary. + * + * Do this /before/ we dirty anything, in case the dotdot lookup + * fails. + */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &dotdot_ino); + if (error) + goto out_trans_cancel; + if (parent_ino == NULLFSINO) + parent_ino = dl->root_ino; + if (dotdot_ino == parent_ino) + parent_ino = NULLFSINO; + + /* Drop the link from sc->ip's dotdot entry. */ + error = xfs_droplink(sc->tp, dp); + if (error) + goto out_trans_cancel; + + /* Reset the dotdot entry to a surviving parent. */ + if (parent_ino != NULLFSINO) { + error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot, + parent_ino, 0); + if (error) + goto out_trans_cancel; + } + + /* Drop the link from dp to sc->ip. */ + error = xfs_droplink(sc->tp, sc->ip); + if (error) + goto out_trans_cancel; + + error = xfs_dir_removename(sc->tp, dp, &dl->xname, sc->ip->i_ino, + resblks); + if (error) { + ASSERT(error != -ENOENT); + goto out_trans_cancel; + } + + if (xfs_has_parent(sc->mp)) { + error = xfs_parent_removename(sc->tp, &dl->ppargs, dp, + &dl->xname, sc->ip); + if (error) + goto out_trans_cancel; + } + + /* + * Notify dirent hooks that we removed the bad link, invalidate the + * dcache, and commit the repair. + */ + xfs_dir_update_hook(dp, sc->ip, -1, &dl->xname); + error = xrep_dirtree_purge_dentry(dl, dp, &dl->xname); + if (error) + goto out_trans_cancel; + + error = xrep_trans_commit(sc); + goto out_ilock; + +out_trans_cancel: + xchk_trans_cancel(sc); +out_ilock: + xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); +out_iolock: + xfs_iunlock(dp, XFS_IOLOCK_EXCL); + return error; +} + +/* + * Delete a directory entry that points to this directory. Returns -ESTALE + * if the scan data are now out of date. + */ +STATIC int +xrep_dirtree_delete_path( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xchk_dirpath_step step; + struct xfs_scrub *sc = dl->sc; + struct xfs_inode *dp; + int error; + + /* + * Load the parent pointer and directory inode for this path, then + * drop the scan lock, the ILOCK, and the transaction so that + * _delete_path can reserve the proper transaction. This sets up + * @dl->xname for the deletion. + */ + error = xrep_dirtree_prep_path(dl, path, &step); + if (error) + return error; + + error = xchk_iget(sc, be64_to_cpu(step.pptr_rec.p_ino), &dp); + if (error) + return error; + + mutex_unlock(&dl->lock); + xchk_trans_cancel(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* Delete the directory link and release the parent. */ + error = xrep_dirtree_unlink(dl, dp, path, &step); + xchk_irele(sc, dp); + + /* + * Retake all the resources we had at the beginning even if the repair + * failed or the scan data are now stale. This keeps things simple for + * the caller. + */ + xchk_trans_alloc_empty(sc); + xchk_ilock(sc, XFS_ILOCK_EXCL); + mutex_lock(&dl->lock); + + if (!error && dl->stale) + error = -ESTALE; + return error; +} + +/* Add a new path to represent our in-progress adoption. */ +STATIC int +xrep_dirtree_create_adoption_path( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + struct xchk_dirpath *path; + int error; + + /* + * We should have capped the number of paths at XFS_MAXLINK-1 in the + * scanner. + */ + if (dl->nr_paths > XFS_MAXLINK) { + ASSERT(dl->nr_paths <= XFS_MAXLINK); + return -EFSCORRUPTED; + } + + /* + * Create a new xchk_path structure to remember this parent pointer + * and record the first name step. + */ + path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS); + if (!path) + return -ENOMEM; + + INIT_LIST_HEAD(&path->list); + xino_bitmap_init(&path->seen_inodes); + path->nr_steps = 0; + path->outcome = XREP_DIRPATH_ADOPTING; + + /* + * Record the new link that we just created in the orphanage. Because + * adoption is the last repair that we perform, we don't bother filling + * in the path all the way back to the root. + */ + xfs_inode_to_parent_rec(&dl->pptr_rec, sc->orphanage); + + error = xino_bitmap_set(&path->seen_inodes, sc->orphanage->i_ino); + if (error) + goto out_path; + + trace_xrep_dirtree_create_adoption(sc, sc->ip, dl->nr_paths, + &dl->xname, &dl->pptr_rec); + + error = xchk_dirpath_append(dl, sc->ip, path, &dl->xname, + &dl->pptr_rec); + if (error) + goto out_path; + + path->first_step = xfarray_length(dl->path_steps) - 1; + path->second_step = XFARRAY_NULLIDX; + path->path_nr = dl->nr_paths; + + list_add_tail(&path->list, &dl->path_list); + dl->nr_paths++; + return 0; + +out_path: + kfree(path); + return error; +} + +/* + * Prepare to move a file to the orphanage by taking the IOLOCK of the + * orphanage and the child (scrub target). Caller must hold IOLOCK_EXCL on + * @sc->ip. Returns 0 if we took both locks, or a negative errno if we + * couldn't lock the orphanage in time. + */ +static inline int +xrep_dirtree_adopt_iolock( + struct xfs_scrub *sc) +{ + int error; + + ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL); + + if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + return 0; + + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + do { + xrep_orphanage_ilock(sc, XFS_IOLOCK_EXCL); + if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + break; + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + + if (xchk_should_terminate(sc, &error)) { + xchk_ilock(sc, XFS_IOLOCK_EXCL); + return error; + } + + delay(1); + } while (1); + + return 0; +} + +/* + * Reattach this orphaned directory to the orphanage. Do not call this with + * any resources held. Returns -ESTALE if the scan data have become out of + * date. + */ +STATIC int +xrep_dirtree_adopt( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + /* Take the IOLOCK of the orphanage and the scrub target. */ + error = xrep_dirtree_adopt_iolock(sc); + if (error) + return error; + + /* + * Set up for an adoption. The directory tree fixer runs after the + * link counts have been corrected. Therefore, we must bump the + * child's link count since there will be no further opportunity to fix + * errors. + */ + error = xrep_adoption_trans_alloc(sc, &dl->adoption); + if (error) + goto out_iolock; + dl->adoption.bump_child_nlink = true; + + /* Figure out what name we're going to use here. */ + error = xrep_adoption_compute_name(&dl->adoption, &dl->xname); + if (error) + goto out_trans; + + /* + * Now that we have a proposed name for the orphanage entry, create + * a faux path so that the live update hook will see it. + */ + mutex_lock(&dl->lock); + if (dl->stale) { + mutex_unlock(&dl->lock); + error = -ESTALE; + goto out_trans; + } + error = xrep_dirtree_create_adoption_path(dl); + mutex_unlock(&dl->lock); + if (error) + goto out_trans; + + /* Reparent the directory. */ + error = xrep_adoption_move(&dl->adoption); + if (error) + goto out_trans; + + /* + * Commit the name and release all inode locks except for the scrub + * target's IOLOCK. + */ + error = xrep_trans_commit(sc); + goto out_ilock; + +out_trans: + xchk_trans_cancel(sc); +out_ilock: + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); +out_iolock: + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + return error; +} + +/* + * This newly orphaned directory needs to be adopted by the orphanage. + * Make this happen. + */ +STATIC int +xrep_dirtree_move_to_orphanage( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + /* + * Start by dropping all the resources that we hold so that we can grab + * all the resources that we need for the adoption. + */ + mutex_unlock(&dl->lock); + xchk_trans_cancel(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* Perform the adoption. */ + error = xrep_dirtree_adopt(dl); + + /* + * Retake all the resources we had at the beginning even if the repair + * failed or the scan data are now stale. This keeps things simple for + * the caller. + */ + xchk_trans_alloc_empty(sc); + xchk_ilock(sc, XFS_ILOCK_EXCL); + mutex_lock(&dl->lock); + + if (!error && dl->stale) + error = -ESTALE; + return error; +} + +/* + * Try to fix all the problems. Returns -ESTALE if the scan data have become + * out of date. + */ +STATIC int +xrep_dirtree_fix_problems( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + int error; + + /* Delete all the paths we don't want. */ + xchk_dirtree_for_each_path(dl, path) { + if (path->outcome != XCHK_DIRPATH_DELETE) + continue; + + error = xrep_dirtree_delete_path(dl, path); + if (error) + return error; + } + + /* Reparent this directory to the orphanage. */ + if (oc->needs_adoption) { + if (xrep_orphanage_can_adopt(dl->sc)) + return xrep_dirtree_move_to_orphanage(dl); + return -EFSCORRUPTED; + } + + return 0; +} + +/* Fix directory loops involving this directory. */ +int +xrep_dirtree( + struct xfs_scrub *sc) +{ + struct xchk_dirtree *dl = sc->buf; + struct xchk_dirtree_outcomes oc; + int error; + + /* + * Prepare to fix the directory tree by retaking the scan lock. The + * order of resource acquisition is still IOLOCK -> transaction -> + * ILOCK -> scan lock. + */ + mutex_lock(&dl->lock); + do { + /* + * Decide what we're going to do, then do it. An -ESTALE + * return here means the scan results are invalid and we have + * to walk again. + */ + if (!dl->stale) { + xrep_dirtree_decide_fate(dl, &oc); + + trace_xrep_dirtree_decided_fate(dl, &oc); + + error = xrep_dirtree_fix_problems(dl, &oc); + if (!error || error != -ESTALE) + break; + } + error = xchk_dirtree_find_paths_to_root(dl); + if (error == -ELNRNG || error == -ENOSR) + error = -EFSCORRUPTED; + } while (!error); + mutex_unlock(&dl->lock); + + return error; +} diff --git a/fs/xfs/scrub/dqiterate.c b/fs/xfs/scrub/dqiterate.c new file mode 100644 index 000000000000..20c4daedd48d --- /dev/null +++ b/fs/xfs/scrub/dqiterate.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_bmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" + +/* Initialize a dquot iteration cursor. */ +void +xchk_dqiter_init( + struct xchk_dqiter *cursor, + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + cursor->sc = sc; + cursor->bmap.br_startoff = NULLFILEOFF; + cursor->dqtype = dqtype & XFS_DQTYPE_REC_MASK; + cursor->quota_ip = xfs_quota_inode(sc->mp, cursor->dqtype); + cursor->id = 0; +} + +/* + * Ensure that the cached data fork mapping for the dqiter cursor is fresh and + * covers the dquot pointed to by the scan cursor. + */ +STATIC int +xchk_dquot_iter_revalidate_bmap( + struct xchk_dqiter *cursor) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + xfs_dqid_t this_id = cursor->id; + int nmaps = 1; + int error; + + fileoff = this_id / qi->qi_dqperchunk; + + /* + * If we have a mapping for cursor->id and it's still fresh, there's + * no need to reread the bmbt. + */ + if (cursor->bmap.br_startoff != NULLFILEOFF && + cursor->if_seq == ifp->if_seq && + cursor->bmap.br_startoff + cursor->bmap.br_blockcount > fileoff) + return 0; + + /* Look up the data fork mapping for the dquot id of interest. */ + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, &nmaps, 0); + if (error) + return error; + if (!nmaps) { + ASSERT(nmaps > 0); + return -EFSCORRUPTED; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_revalidate_bmap(cursor, cursor->id); + return 0; +} + +/* Advance the dqiter cursor to the next non-sparse region of the quota file. */ +STATIC int +xchk_dquot_iter_advance_bmap( + struct xchk_dqiter *cursor, + uint64_t *next_ondisk_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + uint64_t next_id; + int nmaps = 1; + int error; + + /* Find the dquot id for the next non-hole mapping. */ + do { + fileoff = cursor->bmap.br_startoff + cursor->bmap.br_blockcount; + if (fileoff > XFS_DQ_ID_MAX / qi->qi_dqperchunk) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, + &nmaps, 0); + if (error) + return error; + if (!nmaps) { + /* Must have reached the end of the mappings. */ + *next_ondisk_id = -1ULL; + return 0; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + } while (!xfs_bmap_is_real_extent(&cursor->bmap)); + + next_id = cursor->bmap.br_startoff * qi->qi_dqperchunk; + if (next_id > XFS_DQ_ID_MAX) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + /* Propose jumping forward to the dquot in the next allocated block. */ + *next_ondisk_id = next_id; + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_advance_bmap(cursor, *next_ondisk_id); + return 0; +} + +/* + * Find the id of the next highest incore dquot. Normally this will correspond + * exactly with the quota file block mappings, but repair might have erased a + * mapping because it was crosslinked; in that case, we need to re-allocate the + * space so that we can reset q_blkno. + */ +STATIC void +xchk_dquot_iter_advance_incore( + struct xchk_dqiter *cursor, + uint64_t *next_incore_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, cursor->dqtype); + struct xfs_dquot *dq; + unsigned int nr_found; + + *next_incore_id = -1ULL; + + mutex_lock(&qi->qi_tree_lock); + nr_found = radix_tree_gang_lookup(tree, (void **)&dq, cursor->id, 1); + if (nr_found) + *next_incore_id = dq->q_id; + mutex_unlock(&qi->qi_tree_lock); + + trace_xchk_dquot_iter_advance_incore(cursor, *next_incore_id); +} + +/* + * Walk all incore dquots of this filesystem. Caller must set *@cursorp to + * zero before the first call, and must not hold the quota file ILOCK. + * Returns 1 and a valid *@dqpp; 0 and *@dqpp == NULL when there are no more + * dquots to iterate; or a negative errno. + */ +int +xchk_dquot_iter( + struct xchk_dqiter *cursor, + struct xfs_dquot **dqpp) +{ + struct xfs_mount *mp = cursor->sc->mp; + struct xfs_dquot *dq = NULL; + uint64_t next_ondisk, next_incore = -1ULL; + unsigned int lock_mode; + int error = 0; + + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + next_ondisk = cursor->id; + + /* Revalidate and/or advance the cursor. */ + lock_mode = xfs_ilock_data_map_shared(cursor->quota_ip); + error = xchk_dquot_iter_revalidate_bmap(cursor); + if (!error && !xfs_bmap_is_real_extent(&cursor->bmap)) + error = xchk_dquot_iter_advance_bmap(cursor, &next_ondisk); + xfs_iunlock(cursor->quota_ip, lock_mode); + if (error) + return error; + + if (next_ondisk > cursor->id) + xchk_dquot_iter_advance_incore(cursor, &next_incore); + + /* Pick the next dquot in the sequence and return it. */ + cursor->id = min(next_ondisk, next_incore); + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + + trace_xchk_dquot_iter(cursor, cursor->id); + + error = xfs_qm_dqget(mp, cursor->id, cursor->dqtype, false, &dq); + if (error) + return error; + + cursor->id = dq->q_id + 1; + *dqpp = dq; + return 1; +} diff --git a/fs/xfs/scrub/findparent.c b/fs/xfs/scrub/findparent.c new file mode 100644 index 000000000000..01766041ba2c --- /dev/null +++ b/fs/xfs/scrub/findparent.c @@ -0,0 +1,454 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_priv.h" +#include "xfs_trans_space.h" +#include "xfs_health.h" +#include "xfs_exchmaps.h" +#include "xfs_parent.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/iscan.h" +#include "scrub/findparent.h" +#include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/listxattr.h" + +/* + * Finding the Parent of a Directory + * ================================= + * + * Directories have parent pointers, in the sense that each directory contains + * a dotdot entry that points to the single allowed parent. The brute force + * way to find the parent of a given directory is to scan every directory in + * the filesystem looking for a child dirent that references this directory. + * + * This module wraps the process of scanning the directory tree. It requires + * that @sc->ip is the directory whose parent we want to find, and that the + * caller hold only the IOLOCK on that directory. The scan itself needs to + * take the ILOCK of each directory visited. + * + * Because we cannot hold @sc->ip's ILOCK during a scan of the whole fs, it is + * necessary to use dirent hook to update the parent scan results. Callers + * must not read the scan results without re-taking @sc->ip's ILOCK. + * + * There are a few shortcuts that we can take to avoid scanning the entire + * filesystem, such as noticing directory tree roots and querying the dentry + * cache for parent information. + */ + +struct xrep_findparent_info { + /* The directory currently being scanned. */ + struct xfs_inode *dp; + + /* + * Scrub context. We're looking for a @dp containing a directory + * entry pointing to sc->ip->i_ino. + */ + struct xfs_scrub *sc; + + /* Optional scan information for a xrep_findparent_scan call. */ + struct xrep_parent_scan_info *parent_scan; + + /* + * Parent that we've found for sc->ip. If we're scanning the entire + * directory tree, we need this to ensure that we only find /one/ + * parent directory. + */ + xfs_ino_t found_parent; + + /* + * This is set to true if @found_parent was not observed directly from + * the directory scan but by noticing a change in dotdot entries after + * cycling the sc->ip IOLOCK. + */ + bool parent_tentative; +}; + +/* + * If this directory entry points to the scrub target inode, then the directory + * we're scanning is the parent of the scrub target inode. + */ +STATIC int +xrep_findparent_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_findparent_info *fpi = priv; + int error = 0; + + if (xchk_should_terminate(fpi->sc, &error)) + return error; + + if (ino != fpi->sc->ip->i_ino) + return 0; + + /* Ignore garbage directory entry names. */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) + return -EFSCORRUPTED; + + /* + * Ignore dotdot and dot entries -- we're looking for parent -> child + * links only. + */ + if (name->name[0] == '.' && (name->len == 1 || + (name->len == 2 && name->name[1] == '.'))) + return 0; + + /* Uhoh, more than one parent for a dir? */ + if (fpi->found_parent != NULLFSINO && + !(fpi->parent_tentative && fpi->found_parent == fpi->dp->i_ino)) { + trace_xrep_findparent_dirent(fpi->sc->ip, 0); + return -EFSCORRUPTED; + } + + /* We found a potential parent; remember this. */ + trace_xrep_findparent_dirent(fpi->sc->ip, fpi->dp->i_ino); + fpi->found_parent = fpi->dp->i_ino; + fpi->parent_tentative = false; + + if (fpi->parent_scan) + xrep_findparent_scan_found(fpi->parent_scan, fpi->dp->i_ino); + + return 0; +} + +/* + * If this is a directory, walk the dirents looking for any that point to the + * scrub target inode. + */ +STATIC int +xrep_findparent_walk_directory( + struct xrep_findparent_info *fpi) +{ + struct xfs_scrub *sc = fpi->sc; + struct xfs_inode *dp = fpi->dp; + unsigned int lock_mode; + int error = 0; + + /* + * The inode being scanned cannot be its own parent, nor can any + * temporary directory we created to stage this repair. + */ + if (dp == sc->ip || dp == sc->tempip) + return 0; + + /* + * Similarly, temporary files created to stage a repair cannot be the + * parent of this inode. + */ + if (xrep_is_tempfile(dp)) + return 0; + + /* + * Scan the directory to see if there it contains an entry pointing to + * the directory that we are repairing. + */ + lock_mode = xfs_ilock_data_map_shared(dp); + + /* + * If this directory is known to be sick, we cannot scan it reliably + * and must abort. + */ + if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | + XFS_SICK_INO_BMBTD | + XFS_SICK_INO_DIR)) { + error = -EFSCORRUPTED; + goto out_unlock; + } + + /* + * We cannot complete our parent pointer scan if a directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + goto out_unlock; + } + + error = xchk_dir_walk(sc, dp, xrep_findparent_dirent, fpi); + if (error) + goto out_unlock; + +out_unlock: + xfs_iunlock(dp, lock_mode); + return error; +} + +/* + * Update this directory's dotdot pointer based on ongoing dirent updates. + */ +STATIC int +xrep_findparent_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xrep_parent_scan_info *pscan; + struct xfs_scrub *sc; + + pscan = container_of(nb, struct xrep_parent_scan_info, + dhook.dirent_hook.nb); + sc = pscan->sc; + + /* + * If @p->ip is the subdirectory that we're interested in and we've + * already scanned @p->dp, update the dotdot target inumber to the + * parent inode. + */ + if (p->ip->i_ino == sc->ip->i_ino && + xchk_iscan_want_live_update(&pscan->iscan, p->dp->i_ino)) { + if (p->delta > 0) { + xrep_findparent_scan_found(pscan, p->dp->i_ino); + } else { + xrep_findparent_scan_found(pscan, NULLFSINO); + } + } + + return NOTIFY_DONE; +} + +/* + * Set up a scan to find the parent of a directory. The provided dirent hook + * will be called when there is a dotdot update for the inode being repaired. + */ +int +__xrep_findparent_scan_start( + struct xfs_scrub *sc, + struct xrep_parent_scan_info *pscan, + notifier_fn_t custom_fn) +{ + int error; + + if (!(sc->flags & XCHK_FSGATES_DIRENTS)) { + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + return -EINVAL; + } + + pscan->sc = sc; + pscan->parent_ino = NULLFSINO; + + mutex_init(&pscan->lock); + + xchk_iscan_start(sc, 30000, 100, &pscan->iscan); + + /* + * Hook into the dirent update code. The hook only operates on inodes + * that were already scanned, and the scanner thread takes each inode's + * ILOCK, which means that any in-progress inode updates will finish + * before we can scan the inode. + */ + if (custom_fn) + xfs_dir_hook_setup(&pscan->dhook, custom_fn); + else + xfs_dir_hook_setup(&pscan->dhook, xrep_findparent_live_update); + error = xfs_dir_hook_add(sc->mp, &pscan->dhook); + if (error) + goto out_iscan; + + return 0; +out_iscan: + xchk_iscan_teardown(&pscan->iscan); + mutex_destroy(&pscan->lock); + return error; +} + +/* + * Scan the entire filesystem looking for a parent inode for the inode being + * scrubbed. @sc->ip must not be the root of a directory tree. Callers must + * not hold a dirty transaction or any lock that would interfere with taking + * an ILOCK. + * + * Returns 0 with @pscan->parent_ino set to the parent that we found. + * Returns 0 with @pscan->parent_ino set to NULLFSINO if we found no parents. + * Returns the usual negative errno if something else happened. + */ +int +xrep_findparent_scan( + struct xrep_parent_scan_info *pscan) +{ + struct xrep_findparent_info fpi = { + .sc = pscan->sc, + .found_parent = NULLFSINO, + .parent_scan = pscan, + }; + struct xfs_scrub *sc = pscan->sc; + int ret; + + ASSERT(S_ISDIR(VFS_IC(sc->ip)->i_mode)); + + while ((ret = xchk_iscan_iter(&pscan->iscan, &fpi.dp)) == 1) { + if (S_ISDIR(VFS_I(fpi.dp)->i_mode)) + ret = xrep_findparent_walk_directory(&fpi); + else + ret = 0; + xchk_iscan_mark_visited(&pscan->iscan, fpi.dp); + xchk_irele(sc, fpi.dp); + if (ret) + break; + + if (xchk_should_terminate(sc, &ret)) + break; + } + xchk_iscan_iter_finish(&pscan->iscan); + + return ret; +} + +/* Tear down a parent scan. */ +void +xrep_findparent_scan_teardown( + struct xrep_parent_scan_info *pscan) +{ + xfs_dir_hook_del(pscan->sc->mp, &pscan->dhook); + xchk_iscan_teardown(&pscan->iscan); + mutex_destroy(&pscan->lock); +} + +/* Finish a parent scan early. */ +void +xrep_findparent_scan_finish_early( + struct xrep_parent_scan_info *pscan, + xfs_ino_t ino) +{ + xrep_findparent_scan_found(pscan, ino); + xchk_iscan_finish_early(&pscan->iscan); +} + +/* + * Confirm that the directory @parent_ino actually contains a directory entry + * pointing to the child @sc->ip->ino. This function returns one of several + * ways: + * + * Returns 0 with @parent_ino unchanged if the parent was confirmed. + * Returns 0 with @parent_ino set to NULLFSINO if the parent was not valid. + * Returns the usual negative errno if something else happened. + */ +int +xrep_findparent_confirm( + struct xfs_scrub *sc, + xfs_ino_t *parent_ino) +{ + struct xrep_findparent_info fpi = { + .sc = sc, + .found_parent = NULLFSINO, + }; + int error; + + /* + * The root directory always points to itself. Unlinked dirs can point + * anywhere, so we point them at the root dir too. + */ + if (sc->ip == sc->mp->m_rootip || VFS_I(sc->ip)->i_nlink == 0) { + *parent_ino = sc->mp->m_sb.sb_rootino; + return 0; + } + + /* Reject garbage parent inode numbers and self-referential parents. */ + if (*parent_ino == NULLFSINO) + return 0; + if (!xfs_verify_dir_ino(sc->mp, *parent_ino) || + *parent_ino == sc->ip->i_ino) { + *parent_ino = NULLFSINO; + return 0; + } + + error = xchk_iget(sc, *parent_ino, &fpi.dp); + if (error) + return error; + + if (!S_ISDIR(VFS_I(fpi.dp)->i_mode)) { + *parent_ino = NULLFSINO; + goto out_rele; + } + + error = xrep_findparent_walk_directory(&fpi); + if (error) + goto out_rele; + + *parent_ino = fpi.found_parent; +out_rele: + xchk_irele(sc, fpi.dp); + return error; +} + +/* + * If we're the root of a directory tree, we are our own parent. If we're an + * unlinked directory, the parent /won't/ have a link to us. Set the parent + * directory to the root for both cases. Returns NULLFSINO if we don't know + * what to do. + */ +xfs_ino_t +xrep_findparent_self_reference( + struct xfs_scrub *sc) +{ + if (sc->ip->i_ino == sc->mp->m_sb.sb_rootino) + return sc->mp->m_sb.sb_rootino; + + if (VFS_I(sc->ip)->i_nlink == 0) + return sc->mp->m_sb.sb_rootino; + + return NULLFSINO; +} + +/* Check the dentry cache to see if knows of a parent for the scrub target. */ +xfs_ino_t +xrep_findparent_from_dcache( + struct xfs_scrub *sc) +{ + struct inode *pip = NULL; + struct dentry *dentry, *parent; + xfs_ino_t ret = NULLFSINO; + + dentry = d_find_alias(VFS_I(sc->ip)); + if (!dentry) + goto out; + + parent = dget_parent(dentry); + if (!parent) + goto out_dput; + + ASSERT(parent->d_sb == sc->ip->i_mount->m_super); + + pip = igrab(d_inode(parent)); + dput(parent); + + if (S_ISDIR(pip->i_mode)) { + trace_xrep_findparent_from_dcache(sc->ip, XFS_I(pip)->i_ino); + ret = XFS_I(pip)->i_ino; + } + + xchk_irele(sc, XFS_I(pip)); + +out_dput: + dput(dentry); +out: + return ret; +} diff --git a/fs/xfs/scrub/findparent.h b/fs/xfs/scrub/findparent.h new file mode 100644 index 000000000000..d998c7a88152 --- /dev/null +++ b/fs/xfs/scrub/findparent.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_FINDPARENT_H__ +#define __XFS_SCRUB_FINDPARENT_H__ + +struct xrep_parent_scan_info { + struct xfs_scrub *sc; + + /* Inode scan cursor. */ + struct xchk_iscan iscan; + + /* Hook to capture directory entry updates. */ + struct xfs_dir_hook dhook; + + /* Lock protecting parent_ino. */ + struct mutex lock; + + /* Parent inode that we've found. */ + xfs_ino_t parent_ino; + + bool lookup_parent; +}; + +int __xrep_findparent_scan_start(struct xfs_scrub *sc, + struct xrep_parent_scan_info *pscan, + notifier_fn_t custom_fn); +static inline int xrep_findparent_scan_start(struct xfs_scrub *sc, + struct xrep_parent_scan_info *pscan) +{ + return __xrep_findparent_scan_start(sc, pscan, NULL); +} +int xrep_findparent_scan(struct xrep_parent_scan_info *pscan); +void xrep_findparent_scan_teardown(struct xrep_parent_scan_info *pscan); + +static inline void +xrep_findparent_scan_found( + struct xrep_parent_scan_info *pscan, + xfs_ino_t ino) +{ + mutex_lock(&pscan->lock); + pscan->parent_ino = ino; + mutex_unlock(&pscan->lock); +} + +void xrep_findparent_scan_finish_early(struct xrep_parent_scan_info *pscan, + xfs_ino_t ino); + +int xrep_findparent_confirm(struct xfs_scrub *sc, xfs_ino_t *parent_ino); + +xfs_ino_t xrep_findparent_self_reference(struct xfs_scrub *sc); +xfs_ino_t xrep_findparent_from_dcache(struct xfs_scrub *sc); + +#endif /* __XFS_SCRUB_FINDPARENT_H__ */ diff --git a/fs/xfs/scrub/fsb_bitmap.h b/fs/xfs/scrub/fsb_bitmap.h new file mode 100644 index 000000000000..40b462c1dd0d --- /dev/null +++ b/fs/xfs/scrub/fsb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_FSB_BITMAP_H__ +#define __XFS_SCRUB_FSB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_fsblock_t */ + +struct xfsb_bitmap { + struct xbitmap64 fsbitmap; +}; + +static inline void xfsb_bitmap_init(struct xfsb_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->fsbitmap); +} + +static inline void xfsb_bitmap_destroy(struct xfsb_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->fsbitmap); +} + +static inline int xfsb_bitmap_set(struct xfsb_bitmap *bitmap, + xfs_fsblock_t start, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->fsbitmap, start, len); +} + +static inline int xfsb_bitmap_walk(struct xfsb_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->fsbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_FSB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 5799e9a94f1f..454f17595c9c 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -22,6 +22,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" +#include "scrub/fscounters.h" /* * FS Summary Counters @@ -48,17 +49,6 @@ * our tolerance for mismatch between expected and actual counter values. */ -struct xchk_fscounters { - struct xfs_scrub *sc; - uint64_t icount; - uint64_t ifree; - uint64_t fdblocks; - uint64_t frextents; - unsigned long long icount_min; - unsigned long long icount_max; - bool frozen; -}; - /* * Since the expected value computation is lockless but only browses incore * values, the percpu counters should be fairly close to each other. However, @@ -95,7 +85,7 @@ xchk_fscount_warmup( continue; /* Lock both AG headers. */ - error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); + error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp); if (error) break; error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp); @@ -235,14 +225,19 @@ xchk_setup_fscounters( * Pause all writer activity in the filesystem while we're scrubbing to * reduce the likelihood of background perturbations to the counters * throwing off our calculations. + * + * If we're repairing, we need to prevent any other thread from + * changing the global fs summary counters while we're repairing them. + * This requires the fs to be frozen, which will disable background + * reclaim and purge all inactive inodes. */ - if (sc->flags & XCHK_TRY_HARDER) { + if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) { error = xchk_fscounters_freeze(sc); if (error) return error; } - return xfs_trans_alloc_empty(sc->mp, &sc->tp); + return xchk_trans_alloc_empty(sc); } /* @@ -254,7 +249,9 @@ xchk_setup_fscounters( * set the INCOMPLETE flag even when a negative errno is returned. This care * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, * ECANCELED) that are absorbed into a scrub state flag update by - * xchk_*_process_error. + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information. */ /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ @@ -264,7 +261,7 @@ xchk_fscount_btreeblks( struct xchk_fscounters *fsc, xfs_agnumber_t agno) { - xfs_extlen_t blocks; + xfs_filblks_t blocks; int error; error = xchk_ag_init_existing(sc, agno, &sc->sa); @@ -415,10 +412,11 @@ xchk_fscount_count_frextents( int error; fsc->frextents = 0; + fsc->frextents_delayed = 0; if (!xfs_has_realtime(mp)) return 0; - xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_lock_shared(sc->mp, XFS_RBMLOCK_BITMAP); error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_fscount_add_frextent, fsc); if (error) { @@ -426,8 +424,10 @@ xchk_fscount_count_frextents( goto out_unlock; } + fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents); + out_unlock: - xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_unlock_shared(sc->mp, XFS_RBMLOCK_BITMAP); return error; } #else @@ -437,6 +437,7 @@ xchk_fscount_count_frextents( struct xchk_fscounters *fsc) { fsc->frextents = 0; + fsc->frextents_delayed = 0; return 0; } #endif /* CONFIG_XFS_RT */ @@ -482,6 +483,10 @@ xchk_fscount_within_range( if (curr_value == expected) return true; + /* We require exact matches when repair is running. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + return false; + min_value = min(old_value, curr_value); max_value = max(old_value, curr_value); @@ -516,7 +521,7 @@ xchk_fscounters( /* * If the filesystem is not frozen, the counter summation calls above - * can race with xfs_mod_freecounter, which subtracts a requested space + * can race with xfs_dec_freecounter, which subtracts a requested space * reservation from the counter and undoes the subtraction if that made * the counter go negative. Therefore, it's possible to see negative * values here, and we should only flag that as a corruption if we @@ -592,7 +597,7 @@ xchk_fscounters( } if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, - fsc->frextents)) { + fsc->frextents - fsc->frextents_delayed)) { if (fsc->frozen) xchk_set_corrupt(sc); else diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h new file mode 100644 index 000000000000..bcf56e1c36f9 --- /dev/null +++ b/fs/xfs/scrub/fscounters.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_FSCOUNTERS_H__ +#define __XFS_SCRUB_FSCOUNTERS_H__ + +struct xchk_fscounters { + struct xfs_scrub *sc; + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; + uint64_t frextents; + uint64_t frextents_delayed; + unsigned long long icount_min; + unsigned long long icount_max; + bool frozen; +}; + +#endif /* __XFS_SCRUB_FSCOUNTERS_H__ */ diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c new file mode 100644 index 000000000000..469bf645dbea --- /dev/null +++ b/fs/xfs/scrub/fscounters_repair.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/fscounters.h" + +/* + * FS Summary Counters + * =================== + * + * We correct errors in the filesystem summary counters by setting them to the + * values computed during the obligatory scrub phase. However, we must be + * careful not to allow any other thread to change the counters while we're + * computing and setting new values. To achieve this, we freeze the + * filesystem for the whole operation if the REPAIR flag is set. The checking + * function is stricter when we've frozen the fs. + */ + +/* + * Reset the superblock counters. Caller is responsible for freezing the + * filesystem during the calculation and reset phases. + */ +int +xrep_fscounters( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xchk_fscounters *fsc = sc->buf; + + /* + * Reinitialize the in-core counters from what we computed. We froze + * the filesystem, so there shouldn't be anyone else trying to modify + * these counters. + */ + if (!fsc->frozen) { + ASSERT(fsc->frozen); + return -EFSCORRUPTED; + } + + trace_xrep_reset_counters(mp, fsc); + + percpu_counter_set(&mp->m_icount, fsc->icount); + percpu_counter_set(&mp->m_ifree, fsc->ifree); + percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); + + /* + * Online repair is only supported on v5 file systems, which require + * lazy sb counters and thus no update of sb_fdblocks here. But as of + * now we don't support lazy counting sb_frextents yet, and thus need + * to also update it directly here. And for that we need to keep + * track of the delalloc reservations separately, as they are are + * subtracted from m_frextents, but not included in sb_frextents. + */ + percpu_counter_set(&mp->m_frextents, + fsc->frextents - fsc->frextents_delayed); + mp->m_sb.sb_frextents = fsc->frextents; + + return 0; +} diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 5e2b09ed6e29..b712a8bd34f5 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -10,12 +10,11 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" #include "xfs_ag.h" #include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/health.h" +#include "scrub/common.h" /* * Scrub and In-Core Filesystem Health Assessments @@ -107,6 +106,9 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA }, [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA }, [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS }, + [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK }, + [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS }, + [XFS_SCRUB_TYPE_DIRTREE] = { XHG_INO, XFS_SICK_INO_DIRTREE }, }; /* Return the health status mask for this scrub type. */ @@ -118,6 +120,56 @@ xchk_health_mask_for_scrub_type( } /* + * If the scrub state is clean, add @mask to the scrub sick mask to clear + * additional sick flags from the metadata object's sick state. + */ +void +xchk_mark_healthy_if_clean( + struct xfs_scrub *sc, + unsigned int mask) +{ + if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT))) + sc->sick_mask |= mask; +} + +/* + * If we're scrubbing a piece of file metadata for the first time, does it look + * like it has been zapped? Skip the check if we just repaired the metadata + * and are revalidating it. + */ +bool +xchk_file_looks_zapped( + struct xfs_scrub *sc, + unsigned int mask) +{ + ASSERT((mask & ~XFS_SICK_INO_ZAPPED) == 0); + + if (sc->flags & XREP_ALREADY_FIXED) + return false; + + return xfs_inode_has_sickness(sc->ip, mask); +} + +/* + * Scrub gave the filesystem a clean bill of health, so clear all the indirect + * markers of past problems (at least for the fs and ags) so that we can be + * healthy again. + */ +STATIC void +xchk_mark_all_healthy( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + + xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT); + xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT); + for_each_perag(mp, agno, pag) + xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT); +} + +/* * Update filesystem health assessments based on what we found and did. * * If the scrubber finds errors, we mark sick whatever's mentioned in @@ -134,6 +186,18 @@ xchk_update_health( struct xfs_perag *pag; bool bad; + /* + * The HEALTHY scrub type is a request from userspace to clear all the + * indirect flags after a clean scan of the entire filesystem. As such + * there's no sick flag defined for it, so we branch here ahead of the + * mask check. + */ + if (sc->sm->sm_type == XFS_SCRUB_TYPE_HEALTHY && + !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + xchk_mark_all_healthy(sc->mp); + return; + } + if (!sc->sick_mask) return; @@ -143,7 +207,7 @@ xchk_update_health( case XHG_AG: pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_ag_mark_sick(pag, sc->sick_mask); + xfs_ag_mark_corrupt(pag, sc->sick_mask); else xfs_ag_mark_healthy(pag, sc->sick_mask); xfs_perag_put(pag); @@ -151,20 +215,30 @@ xchk_update_health( case XHG_INO: if (!sc->ip) return; - if (bad) - xfs_inode_mark_sick(sc->ip, sc->sick_mask); - else + if (bad) { + unsigned int mask = sc->sick_mask; + + /* + * If we're coming in for repairs then we don't want + * sickness flags to propagate to the incore health + * status if the inode gets inactivated before we can + * fix it. + */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mask |= XFS_SICK_INO_FORGET; + xfs_inode_mark_corrupt(sc->ip, mask); + } else xfs_inode_mark_healthy(sc->ip, sc->sick_mask); break; case XHG_FS: if (bad) - xfs_fs_mark_sick(sc->mp, sc->sick_mask); + xfs_fs_mark_corrupt(sc->mp, sc->sick_mask); else xfs_fs_mark_healthy(sc->mp, sc->sick_mask); break; case XHG_RT: if (bad) - xfs_rt_mark_sick(sc->mp, sc->sick_mask); + xfs_rt_mark_corrupt(sc->mp, sc->sick_mask); else xfs_rt_mark_healthy(sc->mp, sc->sick_mask); break; @@ -175,13 +249,13 @@ xchk_update_health( } /* Is the given per-AG btree healthy enough for scanning? */ -bool -xchk_ag_btree_healthy_enough( +void +xchk_ag_btree_del_cursor_if_sick( struct xfs_scrub *sc, - struct xfs_perag *pag, - xfs_btnum_t btnum) + struct xfs_btree_cur **curp, + unsigned int sm_type) { - unsigned int mask = 0; + unsigned int mask = (*curp)->bc_ops->sick_mask; /* * We always want the cursor if it's the same type as whatever we're @@ -190,41 +264,8 @@ xchk_ag_btree_healthy_enough( * Otherwise, we're only interested in the btree for cross-referencing. * If we know the btree is bad then don't bother, just set XFAIL. */ - switch (btnum) { - case XFS_BTNUM_BNO: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT) - return true; - mask = XFS_SICK_AG_BNOBT; - break; - case XFS_BTNUM_CNT: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT) - return true; - mask = XFS_SICK_AG_CNTBT; - break; - case XFS_BTNUM_INO: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) - return true; - mask = XFS_SICK_AG_INOBT; - break; - case XFS_BTNUM_FINO: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT) - return true; - mask = XFS_SICK_AG_FINOBT; - break; - case XFS_BTNUM_RMAP: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT) - return true; - mask = XFS_SICK_AG_RMAPBT; - break; - case XFS_BTNUM_REFC: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT) - return true; - mask = XFS_SICK_AG_REFCNTBT; - break; - default: - ASSERT(0); - return true; - } + if (sc->sm->sm_type == sm_type) + return; /* * If we just repaired some AG metadata, sc->sick_mask will reflect all @@ -236,10 +277,42 @@ xchk_ag_btree_healthy_enough( type_to_health_flag[sc->sm->sm_type].group == XHG_AG) mask &= ~sc->sick_mask; - if (xfs_ag_has_sickness(pag, mask)) { + if (xfs_ag_has_sickness((*curp)->bc_ag.pag, mask)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; - return false; + xfs_btree_del_cursor(*curp, XFS_BTREE_NOERROR); + *curp = NULL; + } +} + +/* + * Quick scan to double-check that there isn't any evidence of lingering + * primary health problems. If we're still clear, then the health update will + * take care of clearing the indirect evidence. + */ +int +xchk_health_record( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag; + xfs_agnumber_t agno; + + unsigned int sick; + unsigned int checked; + + xfs_fs_measure_sickness(mp, &sick, &checked); + if (sick & XFS_SICK_FS_PRIMARY) + xchk_set_corrupt(sc); + + xfs_rt_measure_sickness(mp, &sick, &checked); + if (sick & XFS_SICK_RT_PRIMARY) + xchk_set_corrupt(sc); + + for_each_perag(mp, agno, pag) { + xfs_ag_measure_sickness(pag, &sick, &checked); + if (sick & XFS_SICK_AG_PRIMARY) + xchk_set_corrupt(sc); } - return true; + return 0; } diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h index 66a273f8585b..63fc426eb5ae 100644 --- a/fs/xfs/scrub/health.h +++ b/fs/xfs/scrub/health.h @@ -8,7 +8,10 @@ unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type); void xchk_update_health(struct xfs_scrub *sc); -bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, - xfs_btnum_t btnum); +void xchk_ag_btree_del_cursor_if_sick(struct xfs_scrub *sc, + struct xfs_btree_cur **curp, unsigned int sm_type); +void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask); +bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask); +int xchk_health_record(struct xfs_scrub *sc); #endif /* __XFS_SCRUB_HEALTH_H__ */ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index fb7bbf47ae5d..a59c44e5903a 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -76,7 +76,7 @@ xchk_inobt_xref_finobt( int has_record; int error; - ASSERT(cur->bc_btnum == XFS_BTNUM_FINO); + ASSERT(xfs_btree_is_fino(cur->bc_ops)); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); if (error) @@ -179,7 +179,7 @@ xchk_finobt_xref_inobt( int has_record; int error; - ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + ASSERT(xfs_btree_is_ino(cur->bc_ops)); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); if (error) @@ -514,7 +514,7 @@ xchk_iallocbt_rec_alignment( * Otherwise, we expect that the finobt record is aligned to the * cluster alignment as told by the superblock. */ - if (bs->cur->bc_btnum == XFS_BTNUM_FINO) { + if (xfs_btree_is_fino(bs->cur->bc_ops)) { unsigned int imask; imask = min_t(unsigned int, XFS_INODES_PER_CHUNK, @@ -585,7 +585,7 @@ xchk_iallocbt_rec( uint16_t holemask; xfs_inobt_btrec_to_irec(mp, rec, &irec); - if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) { + if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -649,12 +649,11 @@ out: */ STATIC void xchk_iallocbt_xref_rmap_btreeblks( - struct xfs_scrub *sc, - int which) + struct xfs_scrub *sc) { xfs_filblks_t blocks; - xfs_extlen_t inobt_blocks = 0; - xfs_extlen_t finobt_blocks = 0; + xfs_filblks_t inobt_blocks = 0; + xfs_filblks_t finobt_blocks = 0; int error; if (!sc->sa.ino_cur || !sc->sa.rmap_cur || @@ -688,7 +687,6 @@ xchk_iallocbt_xref_rmap_btreeblks( STATIC void xchk_iallocbt_xref_rmap_inodes( struct xfs_scrub *sc, - int which, unsigned long long inodes) { xfs_filblks_t blocks; @@ -708,11 +706,10 @@ xchk_iallocbt_xref_rmap_inodes( xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } -/* Scrub the inode btrees for some AG. */ -STATIC int +/* Scrub one of the inode btrees for some AG. */ +int xchk_iallocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xfs_btree_cur *cur; struct xchk_iallocbt iabt = { @@ -722,13 +719,24 @@ xchk_iallocbt( }; int error; - cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_INOBT: + cur = sc->sa.ino_cur; + break; + case XFS_SCRUB_TYPE_FINOBT: + cur = sc->sa.fino_cur; + break; + default: + ASSERT(0); + return -EIO; + } + error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT, &iabt); if (error) return error; - xchk_iallocbt_xref_rmap_btreeblks(sc, which); + xchk_iallocbt_xref_rmap_btreeblks(sc); /* * If we're scrubbing the inode btree, inode_blocks is the number of @@ -737,26 +745,11 @@ xchk_iallocbt( * knows about. We can't do this for the finobt since it only points * to inode chunks with free inodes. */ - if (which == XFS_BTNUM_INO) - xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes); - + if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) + xchk_iallocbt_xref_rmap_inodes(sc, iabt.inodes); return error; } -int -xchk_inobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_INO); -} - -int -xchk_finobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_FINO); -} - /* See if an inode btree has (or doesn't have) an inode chunk record. */ static inline void xchk_xref_inode_check( diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c new file mode 100644 index 000000000000..c8d2196a04e1 --- /dev/null +++ b/fs/xfs/scrub/ialloc_repair.c @@ -0,0 +1,884 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Btree Repair + * ================== + * + * A quick refresher of inode btrees on a v5 filesystem: + * + * - Inode records are read into memory in units of 'inode clusters'. However + * many inodes fit in a cluster buffer is the smallest number of inodes that + * can be allocated or freed. Clusters are never smaller than one fs block + * though they can span multiple blocks. The size (in fs blocks) is + * computed with xfs_icluster_size_fsb(). The fs block alignment of a + * cluster is computed with xfs_ialloc_cluster_alignment(). + * + * - Each inode btree record can describe a single 'inode chunk'. The chunk + * size is defined to be 64 inodes. If sparse inodes are enabled, every + * inobt record must be aligned to the chunk size; if not, every record must + * be aligned to the start of a cluster. It is possible to construct an XFS + * geometry where one inobt record maps to multiple inode clusters; it is + * also possible to construct a geometry where multiple inobt records map to + * different parts of one inode cluster. + * + * - If sparse inodes are not enabled, the smallest unit of allocation for + * inode records is enough to contain one inode chunk's worth of inodes. + * + * - If sparse inodes are enabled, the holemask field will be active. Each + * bit of the holemask represents 4 potential inodes; if set, the + * corresponding space does *not* contain inodes and must be left alone. + * Clusters cannot be smaller than 4 inodes. The smallest unit of allocation + * of inode records is one inode cluster. + * + * So what's the rebuild algorithm? + * + * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT + * records. The OWN_INOBT records are the old inode btree blocks and will be + * cleared out after we've rebuilt the tree. Each possible inode cluster + * within an OWN_INODES record will be read in; for each possible inobt record + * associated with that cluster, compute the freemask calculated from the + * i_mode data in the inode chunk. For sparse inodes the holemask will be + * calculated by creating the properly aligned inobt record and punching out + * any chunk that's missing. Inode allocations and frees grab the AGI first, + * so repair protects itself from concurrent access by locking the AGI. + * + * Once we've reconstructed all the inode records, we can create new inode + * btree roots and reload the btrees. We rebuild both inode trees at the same + * time because they have the same rmap owner and it would be more complex to + * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT + * blocks it owns. We have all the data we need to build both, so dump + * everything and start over. + * + * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once. + */ + +struct xrep_ibt { + /* Record under construction. */ + struct xfs_inobt_rec_incore rie; + + /* new inobt information */ + struct xrep_newbt new_inobt; + + /* new finobt information */ + struct xrep_newbt new_finobt; + + /* Old inode btree blocks we found in the rmap. */ + struct xagb_bitmap old_iallocbt_blocks; + + /* Reconstructed inode records. */ + struct xfarray *inode_records; + + struct xfs_scrub *sc; + + /* Number of inodes assigned disk space. */ + unsigned int icount; + + /* Number of inodes in use. */ + unsigned int iused; + + /* Number of finobt records needed. */ + unsigned int finobt_recs; + + /* get_records()'s position in the inode record array. */ + xfarray_idx_t array_cur; +}; + +/* + * Is this inode in use? If the inode is in memory we can tell from i_mode, + * otherwise we have to check di_mode in the on-disk buffer. We only care + * that the high (i.e. non-permission) bits of _mode are zero. This should be + * safe because repair keeps all AG headers locked until the end, and process + * trying to perform an inode allocation/free must lock the AGI. + * + * @cluster_ag_base is the inode offset of the cluster within the AG. + * @cluster_bp is the cluster buffer. + * @cluster_index is the inode offset within the inode cluster. + */ +STATIC int +xrep_ibt_check_ifree( + struct xrep_ibt *ri, + xfs_agino_t cluster_ag_base, + struct xfs_buf *cluster_bp, + unsigned int cluster_index, + bool *inuse) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_dinode *dip; + xfs_ino_t fsino; + xfs_agino_t agino; + xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno; + unsigned int cluster_buf_base; + unsigned int offset; + int error; + + agino = cluster_ag_base + cluster_index; + fsino = XFS_AGINO_TO_INO(mp, agno, agino); + + /* Inode uncached or half assembled, read disk buffer */ + cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base); + offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize; + if (offset >= BBTOB(cluster_bp->b_length)) + return -EFSCORRUPTED; + dip = xfs_buf_offset(cluster_bp, offset); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) + return -EFSCORRUPTED; + + if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino) + return -EFSCORRUPTED; + + /* Will the in-core inode tell us if it's in use? */ + error = xchk_inode_is_allocated(sc, agino, inuse); + if (!error) + return 0; + + *inuse = dip->di_mode != 0; + return 0; +} + +/* Stash the accumulated inobt record for rebuilding. */ +STATIC int +xrep_ibt_stash( + struct xrep_ibt *ri) +{ + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie); + if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL) + return -EFSCORRUPTED; + + if (ri->rie.ir_freecount > 0) + ri->finobt_recs++; + + trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie); + + error = xfarray_append(ri->inode_records, &ri->rie); + if (error) + return error; + + ri->rie.ir_startino = NULLAGINO; + return 0; +} + +/* + * Given an extent of inodes and an inode cluster buffer, calculate the + * location of the corresponding inobt record (creating it if necessary), + * then update the parts of the holemask and freemask of that record that + * correspond to the inode extent we were given. + * + * @cluster_ir_startino is the AG inode number of an inobt record that we're + * proposing to create for this inode cluster. If sparse inodes are enabled, + * we must round down to a chunk boundary to find the actual sparse record. + * @cluster_bp is the buffer of the inode cluster. + * @nr_inodes is the number of inodes to check from the cluster. + */ +STATIC int +xrep_ibt_cluster_record( + struct xrep_ibt *ri, + xfs_agino_t cluster_ir_startino, + struct xfs_buf *cluster_bp, + unsigned int nr_inodes) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + xfs_agino_t ir_startino; + unsigned int cluster_base; + unsigned int cluster_index; + int error = 0; + + ir_startino = cluster_ir_startino; + if (xfs_has_sparseinodes(mp)) + ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK); + cluster_base = cluster_ir_startino - ir_startino; + + /* + * If the accumulated inobt record doesn't map this cluster, add it to + * the list and reset it. + */ + if (ri->rie.ir_startino != NULLAGINO && + ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) { + error = xrep_ibt_stash(ri); + if (error) + return error; + } + + if (ri->rie.ir_startino == NULLAGINO) { + ri->rie.ir_startino = ir_startino; + ri->rie.ir_free = XFS_INOBT_ALL_FREE; + ri->rie.ir_holemask = 0xFFFF; + ri->rie.ir_count = 0; + } + + /* Record the whole cluster. */ + ri->icount += nr_inodes; + ri->rie.ir_count += nr_inodes; + ri->rie.ir_holemask &= ~xfs_inobt_maskn( + cluster_base / XFS_INODES_PER_HOLEMASK_BIT, + nr_inodes / XFS_INODES_PER_HOLEMASK_BIT); + + /* Which inodes within this cluster are free? */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + bool inuse = false; + + error = xrep_ibt_check_ifree(ri, cluster_ir_startino, + cluster_bp, cluster_index, &inuse); + if (error) + return error; + if (!inuse) + continue; + ri->iused++; + ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base + + cluster_index); + } + return 0; +} + +/* + * For each inode cluster covering the physical extent recorded by the rmapbt, + * we must calculate the properly aligned startino of that cluster, then + * iterate each cluster to fill in used and filled masks appropriately. We + * then use the (startino, used, filled) information to construct the + * appropriate inode records. + */ +STATIC int +xrep_ibt_process_cluster( + struct xrep_ibt *ri, + xfs_agblock_t cluster_bno) +{ + struct xfs_imap imap; + struct xfs_buf *cluster_bp; + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t cluster_ag_base; + xfs_agino_t irec_index; + unsigned int nr_inodes; + int error; + + nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster, + XFS_INODES_PER_CHUNK); + + /* + * Grab the inode cluster buffer. This is safe to do with a broken + * inobt because imap_to_bp directly maps the buffer without touching + * either inode btree. + */ + imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno); + imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); + imap.im_boffset = 0; + error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp); + if (error) + return error; + + /* + * Record the contents of each possible inobt record mapping this + * cluster. + */ + cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno); + for (irec_index = 0; + irec_index < igeo->inodes_per_cluster; + irec_index += XFS_INODES_PER_CHUNK) { + error = xrep_ibt_cluster_record(ri, + cluster_ag_base + irec_index, cluster_bp, + nr_inodes); + if (error) + break; + + } + + xfs_trans_brelse(sc->tp, cluster_bp); + return error; +} + +/* Check for any obvious conflicts in the inode chunk extent. */ +STATIC int +xrep_ibt_check_inode_ext( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t agino; + enum xbtree_recpacking outcome; + int error; + + /* Inode records must be within the AG. */ + if (!xfs_verify_agbext(sc->sa.pag, agbno, len)) + return -EFSCORRUPTED; + + /* The entire record must align to the inode cluster size. */ + if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) || + !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster)) + return -EFSCORRUPTED; + + /* + * The entire record must also adhere to the inode cluster alignment + * size if sparse inodes are not enabled. + */ + if (!xfs_has_sparseinodes(mp) && + (!IS_ALIGNED(agbno, igeo->cluster_align) || + !IS_ALIGNED(agbno + len, igeo->cluster_align))) + return -EFSCORRUPTED; + + /* + * On a sparse inode fs, this cluster could be part of a sparse chunk. + * Sparse clusters must be aligned to sparse chunk alignment. + */ + if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align && + (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) || + !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align))) + return -EFSCORRUPTED; + + /* Make sure the entire range of blocks are valid AG inodes. */ + agino = XFS_AGB_TO_AGINO(mp, agbno); + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1; + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Found a fragment of the old inode btrees; dispose of them later. */ +STATIC int +xrep_ibt_record_old_btree_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock, + rec->rm_blockcount); +} + +/* Record extents that belong to inode cluster blocks. */ +STATIC int +xrep_ibt_record_inode_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + struct xfs_mount *mp = ri->sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t cluster_base; + int error; + + error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock, + rec->rm_blockcount); + if (error) + return error; + + trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno, + rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, + rec->rm_offset, rec->rm_flags); + + /* + * Record the free/hole masks for each inode cluster that could be + * mapped by this rmap record. + */ + for (cluster_base = 0; + cluster_base < rec->rm_blockcount; + cluster_base += igeo->blocks_per_cluster) { + error = xrep_ibt_process_cluster(ri, + rec->rm_startblock + cluster_base); + if (error) + return error; + } + + return 0; +} + +STATIC int +xrep_ibt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_ibt *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + switch (rec->rm_owner) { + case XFS_RMAP_OWN_INOBT: + return xrep_ibt_record_old_btree_blocks(ri, rec); + case XFS_RMAP_OWN_INODES: + return xrep_ibt_record_inode_blocks(ri, rec); + } + return 0; +} + +/* + * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode + * btrees (OWN_INOBT). Figure out if we have enough free space to reconstruct + * the inode btrees. The caller must clean up the lists if anything goes + * wrong. + */ +STATIC int +xrep_ibt_find_inodes( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + ri->rie.ir_startino = NULLAGINO; + + /* Collect all reverse mappings for inode blocks. */ + xrep_ag_btcur_init(sc, &sc->sa); + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri); + xchk_ag_btcur_free(&sc->sa); + if (error) + return error; + + /* If we have a record ready to go, add it to the array. */ + if (ri->rie.ir_startino != NULLAGINO) + return xrep_ibt_stash(ri); + + return 0; +} + +/* Update the AGI counters. */ +STATIC int +xrep_ibt_reset_counters( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + unsigned int freecount = ri->icount - ri->iused; + + /* Trigger inode count recalculation */ + xfs_force_summary_recalc(sc->mp); + + /* + * The AGI header contains extra information related to the inode + * btrees, so we must update those fields here. + */ + agi->agi_count = cpu_to_be32(ri->icount); + agi->agi_freecount = cpu_to_be32(freecount); + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, + XFS_AGI_COUNT | XFS_AGI_FREECOUNT); + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagi(sc); +} + +/* Retrieve finobt data for bulk load. */ +STATIC int +xrep_fibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(ri->inode_records, + ri->array_cur++, irec); + } while (error == 0 && xfs_inobt_rec_freecount(irec) == 0); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Retrieve inobt data for bulk load. */ +STATIC int +xrep_ibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(ri->inode_records, ri->array_cur++, irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new inobt blocks to the bulk loader. */ +STATIC int +xrep_ibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr); +} + +/* Feed one of the new finobt blocks to the bulk loader. */ +STATIC int +xrep_fibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr); +} + +/* Make sure the records do not overlap in inumber address space. */ +STATIC int +xrep_ibt_check_overlap( + struct xrep_ibt *ri) +{ + struct xfs_inobt_rec_incore irec; + xfarray_idx_t cur; + xfs_agino_t next_agino = 0; + int error = 0; + + foreach_xfarray_idx(ri->inode_records, cur) { + if (xchk_should_terminate(ri->sc, &error)) + return error; + + error = xfarray_load(ri->inode_records, cur, &irec); + if (error) + return error; + + if (irec.ir_startino < next_agino) + return -EFSCORRUPTED; + + next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK; + } + + return error; +} + +/* Build new inode btrees and dispose of the old one. */ +STATIC int +xrep_ibt_build_new_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_btree_cur *ino_cur; + struct xfs_btree_cur *fino_cur = NULL; + xfs_fsblock_t fsbno; + bool need_finobt; + int error; + + need_finobt = xfs_has_finobt(sc->mp); + + /* + * Create new btrees for staging all the inobt records we collected + * earlier. The records were collected in order of increasing agino, + * so we do not have to sort them. Ensure there are no overlapping + * records. + */ + error = xrep_ibt_check_overlap(ri); + if (error) + return error; + + /* + * The new inode btrees will not be rooted in the AGI until we've + * successfully rebuilt the tree. + * + * Start by setting up the inobt staging cursor. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_IBT_BLOCK(sc->mp)); + xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno, + XFS_AG_RESV_NONE); + ri->new_inobt.bload.claim_block = xrep_ibt_claim_block; + ri->new_inobt.bload.get_records = xrep_ibt_get_records; + + ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL); + xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake); + error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload, + xfarray_length(ri->inode_records)); + if (error) + goto err_inocur; + + /* Set up finobt staging cursor. */ + if (need_finobt) { + enum xfs_ag_resv_type resv = XFS_AG_RESV_METADATA; + + if (sc->mp->m_finobt_nores) + resv = XFS_AG_RESV_NONE; + + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_FIBT_BLOCK(sc->mp)); + xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT, + fsbno, resv); + ri->new_finobt.bload.claim_block = xrep_fibt_claim_block; + ri->new_finobt.bload.get_records = xrep_fibt_get_records; + + fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL); + xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake); + error = xfs_btree_bload_compute_geometry(fino_cur, + &ri->new_finobt.bload, ri->finobt_recs); + if (error) + goto err_finocur; + } + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_finocur; + + /* Reserve all the space we need to build the new btrees. */ + error = xrep_newbt_alloc_blocks(&ri->new_inobt, + ri->new_inobt.bload.nr_blocks); + if (error) + goto err_finocur; + + if (need_finobt) { + error = xrep_newbt_alloc_blocks(&ri->new_finobt, + ri->new_finobt.bload.nr_blocks); + if (error) + goto err_finocur; + } + + /* Add all inobt records. */ + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri); + if (error) + goto err_finocur; + + /* Add all finobt records. */ + if (need_finobt) { + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri); + if (error) + goto err_finocur; + } + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(ino_cur, 0); + + if (fino_cur) { + xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(fino_cur, 0); + } + + /* Reset the AGI counters now that we've changed the inode roots. */ + error = xrep_ibt_reset_counters(ri); + if (error) + goto err_finobt; + + /* Free unused blocks and bitmap. */ + if (need_finobt) { + error = xrep_newbt_commit(&ri->new_finobt); + if (error) + goto err_inobt; + } + error = xrep_newbt_commit(&ri->new_inobt); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_finocur: + if (need_finobt) + xfs_btree_del_cursor(fino_cur, error); +err_inocur: + xfs_btree_del_cursor(ino_cur, error); +err_finobt: + if (need_finobt) + xrep_newbt_cancel(&ri->new_finobt); +err_inobt: + xrep_newbt_cancel(&ri->new_inobt); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_ibt_remove_old_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + /* + * Free the old inode btree blocks if they're not in use. It's ok to + * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG + * reservation because we reset the reservation before releasing the + * AGI and AGF header buffer locks. + */ + error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks, + &XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE); + if (error) + return error; + + /* + * If the finobt is enabled and has a per-AG reservation, make sure we + * reinitialize the per-AG reservations. + */ + if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores) + sc->flags |= XREP_RESET_PERAG_RESV; + + return 0; +} + +/* Repair both inode btrees. */ +int +xrep_iallocbt( + struct xfs_scrub *sc) +{ + struct xrep_ibt *ri; + struct xfs_mount *mp = sc->mp; + char *descr; + xfs_agino_t first_agino, last_agino; + int error = 0; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS); + if (!ri) + return -ENOMEM; + ri->sc = sc; + + /* We rebuild both inode btrees. */ + sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT; + + /* Set up enough storage to handle an AG with nothing but inodes. */ + xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino); + last_agino /= XFS_INODES_PER_CHUNK; + descr = xchk_xfile_ag_descr(sc, "inode index records"); + error = xfarray_create(descr, last_agino, + sizeof(struct xfs_inobt_rec_incore), + &ri->inode_records); + kfree(descr); + if (error) + goto out_ri; + + /* Collect the inode data and find the old btree blocks. */ + xagb_bitmap_init(&ri->old_iallocbt_blocks); + error = xrep_ibt_find_inodes(ri); + if (error) + goto out_bitmap; + + /* Rebuild the inode indexes. */ + error = xrep_ibt_build_new_trees(ri); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_ibt_remove_old_trees(ri); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ri->old_iallocbt_blocks); + xfarray_destroy(ri->inode_records); +out_ri: + kfree(ri); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_iallocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. + */ + sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT; + error = xchk_iallocbt(sc); + if (error) + goto out; + + if (xfs_has_finobt(sc->mp)) { + sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT; + error = xchk_iallocbt(sc); + } + +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/ino_bitmap.h b/fs/xfs/scrub/ino_bitmap.h new file mode 100644 index 000000000000..1300833679ab --- /dev/null +++ b/fs/xfs/scrub/ino_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_INO_BITMAP_H__ +#define __XFS_SCRUB_INO_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_ino_t */ + +struct xino_bitmap { + struct xbitmap64 inobitmap; +}; + +static inline void xino_bitmap_init(struct xino_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->inobitmap); +} + +static inline void xino_bitmap_destroy(struct xino_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->inobitmap); +} + +static inline int xino_bitmap_set(struct xino_bitmap *bitmap, xfs_ino_t ino) +{ + return xbitmap64_set(&bitmap->inobitmap, ino, 1); +} + +static inline int xino_bitmap_test(struct xino_bitmap *bitmap, xfs_ino_t ino) +{ + uint64_t len = 1; + + return xbitmap64_test(&bitmap->inobitmap, ino, &len); +} + +#endif /* __XFS_SCRUB_INO_BITMAP_H__ */ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index d03de74fd76f..d32716fb2fec 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -20,10 +20,12 @@ #include "xfs_reflink.h" #include "xfs_rmap.h" #include "xfs_bmap_util.h" +#include "xfs_rtbitmap.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "scrub/repair.h" /* Prepare the attached inode for scrubbing. */ static inline int @@ -38,6 +40,10 @@ xchk_prepare_iscrub( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL); return 0; } @@ -180,8 +186,11 @@ xchk_setup_inode( * saying the inode is allocated and the icache being unable to load * the inode until we can flag the corruption in xchk_inode. The * scrub function has to note the corruption, since we're not really - * supposed to do that from the setup function. + * supposed to do that from the setup function. Save the mapping to + * make repairs to the ondisk inode buffer. */ + if (xchk_could_repair(sc)) + xrep_setup_inode(sc, &imap); return 0; out_cancel: @@ -225,7 +234,7 @@ xchk_inode_extsize( */ if ((flags & XFS_DIFLAG_RTINHERIT) && (flags & XFS_DIFLAG_EXTSZINHERIT) && - value % sc->mp->m_sb.sb_rextsize > 0) + xfs_extlen_to_rtxmod(sc->mp, value) > 0) xchk_ino_set_warning(sc, ino); } @@ -551,7 +560,7 @@ xchk_dinode( } /* di_forkoff */ - if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) + if (XFS_DFORK_BOFF(dip) >= mp->m_sb.sb_inodesize) xchk_ino_set_corrupt(sc, ino); if (naextents != 0 && dip->di_forkoff == 0) xchk_ino_set_corrupt(sc, ino); @@ -730,6 +739,23 @@ xchk_inode_check_reflink_iflag( xchk_ino_set_corrupt(sc, ino); } +/* + * If this inode has zero link count, it must be on the unlinked list. If + * it has nonzero link count, it must not be on the unlinked list. + */ +STATIC void +xchk_inode_check_unlinked( + struct xfs_scrub *sc) +{ + if (VFS_I(sc->ip)->i_nlink == 0) { + if (!xfs_inode_on_unlinked_list(sc->ip)) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } else { + if (xfs_inode_on_unlinked_list(sc->ip)) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } +} + /* Scrub an inode. */ int xchk_inode( @@ -762,6 +788,8 @@ xchk_inode( if (S_ISREG(VFS_I(sc->ip)->i_mode)) xchk_inode_check_reflink_iflag(sc, sc->ip->i_ino); + xchk_inode_check_unlinked(sc); + xchk_inode_xref(sc, sc->ip->i_ino, &di); out: return error; diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c new file mode 100644 index 000000000000..3e45b9b72312 --- /dev/null +++ b/fs/xfs/scrub/inode_repair.c @@ -0,0 +1,1901 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" +#include "xfs_ialloc.h" +#include "xfs_da_format.h" +#include "xfs_reflink.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_quota_defs.h" +#include "xfs_quota.h" +#include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "xfs_attr_leaf.h" +#include "xfs_log_priv.h" +#include "xfs_health.h" +#include "xfs_symlink_remote.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/iscan.h" +#include "scrub/readdir.h" +#include "scrub/tempfile.h" + +/* + * Inode Record Repair + * =================== + * + * Roughly speaking, inode problems can be classified based on whether or not + * they trip the dinode verifiers. If those trip, then we won't be able to + * xfs_iget ourselves the inode. + * + * Therefore, the xrep_dinode_* functions fix anything that will cause the + * inode buffer verifier or the dinode verifier. The xrep_inode_* functions + * fix things on live incore inodes. The inode repair functions make decisions + * with security and usability implications when reviving a file: + * + * - Files with zero di_mode or a garbage di_mode are converted to regular file + * that only root can read. This file may not actually contain user data, + * if the file was not previously a regular file. Setuid and setgid bits + * are cleared. + * + * - Zero-size directories can be truncated to look empty. It is necessary to + * run the bmapbtd and directory repair functions to fully rebuild the + * directory. + * + * - Zero-size symbolic link targets can be truncated to '?'. It is necessary + * to run the bmapbtd and symlink repair functions to salvage the symlink. + * + * - Invalid extent size hints will be removed. + * + * - Quotacheck will be scheduled if we repaired an inode that was so badly + * damaged that the ondisk inode had to be rebuilt. + * + * - Invalid user, group, or project IDs (aka -1U) will be reset to zero. + * Setuid and setgid bits are cleared. + * + * - Data and attr forks are reset to extents format with zero extents if the + * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta + * repair functions to recover the space mapping. + * + * - ACLs will not be recovered if the attr fork is zapped or the extended + * attribute structure itself requires salvaging. + * + * - If the attr fork is zapped, the user and group ids are reset to root and + * the setuid and setgid bits are removed. + */ + +/* + * All the information we need to repair the ondisk inode if we can't iget the + * incore inode. We don't allocate this buffer unless we're going to perform + * a repair to the ondisk inode cluster buffer. + */ +struct xrep_inode { + /* Inode mapping that we saved from the initial lookup attempt. */ + struct xfs_imap imap; + + struct xfs_scrub *sc; + + /* Blocks in use on the data device by data extents or bmbt blocks. */ + xfs_rfsblock_t data_blocks; + + /* Blocks in use on the rt device. */ + xfs_rfsblock_t rt_blocks; + + /* Blocks in use by the attr fork. */ + xfs_rfsblock_t attr_blocks; + + /* Number of data device extents for the data fork. */ + xfs_extnum_t data_extents; + + /* + * Number of realtime device extents for the data fork. If + * data_extents and rt_extents indicate that the data fork has extents + * on both devices, we'll just back away slowly. + */ + xfs_extnum_t rt_extents; + + /* Number of (data device) extents for the attr fork. */ + xfs_aextnum_t attr_extents; + + /* Sick state to set after zapping parts of the inode. */ + unsigned int ino_sick_mask; + + /* Must we remove all access from this file? */ + bool zap_acls; + + /* Inode scanner to see if we can find the ftype from dirents */ + struct xchk_iscan ftype_iscan; + uint8_t alleged_ftype; +}; + +/* + * Setup function for inode repair. @imap contains the ondisk inode mapping + * information so that we can correct the ondisk inode cluster buffer if + * necessary to make iget work. + */ +int +xrep_setup_inode( + struct xfs_scrub *sc, + const struct xfs_imap *imap) +{ + struct xrep_inode *ri; + + sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + ri = sc->buf; + memcpy(&ri->imap, imap, sizeof(struct xfs_imap)); + ri->sc = sc; + return 0; +} + +/* + * Make sure this ondisk inode can pass the inode buffer verifier. This is + * not the same as the dinode verifier. + */ +STATIC void +xrep_dinode_buf_core( + struct xfs_scrub *sc, + struct xfs_buf *bp, + unsigned int ioffset) +{ + struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset); + struct xfs_trans *tp = sc->tp; + struct xfs_mount *mp = sc->mp; + xfs_agino_t agino; + bool crc_ok = false; + bool magic_ok = false; + bool unlinked_ok = false; + + agino = be32_to_cpu(dip->di_next_unlinked); + + if (xfs_verify_agino_or_null(bp->b_pag, agino)) + unlinked_ok = true; + + if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + xfs_dinode_good_version(mp, dip->di_version)) + magic_ok = true; + + if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF)) + crc_ok = true; + + if (magic_ok && unlinked_ok && crc_ok) + return; + + if (!magic_ok) { + dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + dip->di_version = 3; + } + if (!unlinked_ok) + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + xfs_dinode_calc_crc(mp, dip); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); + xfs_trans_log_buf(tp, bp, ioffset, + ioffset + sizeof(struct xfs_dinode) - 1); +} + +/* Make sure this inode cluster buffer can pass the inode buffer verifier. */ +STATIC void +xrep_dinode_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = sc->mp; + int i; + int ni; + + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) + xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog); +} + +/* Reinitialize things that never change in an inode. */ +STATIC void +xrep_dinode_header( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + trace_xrep_dinode_header(sc, dip); + + dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + if (!xfs_dinode_good_version(sc->mp, dip->di_version)) + dip->di_version = 3; + dip->di_ino = cpu_to_be64(sc->sm->sm_ino); + uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid); + dip->di_gen = cpu_to_be32(sc->sm->sm_gen); +} + +/* + * If this directory entry points to the scrub target inode, then the directory + * we're scanning is the parent of the scrub target inode. + */ +STATIC int +xrep_dinode_findmode_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_inode *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + if (ino != sc->sm->sm_ino) + return 0; + + /* Ignore garbage directory entry names. */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) + return -EFSCORRUPTED; + + /* Don't pick up dot or dotdot entries; we only want child dirents. */ + if (xfs_dir2_samename(name, &xfs_name_dotdot) || + xfs_dir2_samename(name, &xfs_name_dot)) + return 0; + + /* + * Uhoh, more than one parent for this inode and they don't agree on + * the file type? + */ + if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN && + ri->alleged_ftype != name->type) { + trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type, + ri->alleged_ftype); + return -EFSCORRUPTED; + } + + /* We found a potential parent; remember the ftype. */ + trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type); + ri->alleged_ftype = name->type; + return 0; +} + +/* Try to lock a directory, or wait a jiffy. */ +static inline int +xrep_dinode_ilock_nowait( + struct xfs_inode *dp, + unsigned int lock_mode) +{ + if (xfs_ilock_nowait(dp, lock_mode)) + return true; + + schedule_timeout_killable(1); + return false; +} + +/* + * Try to lock a directory to look for ftype hints. Since we already hold the + * AGI buffer, we cannot block waiting for the ILOCK because rename can take + * the ILOCK and then try to lock AGIs. + */ +STATIC int +xrep_dinode_trylock_directory( + struct xrep_inode *ri, + struct xfs_inode *dp, + unsigned int *lock_modep) +{ + unsigned long deadline = jiffies + msecs_to_jiffies(30000); + unsigned int lock_mode; + int error = 0; + + do { + if (xchk_should_terminate(ri->sc, &error)) + return error; + + if (xfs_need_iread_extents(&dp->i_df)) + lock_mode = XFS_ILOCK_EXCL; + else + lock_mode = XFS_ILOCK_SHARED; + + if (xrep_dinode_ilock_nowait(dp, lock_mode)) { + *lock_modep = lock_mode; + return 0; + } + } while (!time_is_before_jiffies(deadline)); + return -EBUSY; +} + +/* + * If this is a directory, walk the dirents looking for any that point to the + * scrub target inode. + */ +STATIC int +xrep_dinode_findmode_walk_directory( + struct xrep_inode *ri, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = ri->sc; + unsigned int lock_mode; + int error = 0; + + /* Ignore temporary repair directories. */ + if (xrep_is_tempfile(dp)) + return 0; + + /* + * Scan the directory to see if there it contains an entry pointing to + * the directory that we are repairing. + */ + error = xrep_dinode_trylock_directory(ri, dp, &lock_mode); + if (error) + return error; + + /* + * If this directory is known to be sick, we cannot scan it reliably + * and must abort. + */ + if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | + XFS_SICK_INO_BMBTD | + XFS_SICK_INO_DIR)) { + error = -EFSCORRUPTED; + goto out_unlock; + } + + /* + * We cannot complete our parent pointer scan if a directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + goto out_unlock; + } + + error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri); + if (error) + goto out_unlock; + +out_unlock: + xfs_iunlock(dp, lock_mode); + return error; +} + +/* + * Try to find the mode of the inode being repaired by looking for directories + * that point down to this file. + */ +STATIC int +xrep_dinode_find_mode( + struct xrep_inode *ri, + uint16_t *mode) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_inode *dp; + int error; + + /* No ftype means we have no other metadata to consult. */ + if (!xfs_has_ftype(sc->mp)) { + *mode = S_IFREG; + return 0; + } + + /* + * Scan all directories for parents that might point down to this + * inode. Skip the inode being repaired during the scan since it + * cannot be its own parent. Note that we still hold the AGI locked + * so there's a real possibility that _iscan_iter can return EBUSY. + */ + xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); + xchk_iscan_set_agi_trylock(&ri->ftype_iscan); + ri->ftype_iscan.skip_ino = sc->sm->sm_ino; + ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; + while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { + if (S_ISDIR(VFS_I(dp)->i_mode)) + error = xrep_dinode_findmode_walk_directory(ri, dp); + xchk_iscan_mark_visited(&ri->ftype_iscan, dp); + xchk_irele(sc, dp); + if (error < 0) + break; + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&ri->ftype_iscan); + xchk_iscan_teardown(&ri->ftype_iscan); + + if (error == -EBUSY) { + if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) { + /* + * If we got an EBUSY after finding at least one + * dirent, that means the scan found an inode on the + * inactivation list and could not open it. Accept the + * alleged ftype and install a new mode below. + */ + error = 0; + } else if (!(sc->flags & XCHK_TRY_HARDER)) { + /* + * Otherwise, retry the operation one time to see if + * the reason for the delay is an inode from the same + * cluster buffer waiting on the inactivation list. + */ + error = -EDEADLOCK; + } + } + if (error) + return error; + + /* + * Convert the discovered ftype into the file mode. If all else fails, + * return S_IFREG. + */ + switch (ri->alleged_ftype) { + case XFS_DIR3_FT_DIR: + *mode = S_IFDIR; + break; + case XFS_DIR3_FT_WHT: + case XFS_DIR3_FT_CHRDEV: + *mode = S_IFCHR; + break; + case XFS_DIR3_FT_BLKDEV: + *mode = S_IFBLK; + break; + case XFS_DIR3_FT_FIFO: + *mode = S_IFIFO; + break; + case XFS_DIR3_FT_SOCK: + *mode = S_IFSOCK; + break; + case XFS_DIR3_FT_SYMLINK: + *mode = S_IFLNK; + break; + default: + *mode = S_IFREG; + break; + } + return 0; +} + +/* Turn di_mode into /something/ recognizable. Returns true if we succeed. */ +STATIC int +xrep_dinode_mode( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + uint16_t mode = be16_to_cpu(dip->di_mode); + int error; + + trace_xrep_dinode_mode(sc, dip); + + if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) + return 0; + + /* Try to fix the mode. If we cannot, then leave everything alone. */ + error = xrep_dinode_find_mode(ri, &mode); + switch (error) { + case -EINTR: + case -EBUSY: + case -EDEADLOCK: + /* temporary failure or fatal signal */ + return error; + case 0: + /* found mode */ + break; + default: + /* some other error, assume S_IFREG */ + mode = S_IFREG; + break; + } + + /* bad mode, so we set it to a file that only root can read */ + dip->di_mode = cpu_to_be16(mode); + dip->di_uid = 0; + dip->di_gid = 0; + ri->zap_acls = true; + return 0; +} + +/* Fix unused link count fields having nonzero values. */ +STATIC void +xrep_dinode_nlinks( + struct xfs_dinode *dip) +{ + if (dip->di_version > 1) + dip->di_onlink = 0; + else + dip->di_nlink = 0; +} + +/* Fix any conflicting flags that the verifiers complain about. */ +STATIC void +xrep_dinode_flags( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + bool isrt) +{ + struct xfs_mount *mp = sc->mp; + uint64_t flags2 = be64_to_cpu(dip->di_flags2); + uint16_t flags = be16_to_cpu(dip->di_flags); + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_flags(sc, dip); + + if (isrt) + flags |= XFS_DIFLAG_REALTIME; + else + flags &= ~XFS_DIFLAG_REALTIME; + + /* + * For regular files on a reflink filesystem, set the REFLINK flag to + * protect shared extents. A later stage will actually check those + * extents and clear the flag if possible. + */ + if (xfs_has_reflink(mp) && S_ISREG(mode)) + flags2 |= XFS_DIFLAG2_REFLINK; + else + flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); + if (flags & XFS_DIFLAG_REALTIME) + flags2 &= ~XFS_DIFLAG2_REFLINK; + if (!xfs_has_bigtime(mp)) + flags2 &= ~XFS_DIFLAG2_BIGTIME; + if (!xfs_has_large_extent_counts(mp)) + flags2 &= ~XFS_DIFLAG2_NREXT64; + if (flags2 & XFS_DIFLAG2_NREXT64) + dip->di_nrext64_pad = 0; + else if (dip->di_version >= 3) + dip->di_v3_pad = 0; + dip->di_flags = cpu_to_be16(flags); + dip->di_flags2 = cpu_to_be64(flags2); +} + +/* + * Blow out symlink; now it points nowhere. We don't have to worry about + * incore state because this inode is failing the verifiers. + */ +STATIC void +xrep_dinode_zap_symlink( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + char *p; + + trace_xrep_dinode_zap_symlink(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + dip->di_size = cpu_to_be64(1); + p = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + *p = '?'; + ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; +} + +/* + * Blow out dir, make the parent point to the root. In the future repair will + * reconstruct this directory for us. Note that there's no in-core directory + * inode because the sf verifier tripped, so we don't have to worry about the + * dentry cache. + */ +STATIC void +xrep_dinode_zap_dir( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_dir2_sf_hdr *sfp; + int i8count; + + trace_xrep_dinode_zap_dir(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM; + sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + sfp->count = 0; + sfp->i8count = i8count; + xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino); + dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count)); + ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED; +} + +/* Make sure we don't have a garbage file size. */ +STATIC void +xrep_dinode_size( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + uint64_t size = be64_to_cpu(dip->di_size); + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_size(sc, dip); + + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + /* di_size can't be nonzero for special files */ + dip->di_size = 0; + break; + case S_IFREG: + /* Regular files can't be larger than 2^63-1 bytes. */ + dip->di_size = cpu_to_be64(size & ~(1ULL << 63)); + break; + case S_IFLNK: + /* + * Truncate ridiculously oversized symlinks. If the size is + * zero, reset it to point to the current directory. Both of + * these conditions trigger dinode verifier errors, so there + * is no in-core state to reset. + */ + if (size > XFS_SYMLINK_MAXLEN) + dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN); + else if (size == 0) + xrep_dinode_zap_symlink(ri, dip); + break; + case S_IFDIR: + /* + * Directories can't have a size larger than 32G. If the size + * is zero, reset it to an empty directory. Both of these + * conditions trigger dinode verifier errors, so there is no + * in-core state to reset. + */ + if (size > XFS_DIR2_SPACE_SIZE) + dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE); + else if (size == 0) + xrep_dinode_zap_dir(ri, dip); + break; + } +} + +/* Fix extent size hints. */ +STATIC void +xrep_dinode_extsize_hints( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = sc->mp; + uint64_t flags2 = be64_to_cpu(dip->di_flags2); + uint16_t flags = be16_to_cpu(dip->di_flags); + uint16_t mode = be16_to_cpu(dip->di_mode); + + xfs_failaddr_t fa; + + trace_xrep_dinode_extsize_hints(sc, dip); + + fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), + mode, flags); + if (fa) { + dip->di_extsize = 0; + dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + } + + if (dip->di_version < 3) + return; + + fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) { + dip->di_cowextsize = 0; + dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE); + } +} + +/* Count extents and blocks for an inode given an rmap. */ +STATIC int +xrep_dinode_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_inode *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + /* We only care about this inode. */ + if (rec->rm_owner != ri->sc->sm->sm_ino) + return 0; + + if (rec->rm_flags & XFS_RMAP_ATTR_FORK) { + ri->attr_blocks += rec->rm_blockcount; + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + ri->attr_extents++; + + return 0; + } + + ri->data_blocks += rec->rm_blockcount; + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + ri->data_extents++; + + return 0; +} + +/* Count extents and blocks for an inode from all AG rmap data. */ +STATIC int +xrep_dinode_count_ag_rmaps( + struct xrep_inode *ri, + struct xfs_perag *pag) +{ + struct xfs_btree_cur *cur; + struct xfs_buf *agf; + int error; + + error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf); + if (error) + return error; + + cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag); + error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri); + xfs_btree_del_cursor(cur, error); + xfs_trans_brelse(ri->sc->tp, agf); + return error; +} + +/* Count extents and blocks for a given inode from all rmap data. */ +STATIC int +xrep_dinode_count_rmaps( + struct xrep_inode *ri) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error; + + if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) + return -EOPNOTSUPP; + + for_each_perag(ri->sc->mp, agno, pag) { + error = xrep_dinode_count_ag_rmaps(ri, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + /* Can't have extents on both the rt and the data device. */ + if (ri->data_extents && ri->rt_extents) + return -EFSCORRUPTED; + + trace_xrep_dinode_count_rmaps(ri->sc, + ri->data_blocks, ri->rt_blocks, ri->attr_blocks, + ri->data_extents, ri->rt_extents, ri->attr_extents); + return 0; +} + +/* Return true if this extents-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_extents_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmbt_irec new; + struct xfs_bmbt_rec *dp; + xfs_extnum_t nex; + bool isrt; + unsigned int i; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex > dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + dp = XFS_DFORK_PTR(dip, whichfork); + + isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME); + for (i = 0; i < nex; i++, dp++) { + xfs_failaddr_t fa; + + xfs_bmbt_disk_get_all(dp, &new); + fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork, + &new); + if (fa) + return true; + } + + return false; +} + +/* Return true if this btree-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_bmbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmdr_block *dfp; + xfs_extnum_t nex; + unsigned int i; + unsigned int dmxr; + unsigned int nrecs; + unsigned int level; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + if (dfork_size < sizeof(struct xfs_bmdr_block)) + return true; + + dfp = XFS_DFORK_PTR(dip, whichfork); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (nrecs == 0 || xfs_bmdr_space_calc(nrecs) > dfork_size) + return true; + if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) + return true; + + dmxr = xfs_bmdr_maxrecs(dfork_size, 0); + for (i = 1; i <= nrecs; i++) { + struct xfs_bmbt_key *fkp; + xfs_bmbt_ptr_t *fpp; + xfs_fileoff_t fileoff; + xfs_fsblock_t fsbno; + + fkp = xfs_bmdr_key_addr(dfp, i); + fileoff = be64_to_cpu(fkp->br_startoff); + if (!xfs_verify_fileoff(sc->mp, fileoff)) + return true; + + fpp = xfs_bmdr_ptr_addr(dfp, i, dmxr); + fsbno = be64_to_cpu(*fpp); + if (!xfs_verify_fsbno(sc->mp, fsbno)) + return true; + } + + return false; +} + +/* + * Check the data fork for things that will fail the ifork verifiers or the + * ifork formatters. + */ +STATIC bool +xrep_dinode_check_dfork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + uint16_t mode) +{ + void *dfork_ptr; + int64_t data_size; + unsigned int fmt; + unsigned int dfork_size; + + /* + * Verifier functions take signed int64_t, so check for bogus negative + * values first. + */ + data_size = be64_to_cpu(dip->di_size); + if (data_size < 0) + return true; + + fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK); + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (fmt != XFS_DINODE_FMT_DEV) + return true; + break; + case S_IFREG: + if (fmt == XFS_DINODE_FMT_LOCAL) + return true; + fallthrough; + case S_IFLNK: + case S_IFDIR: + switch (fmt) { + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return true; + } + break; + default: + return true; + } + + dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK); + dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + switch (fmt) { + case XFS_DINODE_FMT_DEV: + break; + case XFS_DINODE_FMT_LOCAL: + /* dir/symlink structure cannot be larger than the fork */ + if (data_size > dfork_size) + return true; + /* directory structure must pass verification. */ + if (S_ISDIR(mode) && + xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL) + return true; + /* symlink structure must pass verification. */ + if (S_ISLNK(mode) && + xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL) + return true; + break; + case XFS_DINODE_FMT_EXTENTS: + if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; + case XFS_DINODE_FMT_BTREE: + if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; + default: + return true; + } + + return false; +} + +static void +xrep_dinode_set_data_nextents( + struct xfs_dinode *dip, + xfs_extnum_t nextents) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + dip->di_big_nextents = cpu_to_be64(nextents); + else + dip->di_nextents = cpu_to_be32(nextents); +} + +static void +xrep_dinode_set_attr_nextents( + struct xfs_dinode *dip, + xfs_extnum_t nextents) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + dip->di_big_anextents = cpu_to_be32(nextents); + else + dip->di_anextents = cpu_to_be16(nextents); +} + +/* Reset the data fork to something sane. */ +STATIC void +xrep_dinode_zap_dfork( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_scrub *sc = ri->sc; + + trace_xrep_dinode_zap_dfork(sc, dip); + + ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED; + + xrep_dinode_set_data_nextents(dip, 0); + ri->data_blocks = 0; + ri->rt_blocks = 0; + + /* Special files always get reset to DEV */ + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + dip->di_format = XFS_DINODE_FMT_DEV; + dip->di_size = 0; + return; + } + + /* + * If we have data extents, reset to an empty map and hope the user + * will run the bmapbtd checker next. + */ + if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) { + dip->di_format = XFS_DINODE_FMT_EXTENTS; + return; + } + + /* Otherwise, reset the local format to the minimum. */ + switch (mode & S_IFMT) { + case S_IFLNK: + xrep_dinode_zap_symlink(ri, dip); + break; + case S_IFDIR: + xrep_dinode_zap_dir(ri, dip); + break; + } +} + +/* + * Check the attr fork for things that will fail the ifork verifiers or the + * ifork formatters. + */ +STATIC bool +xrep_dinode_check_afork( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_attr_sf_hdr *afork_ptr; + size_t attr_size; + unsigned int afork_size; + + if (XFS_DFORK_BOFF(dip) == 0) + return dip->di_aformat != XFS_DINODE_FMT_EXTENTS || + xfs_dfork_attr_extents(dip) != 0; + + afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); + afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); + + switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) { + case XFS_DINODE_FMT_LOCAL: + /* Fork has to be large enough to extract the xattr size. */ + if (afork_size < sizeof(struct xfs_attr_sf_hdr)) + return true; + + /* xattr structure cannot be larger than the fork */ + attr_size = be16_to_cpu(afork_ptr->totsize); + if (attr_size > afork_size) + return true; + + /* xattr structure must pass verification. */ + return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL; + case XFS_DINODE_FMT_EXTENTS: + if (xrep_dinode_bad_extents_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; + case XFS_DINODE_FMT_BTREE: + if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; + default: + return true; + } + + return false; +} + +/* + * Reset the attr fork to empty. Since the attr fork could have contained + * ACLs, make the file readable only by root. + */ +STATIC void +xrep_dinode_zap_afork( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_scrub *sc = ri->sc; + + trace_xrep_dinode_zap_afork(sc, dip); + + ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED; + + dip->di_aformat = XFS_DINODE_FMT_EXTENTS; + xrep_dinode_set_attr_nextents(dip, 0); + ri->attr_blocks = 0; + + /* + * If the data fork is in btree format, removing the attr fork entirely + * might cause verifier failures if the next level down in the bmbt + * could now fit in the data fork area. + */ + if (dip->di_format != XFS_DINODE_FMT_BTREE) + dip->di_forkoff = 0; + dip->di_mode = cpu_to_be16(mode & ~0777); + dip->di_uid = 0; + dip->di_gid = 0; +} + +/* Make sure the fork offset is a sensible value. */ +STATIC void +xrep_dinode_ensure_forkoff( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_bmdr_block *bmdr; + struct xfs_scrub *sc = ri->sc; + xfs_extnum_t attr_extents, data_extents; + size_t bmdr_minsz = xfs_bmdr_space_calc(1); + unsigned int lit_sz = XFS_LITINO(sc->mp); + unsigned int afork_min, dfork_min; + + trace_xrep_dinode_ensure_forkoff(sc, dip); + + /* + * Before calling this function, xrep_dinode_core ensured that both + * forks actually fit inside their respective literal areas. If this + * was not the case, the fork was reset to FMT_EXTENTS with zero + * records. If the rmapbt scan found attr or data fork blocks, this + * will be noted in the dinode_stats, and we must leave enough room + * for the bmap repair code to reconstruct the mapping structure. + * + * First, compute the minimum space required for the attr fork. + */ + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + /* + * If we still have a shortform xattr structure at all, that + * means the attr fork area was exactly large enough to fit + * the sf structure. + */ + afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_EXTENTS: + attr_extents = xfs_dfork_attr_extents(dip); + if (attr_extents) { + /* + * We must maintain sufficient space to hold the entire + * extent map array in the data fork. Note that we + * previously zapped the fork if it had no chance of + * fitting in the inode. + */ + afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents; + } else if (ri->attr_extents > 0) { + /* + * The attr fork thinks it has zero extents, but we + * found some xattr extents. We need to leave enough + * empty space here so that the incore attr fork will + * get created (and hence trigger the attr fork bmap + * repairer). + */ + afork_min = bmdr_minsz; + } else { + /* No extents on disk or found in rmapbt. */ + afork_min = 0; + } + break; + case XFS_DINODE_FMT_BTREE: + /* Must have space for btree header and key/pointers. */ + bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); + afork_min = xfs_bmap_broot_space(sc->mp, bmdr); + break; + default: + /* We should never see any other formats. */ + afork_min = 0; + break; + } + + /* Compute the minimum space required for the data fork. */ + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + dfork_min = sizeof(__be32); + break; + case XFS_DINODE_FMT_UUID: + dfork_min = sizeof(uuid_t); + break; + case XFS_DINODE_FMT_LOCAL: + /* + * If we still have a shortform data fork at all, that means + * the data fork area was large enough to fit whatever was in + * there. + */ + dfork_min = be64_to_cpu(dip->di_size); + break; + case XFS_DINODE_FMT_EXTENTS: + data_extents = xfs_dfork_data_extents(dip); + if (data_extents) { + /* + * We must maintain sufficient space to hold the entire + * extent map array in the data fork. Note that we + * previously zapped the fork if it had no chance of + * fitting in the inode. + */ + dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents; + } else if (ri->data_extents > 0 || ri->rt_extents > 0) { + /* + * The data fork thinks it has zero extents, but we + * found some data extents. We need to leave enough + * empty space here so that the data fork bmap repair + * will recover the mappings. + */ + dfork_min = bmdr_minsz; + } else { + /* No extents on disk or found in rmapbt. */ + dfork_min = 0; + } + break; + case XFS_DINODE_FMT_BTREE: + /* Must have space for btree header and key/pointers. */ + bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + dfork_min = xfs_bmap_broot_space(sc->mp, bmdr); + break; + default: + dfork_min = 0; + break; + } + + /* + * Round all values up to the nearest 8 bytes, because that is the + * precision of di_forkoff. + */ + afork_min = roundup(afork_min, 8); + dfork_min = roundup(dfork_min, 8); + bmdr_minsz = roundup(bmdr_minsz, 8); + + ASSERT(dfork_min <= lit_sz); + ASSERT(afork_min <= lit_sz); + + /* + * If the data fork was zapped and we don't have enough space for the + * recovery fork, move the attr fork up. + */ + if (dip->di_format == XFS_DINODE_FMT_EXTENTS && + xfs_dfork_data_extents(dip) == 0 && + (ri->data_extents > 0 || ri->rt_extents > 0) && + bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) { + if (bmdr_minsz + afork_min > lit_sz) { + /* + * The attr for and the stub fork we need to recover + * the data fork won't both fit. Zap the attr fork. + */ + xrep_dinode_zap_afork(ri, dip, mode); + afork_min = bmdr_minsz; + } else { + void *before, *after; + + /* Otherwise, just slide the attr fork up. */ + before = XFS_DFORK_APTR(dip); + dip->di_forkoff = bmdr_minsz >> 3; + after = XFS_DFORK_APTR(dip); + memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp)); + } + } + + /* + * If the attr fork was zapped and we don't have enough space for the + * recovery fork, move the attr fork down. + */ + if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS && + xfs_dfork_attr_extents(dip) == 0 && + ri->attr_extents > 0 && + bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) { + if (dip->di_format == XFS_DINODE_FMT_BTREE) { + /* + * If the data fork is in btree format then we can't + * adjust forkoff because that runs the risk of + * violating the extents/btree format transition rules. + */ + } else if (bmdr_minsz + dfork_min > lit_sz) { + /* + * If we can't move the attr fork, too bad, we lose the + * attr fork and leak its blocks. + */ + xrep_dinode_zap_afork(ri, dip, mode); + } else { + /* + * Otherwise, just slide the attr fork down. The attr + * fork is empty, so we don't have any old contents to + * move here. + */ + dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3; + } + } +} + +/* + * Zap the data/attr forks if we spot anything that isn't going to pass the + * ifork verifiers or the ifork formatters, because we need to get the inode + * into good enough shape that the higher level repair functions can run. + */ +STATIC void +xrep_dinode_zap_forks( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + xfs_extnum_t data_extents; + xfs_extnum_t attr_extents; + xfs_filblks_t nblocks; + uint16_t mode; + bool zap_datafork = false; + bool zap_attrfork = ri->zap_acls; + + trace_xrep_dinode_zap_forks(sc, dip); + + mode = be16_to_cpu(dip->di_mode); + + data_extents = xfs_dfork_data_extents(dip); + attr_extents = xfs_dfork_attr_extents(dip); + nblocks = be64_to_cpu(dip->di_nblocks); + + /* Inode counters don't make sense? */ + if (data_extents > nblocks) + zap_datafork = true; + if (attr_extents > nblocks) + zap_attrfork = true; + if (data_extents + attr_extents > nblocks) + zap_datafork = zap_attrfork = true; + + if (!zap_datafork) + zap_datafork = xrep_dinode_check_dfork(sc, dip, mode); + if (!zap_attrfork) + zap_attrfork = xrep_dinode_check_afork(sc, dip); + + /* Zap whatever's bad. */ + if (zap_attrfork) + xrep_dinode_zap_afork(ri, dip, mode); + if (zap_datafork) + xrep_dinode_zap_dfork(ri, dip, mode); + xrep_dinode_ensure_forkoff(ri, dip, mode); + + /* + * Zero di_nblocks if we don't have any extents at all to satisfy the + * buffer verifier. + */ + data_extents = xfs_dfork_data_extents(dip); + attr_extents = xfs_dfork_attr_extents(dip); + if (data_extents + attr_extents == 0) + dip->di_nblocks = 0; +} + +/* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ +STATIC int +xrep_dinode_core( + struct xrep_inode *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_buf *bp; + struct xfs_dinode *dip; + xfs_ino_t ino = sc->sm->sm_ino; + int error; + int iget_error; + + /* Figure out what this inode had mapped in both forks. */ + error = xrep_dinode_count_rmaps(ri); + if (error) + return error; + + /* Read the inode cluster buffer. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, + ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, + NULL); + if (error) + return error; + + /* Make sure we can pass the inode buffer verifier. */ + xrep_dinode_buf(sc, bp); + bp->b_ops = &xfs_inode_buf_ops; + + /* Fix everything the verifier will complain about. */ + dip = xfs_buf_offset(bp, ri->imap.im_boffset); + xrep_dinode_header(sc, dip); + iget_error = xrep_dinode_mode(ri, dip); + if (iget_error) + goto write; + xrep_dinode_nlinks(dip); + xrep_dinode_flags(sc, dip, ri->rt_extents > 0); + xrep_dinode_size(ri, dip); + xrep_dinode_extsize_hints(sc, dip); + xrep_dinode_zap_forks(ri, dip); + +write: + /* Write out the inode. */ + trace_xrep_dinode_fixed(sc, dip); + xfs_dinode_calc_crc(sc->mp, dip); + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF); + xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset, + ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1); + + /* + * In theory, we've fixed the ondisk inode record enough that we should + * be able to load the inode into the cache. Try to iget that inode + * now while we hold the AGI and the inode cluster buffer and take the + * IOLOCK so that we can continue with repairs without anyone else + * accessing the inode. If iget fails, we still need to commit the + * changes. + */ + if (!iget_error) + iget_error = xchk_iget(sc, ino, &sc->ip); + if (!iget_error) + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + /* + * Commit the inode cluster buffer updates and drop the AGI buffer that + * we've been holding since scrub setup. From here on out, repairs + * deal only with the cached inode. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + + if (iget_error) + return iget_error; + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + error = xrep_ino_dqattach(sc); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (ri->ino_sick_mask) + xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask); + return 0; +} + +/* Fix everything xfs_dinode_verify cares about. */ +STATIC int +xrep_dinode_problems( + struct xrep_inode *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + error = xrep_dinode_core(ri); + if (error) + return error; + + /* We had to fix a totally busted inode, schedule quotacheck. */ + if (XFS_IS_UQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); + if (XFS_IS_GQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); + if (XFS_IS_PQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); + + return 0; +} + +/* + * Fix problems that the verifiers don't care about. In general these are + * errors that don't cause problems elsewhere in the kernel that we can easily + * detect, so we don't check them all that rigorously. + */ + +/* Make sure block and extent counts are ok. */ +STATIC int +xrep_inode_blockcounts( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + xfs_filblks_t count; + xfs_filblks_t acount; + xfs_extnum_t nextents; + int error; + + trace_xrep_inode_blockcounts(sc); + + /* Set data fork counters from the data fork mappings. */ + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, + &nextents, &count); + if (error) + return error; + if (xfs_is_reflink_inode(sc->ip)) { + /* + * data fork blockcount can exceed physical storage if a user + * reflinks the same block over and over again. + */ + ; + } else if (XFS_IS_REALTIME_INODE(sc->ip)) { + if (count >= sc->mp->m_sb.sb_rblocks) + return -EFSCORRUPTED; + } else { + if (count >= sc->mp->m_sb.sb_dblocks) + return -EFSCORRUPTED; + } + error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents); + if (error) + return error; + sc->ip->i_df.if_nextents = nextents; + + /* Set attr fork counters from the attr fork mappings. */ + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + if (ifp) { + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, + &nextents, &acount); + if (error) + return error; + if (count >= sc->mp->m_sb.sb_dblocks) + return -EFSCORRUPTED; + error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK, + nextents); + if (error) + return error; + ifp->if_nextents = nextents; + } else { + acount = 0; + } + + sc->ip->i_nblocks = count + acount; + return 0; +} + +/* Check for invalid uid/gid/prid. */ +STATIC void +xrep_inode_ids( + struct xfs_scrub *sc) +{ + bool dirty = false; + + trace_xrep_inode_ids(sc); + + if (!uid_valid(VFS_I(sc->ip)->i_uid)) { + i_uid_write(VFS_I(sc->ip), 0); + dirty = true; + if (XFS_IS_UQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); + } + + if (!gid_valid(VFS_I(sc->ip)->i_gid)) { + i_gid_write(VFS_I(sc->ip), 0); + dirty = true; + if (XFS_IS_GQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); + } + + if (sc->ip->i_projid == -1U) { + sc->ip->i_projid = 0; + dirty = true; + if (XFS_IS_PQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); + } + + /* strip setuid/setgid if we touched any of the ids */ + if (dirty) + VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID); +} + +static inline void +xrep_clamp_timestamp( + struct xfs_inode *ip, + struct timespec64 *ts) +{ + ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC); + *ts = timestamp_truncate(*ts, VFS_I(ip)); +} + +/* Nanosecond counters can't have more than 1 billion. */ +STATIC void +xrep_inode_timestamps( + struct xfs_inode *ip) +{ + struct timespec64 tstamp; + struct inode *inode = VFS_I(ip); + + tstamp = inode_get_atime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_atime_to_ts(inode, tstamp); + + tstamp = inode_get_mtime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_mtime_to_ts(inode, tstamp); + + tstamp = inode_get_ctime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_ctime_to_ts(inode, tstamp); + + xrep_clamp_timestamp(ip, &ip->i_crtime); +} + +/* Fix inode flags that don't make sense together. */ +STATIC void +xrep_inode_flags( + struct xfs_scrub *sc) +{ + uint16_t mode; + + trace_xrep_inode_flags(sc); + + mode = VFS_I(sc->ip)->i_mode; + + /* Clear junk flags */ + if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY) + sc->ip->i_diflags &= ~XFS_DIFLAG_ANY; + + /* NEWRTBM only applies to realtime bitmaps */ + if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino) + sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM; + else + sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM; + + /* These only make sense for directories. */ + if (!S_ISDIR(mode)) + sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT | + XFS_DIFLAG_EXTSZINHERIT | + XFS_DIFLAG_PROJINHERIT | + XFS_DIFLAG_NOSYMLINKS); + + /* These only make sense for files. */ + if (!S_ISREG(mode)) + sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME | + XFS_DIFLAG_EXTSIZE); + + /* These only make sense for non-rt files. */ + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) + sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM; + + /* Immutable and append only? Drop the append. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) && + (sc->ip->i_diflags & XFS_DIFLAG_APPEND)) + sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND; + + /* Clear junk flags. */ + if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY; + + /* No reflink flag unless we support it and it's a file. */ + if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode)) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + + /* DAX only applies to files and dirs. */ + if (!(S_ISREG(mode) || S_ISDIR(mode))) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; + + /* No reflink files on the realtime device. */ + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; +} + +/* + * Fix size problems with block/node format directories. If we fail to find + * the extent list, just bail out and let the bmapbtd repair functions clean + * up that mess. + */ +STATIC void +xrep_inode_blockdir_size( + struct xfs_scrub *sc) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + xfs_fileoff_t off; + int error; + + trace_xrep_inode_blockdir_size(sc); + + error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK); + if (error) + return; + + /* Find the last block before 32G; this is the dir size. */ + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE); + if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) { + /* zero-extents directory? */ + return; + } + + off = got.br_startoff + got.br_blockcount; + sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE, + XFS_FSB_TO_B(sc->mp, off)); +} + +/* Fix size problems with short format directories. */ +STATIC void +xrep_inode_sfdir_size( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + + trace_xrep_inode_sfdir_size(sc); + + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + sc->ip->i_disk_size = ifp->if_bytes; +} + +/* + * Fix any irregularities in a directory inode's size now that we can iterate + * extent maps and access other regular inode data. + */ +STATIC void +xrep_inode_dir_size( + struct xfs_scrub *sc) +{ + trace_xrep_inode_dir_size(sc); + + switch (sc->ip->i_df.if_format) { + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + xrep_inode_blockdir_size(sc); + break; + case XFS_DINODE_FMT_LOCAL: + xrep_inode_sfdir_size(sc); + break; + } +} + +/* Fix extent size hint problems. */ +STATIC void +xrep_inode_extsize( + struct xfs_scrub *sc) +{ + /* Fix misaligned extent size hints on a directory. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) { + sc->ip->i_extsize = 0; + sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT; + } +} + +/* Ensure this file has an attr fork if it needs to hold a parent pointer. */ +STATIC int +xrep_inode_pptr( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct inode *inode = VFS_I(ip); + + if (!xfs_has_parent(mp)) + return 0; + + /* + * Unlinked inodes that cannot be added to the directory tree will not + * have a parent pointer. + */ + if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + return 0; + + /* The root directory doesn't have a parent pointer. */ + if (ip == mp->m_rootip) + return 0; + + /* + * Metadata inodes are rooted in the superblock and do not have any + * parents. + */ + if (xfs_is_metadata_inode(ip)) + return 0; + + /* Inode already has an attr fork; no further work possible here. */ + if (xfs_inode_has_attr_fork(ip)) + return 0; + + return xfs_bmap_add_attrfork(sc->tp, ip, + sizeof(struct xfs_attr_sf_hdr), true); +} + +/* Fix any irregularities in an inode that the verifiers don't catch. */ +STATIC int +xrep_inode_problems( + struct xfs_scrub *sc) +{ + int error; + + error = xrep_inode_blockcounts(sc); + if (error) + return error; + error = xrep_inode_pptr(sc); + if (error) + return error; + xrep_inode_timestamps(sc->ip); + xrep_inode_flags(sc); + xrep_inode_ids(sc); + /* + * We can now do a better job fixing the size of a directory now that + * we can scan the data fork extents than we could in xrep_dinode_size. + */ + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + xrep_inode_dir_size(sc); + xrep_inode_extsize(sc); + + trace_xrep_inode_fixed(sc); + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return xrep_roll_trans(sc); +} + +/* + * Make sure this inode's unlinked list pointers are consistent with its + * link count. + */ +STATIC int +xrep_inode_unlinked( + struct xfs_scrub *sc) +{ + unsigned int nlink = VFS_I(sc->ip)->i_nlink; + int error; + + /* + * If this inode is linked from the directory tree and on the unlinked + * list, remove it from the unlinked list. + */ + if (nlink > 0 && xfs_inode_on_unlinked_list(sc->ip)) { + struct xfs_perag *pag; + int error; + + pag = xfs_perag_get(sc->mp, + XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino)); + error = xfs_iunlink_remove(sc->tp, pag, sc->ip); + xfs_perag_put(pag); + if (error) + return error; + } + + /* + * If this inode is not linked from the directory tree yet not on the + * unlinked list, put it on the unlinked list. + */ + if (nlink == 0 && !xfs_inode_on_unlinked_list(sc->ip)) { + error = xfs_iunlink(sc->tp, sc->ip); + if (error) + return error; + } + + return 0; +} + +/* Repair an inode's fields. */ +int +xrep_inode( + struct xfs_scrub *sc) +{ + int error = 0; + + /* + * No inode? That means we failed the _iget verifiers. Repair all + * the things that the inode verifiers care about, then retry _iget. + */ + if (!sc->ip) { + struct xrep_inode *ri = sc->buf; + + ASSERT(ri != NULL); + + error = xrep_dinode_problems(ri); + if (error == -EBUSY) { + /* + * Directory scan to recover inode mode encountered a + * busy inode, so we did not continue repairing things. + */ + return 0; + } + if (error) + return error; + + /* By this point we had better have a working incore inode. */ + if (!sc->ip) + return -EFSCORRUPTED; + } + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* If we found corruption of any kind, try to fix it. */ + if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) { + error = xrep_inode_problems(sc); + if (error) + return error; + } + + /* See if we can clear the reflink flag. */ + if (xfs_is_reflink_inode(sc->ip)) { + error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); + if (error) + return error; + } + + /* Reconnect incore unlinked list */ + error = xrep_inode_unlinked(sc); + if (error) + return error; + + return xrep_defer_finish(sc); +} diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c new file mode 100644 index 000000000000..cf9d983667ce --- /dev/null +++ b/fs/xfs/scrub/iscan.c @@ -0,0 +1,826 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_bit.h" +#include "xfs_icache.h" +#include "scrub/scrub.h" +#include "scrub/iscan.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* + * Live File Scan + * ============== + * + * Live file scans walk every inode in a live filesystem. This is more or + * less like a regular iwalk, except that when we're advancing the scan cursor, + * we must ensure that inodes cannot be added or deleted anywhere between the + * old cursor value and the new cursor value. If we're advancing the cursor + * by one inode, the caller must hold that inode; if we're finding the next + * inode to scan, we must grab the AGI and hold it until we've updated the + * scan cursor. + * + * Callers are expected to use this code to scan all files in the filesystem to + * construct a new metadata index of some kind. The scan races against other + * live updates, which means there must be a provision to update the new index + * when updates are made to inodes that already been scanned. The iscan lock + * can be used in live update hook code to stop the scan and protect this data + * structure. + * + * To keep the new index up to date with other metadata updates being made to + * the live filesystem, it is assumed that the caller will add hooks as needed + * to be notified when a metadata update occurs. The inode scanner must tell + * the hook code when an inode has been visited with xchk_iscan_mark_visit. + * Hook functions can use xchk_iscan_want_live_update to decide if the + * scanner's observations must be updated. + */ + +/* + * If the inobt record @rec covers @iscan->skip_ino, mark the inode free so + * that the scan ignores that inode. + */ +STATIC void +xchk_iscan_mask_skipino( + struct xchk_iscan *iscan, + struct xfs_perag *pag, + struct xfs_inobt_rec_incore *rec, + xfs_agino_t lastrecino) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino); + xfs_agnumber_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino); + + if (pag->pag_agno != skip_agno) + return; + if (skip_agino < rec->ir_startino) + return; + if (skip_agino > lastrecino) + return; + + rec->ir_free |= xfs_inobt_maskn(skip_agino - rec->ir_startino, 1); +} + +/* + * Set *cursor to the next allocated inode after whatever it's set to now. + * If there are no more inodes in this AG, cursor is set to NULLAGINO. + */ +STATIC int +xchk_iscan_find_next( + struct xchk_iscan *iscan, + struct xfs_buf *agi_bp, + struct xfs_perag *pag, + xfs_inofree_t *allocmaskp, + xfs_agino_t *cursor, + uint8_t *nr_inodesp) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_inobt_rec_incore rec; + struct xfs_btree_cur *cur; + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + xfs_agnumber_t agno = pag->pag_agno; + xfs_agino_t lastino = NULLAGINO; + xfs_agino_t first, last; + xfs_agino_t agino = *cursor; + int has_rec; + int error; + + /* If the cursor is beyond the end of this AG, move to the next one. */ + xfs_agino_range(mp, agno, &first, &last); + if (agino > last) { + *cursor = NULLAGINO; + return 0; + } + + /* + * Look up the inode chunk for the current cursor position. If there + * is no chunk here, we want the next one. + */ + cur = xfs_inobt_init_cursor(pag, tp, agi_bp); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec); + if (!error && !has_rec) + error = xfs_btree_increment(cur, 0, &has_rec); + for (; !error; error = xfs_btree_increment(cur, 0, &has_rec)) { + xfs_inofree_t allocmask; + + /* + * If we've run out of inobt records in this AG, move the + * cursor on to the next AG and exit. The caller can try + * again with the next AG. + */ + if (!has_rec) { + *cursor = NULLAGINO; + break; + } + + error = xfs_inobt_get_rec(cur, &rec, &has_rec); + if (error) + break; + if (!has_rec) { + error = -EFSCORRUPTED; + break; + } + + /* Make sure that we always move forward. */ + if (lastino != NULLAGINO && + XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) { + error = -EFSCORRUPTED; + break; + } + lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1; + + /* + * If this record only covers inodes that come before the + * cursor, advance to the next record. + */ + if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + continue; + + if (iscan->skip_ino) + xchk_iscan_mask_skipino(iscan, pag, &rec, lastino); + + /* + * If the incoming lookup put us in the middle of an inobt + * record, mark it and the previous inodes "free" so that the + * search for allocated inodes will start at the cursor. + * We don't care about ir_freecount here. + */ + if (agino >= rec.ir_startino) + rec.ir_free |= xfs_inobt_maskn(0, + agino + 1 - rec.ir_startino); + + /* + * If there are allocated inodes in this chunk, find them + * and update the scan cursor. + */ + allocmask = ~rec.ir_free; + if (hweight64(allocmask) > 0) { + int next = xfs_lowbit64(allocmask); + + ASSERT(next >= 0); + *cursor = rec.ir_startino + next; + *allocmaskp = allocmask >> next; + *nr_inodesp = XFS_INODES_PER_CHUNK - next; + break; + } + } + + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Advance both the scan and the visited cursors. + * + * The inumber address space for a given filesystem is sparse, which means that + * the scan cursor can jump a long ways in a single iter() call. There are no + * inodes in these sparse areas, so we must move the visited cursor forward at + * the same time so that the scan user can receive live updates for inodes that + * may get created once we release the AGI buffer. + */ +static inline void +xchk_iscan_move_cursor( + struct xchk_iscan *iscan, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t cursor, visited; + + BUILD_BUG_ON(XFS_MAXINUMBER == NULLFSINO); + + /* + * Special-case ino == 0 here so that we never set visited_ino to + * NULLFSINO when wrapping around EOFS, for that will let through all + * live updates. + */ + cursor = XFS_AGINO_TO_INO(mp, agno, agino); + if (cursor == 0) + visited = XFS_MAXINUMBER; + else + visited = cursor - 1; + + mutex_lock(&iscan->lock); + iscan->cursor_ino = cursor; + iscan->__visited_ino = visited; + trace_xchk_iscan_move_cursor(iscan); + mutex_unlock(&iscan->lock); +} + +/* + * Prepare to return agno/agino to the iscan caller by moving the lastino + * cursor to the previous inode. Do this while we still hold the AGI so that + * no other threads can create or delete inodes in this AG. + */ +static inline void +xchk_iscan_finish( + struct xchk_iscan *iscan) +{ + mutex_lock(&iscan->lock); + iscan->cursor_ino = NULLFSINO; + + /* All live updates will be applied from now on */ + iscan->__visited_ino = NULLFSINO; + + mutex_unlock(&iscan->lock); +} + +/* Mark an inode scan finished before we actually scan anything. */ +void +xchk_iscan_finish_early( + struct xchk_iscan *iscan) +{ + ASSERT(iscan->cursor_ino == iscan->scan_start_ino); + ASSERT(iscan->__visited_ino == iscan->scan_start_ino); + + xchk_iscan_finish(iscan); +} + +/* + * Grab the AGI to advance the inode scan. Returns 0 if *agi_bpp is now set, + * -ECANCELED if the live scan aborted, -EBUSY if the AGI could not be grabbed, + * or the usual negative errno. + */ +STATIC int +xchk_iscan_read_agi( + struct xchk_iscan *iscan, + struct xfs_perag *pag, + struct xfs_buf **agi_bpp) +{ + struct xfs_scrub *sc = iscan->sc; + unsigned long relax; + int ret; + + if (!xchk_iscan_agi_needs_trylock(iscan)) + return xfs_ialloc_read_agi(pag, sc->tp, 0, agi_bpp); + + relax = msecs_to_jiffies(iscan->iget_retry_delay); + do { + ret = xfs_ialloc_read_agi(pag, sc->tp, XFS_IALLOC_FLAG_TRYLOCK, + agi_bpp); + if (ret != -EAGAIN) + return ret; + if (!iscan->iget_timeout || + time_is_before_jiffies(iscan->__iget_deadline)) + return -EBUSY; + + trace_xchk_iscan_agi_retry_wait(iscan); + } while (!schedule_timeout_killable(relax) && + !xchk_iscan_aborted(iscan)); + return -ECANCELED; +} + +/* + * Advance ino to the next inode that the inobt thinks is allocated, being + * careful to jump to the next AG if we've reached the right end of this AG's + * inode btree. Advancing ino effectively means that we've pushed the inode + * scan forward, so set the iscan cursor to (ino - 1) so that our live update + * predicates will track inode allocations in that part of the inode number + * key space once we release the AGI buffer. + * + * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes, + * -ECANCELED if the live scan aborted, or the usual negative errno. + */ +STATIC int +xchk_iscan_advance( + struct xchk_iscan *iscan, + struct xfs_perag **pagp, + struct xfs_buf **agi_bpp, + xfs_inofree_t *allocmaskp, + uint8_t *nr_inodesp) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agi_bp; + struct xfs_perag *pag; + xfs_agnumber_t agno; + xfs_agino_t agino; + int ret; + + ASSERT(iscan->cursor_ino >= iscan->__visited_ino); + + do { + if (xchk_iscan_aborted(iscan)) + return -ECANCELED; + + agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino); + pag = xfs_perag_get(mp, agno); + if (!pag) + return -ECANCELED; + + ret = xchk_iscan_read_agi(iscan, pag, &agi_bp); + if (ret) + goto out_pag; + + agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino); + ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp, + &agino, nr_inodesp); + if (ret) + goto out_buf; + + if (agino != NULLAGINO) { + /* + * Found the next inode in this AG, so return it along + * with the AGI buffer and the perag structure to + * ensure it cannot go away. + */ + xchk_iscan_move_cursor(iscan, agno, agino); + *agi_bpp = agi_bp; + *pagp = pag; + return 1; + } + + /* + * Did not find any more inodes in this AG, move on to the next + * AG. + */ + agno = (agno + 1) % mp->m_sb.sb_agcount; + xchk_iscan_move_cursor(iscan, agno, 0); + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + + trace_xchk_iscan_advance_ag(iscan); + } while (iscan->cursor_ino != iscan->scan_start_ino); + + xchk_iscan_finish(iscan); + return 0; + +out_buf: + xfs_trans_brelse(sc->tp, agi_bp); +out_pag: + xfs_perag_put(pag); + return ret; +} + +/* + * Grabbing the inode failed, so we need to back up the scan and ask the caller + * to try to _advance the scan again. Returns -EBUSY if we've run out of retry + * opportunities, -ECANCELED if the process has a fatal signal pending, or + * -EAGAIN if we should try again. + */ +STATIC int +xchk_iscan_iget_retry( + struct xchk_iscan *iscan, + bool wait) +{ + ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1); + + if (!iscan->iget_timeout || + time_is_before_jiffies(iscan->__iget_deadline)) + return -EBUSY; + + if (wait) { + unsigned long relax; + + /* + * Sleep for a period of time to let the rest of the system + * catch up. If we return early, someone sent a kill signal to + * the calling process. + */ + relax = msecs_to_jiffies(iscan->iget_retry_delay); + trace_xchk_iscan_iget_retry_wait(iscan); + + if (schedule_timeout_killable(relax) || + xchk_iscan_aborted(iscan)) + return -ECANCELED; + } + + iscan->cursor_ino--; + return -EAGAIN; +} + +/* + * For an inode scan, we hold the AGI and want to try to grab a batch of + * inodes. Holding the AGI prevents inodegc from clearing freed inodes, + * so we must use noretry here. For every inode after the first one in the + * batch, we don't want to wait, so we use retry there too. Finally, use + * dontcache to avoid polluting the cache. + */ +#define ISCAN_IGET_FLAGS (XFS_IGET_NORETRY | XFS_IGET_DONTCACHE) + +/* + * Grab an inode as part of an inode scan. While scanning this inode, the + * caller must ensure that no other threads can modify the inode until a call + * to xchk_iscan_visit succeeds. + * + * Returns the number of incore inodes grabbed; -EAGAIN if the caller should + * call again xchk_iscan_advance; -EBUSY if we couldn't grab an inode; + * -ECANCELED if there's a fatal signal pending; or some other negative errno. + */ +STATIC int +xchk_iscan_iget( + struct xchk_iscan *iscan, + struct xfs_perag *pag, + struct xfs_buf *agi_bp, + xfs_inofree_t allocmask, + uint8_t nr_inodes) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t ino = iscan->cursor_ino; + unsigned int idx = 0; + unsigned int i; + int error; + + ASSERT(iscan->__inodes[0] == NULL); + + /* Fill the first slot in the inode array. */ + error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0, + &iscan->__inodes[idx]); + + trace_xchk_iscan_iget(iscan, error); + + if (error == -ENOENT || error == -EAGAIN) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + + /* + * It's possible that this inode has lost all of its links but + * hasn't yet been inactivated. If we don't have a transaction + * or it's not writable, flush the inodegc workers and wait. + * If we have a non-empty transaction, we must not block on + * inodegc, which allocates its own transactions. + */ + if (sc->tp && !(sc->tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) + xfs_inodegc_push(mp); + else + xfs_inodegc_flush(mp); + return xchk_iscan_iget_retry(iscan, true); + } + + if (error == -EINVAL) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + + /* + * We thought the inode was allocated, but the inode btree + * lookup failed, which means that it was freed since the last + * time we advanced the cursor. Back up and try again. This + * should never happen since still hold the AGI buffer from the + * inobt check, but we need to be careful about infinite loops. + */ + return xchk_iscan_iget_retry(iscan, false); + } + + if (error) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + return error; + } + idx++; + ino++; + allocmask >>= 1; + + /* + * Now that we've filled the first slot in __inodes, try to fill the + * rest of the batch with consecutively ordered inodes. to reduce the + * number of _iter calls. Make a bitmap of unallocated inodes from the + * zeroes in the inuse bitmap; these inodes will not be scanned, but + * the _want_live_update predicate will pass through all live updates. + * + * If we can't iget an allocated inode, stop and return what we have. + */ + mutex_lock(&iscan->lock); + iscan->__batch_ino = ino - 1; + iscan->__skipped_inomask = 0; + mutex_unlock(&iscan->lock); + + for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) { + if (!(allocmask & 1)) { + ASSERT(!(iscan->__skipped_inomask & (1ULL << i))); + + mutex_lock(&iscan->lock); + iscan->cursor_ino = ino; + iscan->__skipped_inomask |= (1ULL << i); + mutex_unlock(&iscan->lock); + continue; + } + + ASSERT(iscan->__inodes[idx] == NULL); + + error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0, + &iscan->__inodes[idx]); + if (error) + break; + + mutex_lock(&iscan->lock); + iscan->cursor_ino = ino; + mutex_unlock(&iscan->lock); + idx++; + } + + trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx); + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + return idx; +} + +/* + * Advance the visit cursor to reflect skipped inodes beyond whatever we + * scanned. + */ +STATIC void +xchk_iscan_finish_batch( + struct xchk_iscan *iscan) +{ + xfs_ino_t highest_skipped; + + mutex_lock(&iscan->lock); + + if (iscan->__batch_ino != NULLFSINO) { + highest_skipped = iscan->__batch_ino + + xfs_highbit64(iscan->__skipped_inomask); + iscan->__visited_ino = max(iscan->__visited_ino, + highest_skipped); + + trace_xchk_iscan_skip(iscan); + } + + iscan->__batch_ino = NULLFSINO; + iscan->__skipped_inomask = 0; + + mutex_unlock(&iscan->lock); +} + +/* + * Advance the inode scan cursor to the next allocated inode and return up to + * 64 consecutive allocated inodes starting with the cursor position. + */ +STATIC int +xchk_iscan_iter_batch( + struct xchk_iscan *iscan) +{ + struct xfs_scrub *sc = iscan->sc; + int ret; + + xchk_iscan_finish_batch(iscan); + + if (iscan->iget_timeout) + iscan->__iget_deadline = jiffies + + msecs_to_jiffies(iscan->iget_timeout); + + do { + struct xfs_buf *agi_bp = NULL; + struct xfs_perag *pag = NULL; + xfs_inofree_t allocmask = 0; + uint8_t nr_inodes = 0; + + ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask, + &nr_inodes); + if (ret != 1) + return ret; + + if (xchk_iscan_aborted(iscan)) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + ret = -ECANCELED; + break; + } + + ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes); + } while (ret == -EAGAIN); + + return ret; +} + +/* + * Advance the inode scan cursor to the next allocated inode and return the + * incore inode structure associated with it. + * + * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes, + * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be + * grabbed, or the usual negative errno. + * + * If the function returns -EBUSY and the caller can handle skipping an inode, + * it may call this function again to continue the scan with the next allocated + * inode. + */ +int +xchk_iscan_iter( + struct xchk_iscan *iscan, + struct xfs_inode **ipp) +{ + unsigned int i; + int error; + + /* Find a cached inode, or go get another batch. */ + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { + if (iscan->__inodes[i]) + goto foundit; + } + + error = xchk_iscan_iter_batch(iscan); + if (error <= 0) + return error; + + ASSERT(iscan->__inodes[0] != NULL); + i = 0; + +foundit: + /* Give the caller our reference. */ + *ipp = iscan->__inodes[i]; + iscan->__inodes[i] = NULL; + return 1; +} + +/* Clean up an xfs_iscan_iter call by dropping any inodes that we still hold. */ +void +xchk_iscan_iter_finish( + struct xchk_iscan *iscan) +{ + struct xfs_scrub *sc = iscan->sc; + unsigned int i; + + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { + if (iscan->__inodes[i]) { + xchk_irele(sc, iscan->__inodes[i]); + iscan->__inodes[i] = NULL; + } + } +} + +/* Mark this inode scan finished and release resources. */ +void +xchk_iscan_teardown( + struct xchk_iscan *iscan) +{ + xchk_iscan_iter_finish(iscan); + xchk_iscan_finish(iscan); + mutex_destroy(&iscan->lock); +} + +/* Pick an AG from which to start a scan. */ +static inline xfs_ino_t +xchk_iscan_rotor( + struct xfs_mount *mp) +{ + static atomic_t agi_rotor; + unsigned int r = atomic_inc_return(&agi_rotor) - 1; + + /* + * Rotoring *backwards* through the AGs, so we add one here before + * subtracting from the agcount to arrive at an AG number. + */ + r = (r % mp->m_sb.sb_agcount) + 1; + + return XFS_AGINO_TO_INO(mp, mp->m_sb.sb_agcount - r, 0); +} + +/* + * Set ourselves up to start an inode scan. If the @iget_timeout and + * @iget_retry_delay parameters are set, the scan will try to iget each inode + * for @iget_timeout milliseconds. If an iget call indicates that the inode is + * waiting to be inactivated, the CPU will relax for @iget_retry_delay + * milliseconds after pushing the inactivation workers. + */ +void +xchk_iscan_start( + struct xfs_scrub *sc, + unsigned int iget_timeout, + unsigned int iget_retry_delay, + struct xchk_iscan *iscan) +{ + xfs_ino_t start_ino; + + start_ino = xchk_iscan_rotor(sc->mp); + + iscan->__batch_ino = NULLFSINO; + iscan->__skipped_inomask = 0; + + iscan->sc = sc; + clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); + iscan->iget_timeout = iget_timeout; + iscan->iget_retry_delay = iget_retry_delay; + iscan->__visited_ino = start_ino; + iscan->cursor_ino = start_ino; + iscan->scan_start_ino = start_ino; + mutex_init(&iscan->lock); + memset(iscan->__inodes, 0, sizeof(iscan->__inodes)); + + trace_xchk_iscan_start(iscan, start_ino); +} + +/* + * Mark this inode as having been visited. Callers must hold a sufficiently + * exclusive lock on the inode to prevent concurrent modifications. + */ +void +xchk_iscan_mark_visited( + struct xchk_iscan *iscan, + struct xfs_inode *ip) +{ + mutex_lock(&iscan->lock); + iscan->__visited_ino = ip->i_ino; + trace_xchk_iscan_visit(iscan); + mutex_unlock(&iscan->lock); +} + +/* + * Did we skip this inode because it wasn't allocated when we loaded the batch? + * If so, it is newly allocated and will not be scanned. All live updates to + * this inode must be passed to the caller to maintain scan correctness. + */ +static inline bool +xchk_iscan_skipped( + const struct xchk_iscan *iscan, + xfs_ino_t ino) +{ + if (iscan->__batch_ino == NULLFSINO) + return false; + if (ino < iscan->__batch_ino) + return false; + if (ino >= iscan->__batch_ino + XFS_INODES_PER_CHUNK) + return false; + + return iscan->__skipped_inomask & (1ULL << (ino - iscan->__batch_ino)); +} + +/* + * Do we need a live update for this inode? This is true if the scanner thread + * has visited this inode and the scan hasn't been aborted due to errors. + * Callers must hold a sufficiently exclusive lock on the inode to prevent + * scanners from reading any inode metadata. + */ +bool +xchk_iscan_want_live_update( + struct xchk_iscan *iscan, + xfs_ino_t ino) +{ + bool ret = false; + + if (xchk_iscan_aborted(iscan)) + return false; + + mutex_lock(&iscan->lock); + + trace_xchk_iscan_want_live_update(iscan, ino); + + /* Scan is finished, caller should receive all updates. */ + if (iscan->__visited_ino == NULLFSINO) { + ret = true; + goto unlock; + } + + /* + * No inodes have been visited yet, so the visited cursor points at the + * start of the scan range. The caller should not receive any updates. + */ + if (iscan->scan_start_ino == iscan->__visited_ino) { + ret = false; + goto unlock; + } + + /* + * This inode was not allocated at the time of the iscan batch. + * The caller should receive all updates. + */ + if (xchk_iscan_skipped(iscan, ino)) { + ret = true; + goto unlock; + } + + /* + * The visited cursor hasn't yet wrapped around the end of the FS. If + * @ino is inside the starred range, the caller should receive updates: + * + * 0 ------------ S ************ V ------------ EOFS + */ + if (iscan->scan_start_ino <= iscan->__visited_ino) { + if (ino >= iscan->scan_start_ino && + ino <= iscan->__visited_ino) + ret = true; + + goto unlock; + } + + /* + * The visited cursor wrapped around the end of the FS. If @ino is + * inside the starred range, the caller should receive updates: + * + * 0 ************ V ------------ S ************ EOFS + */ + if (ino >= iscan->scan_start_ino || ino <= iscan->__visited_ino) + ret = true; + +unlock: + mutex_unlock(&iscan->lock); + return ret; +} diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h new file mode 100644 index 000000000000..f9f47fa01a9e --- /dev/null +++ b/fs/xfs/scrub/iscan.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_ISCAN_H__ +#define __XFS_SCRUB_ISCAN_H__ + +struct xchk_iscan { + struct xfs_scrub *sc; + + /* Lock to protect the scan cursor. */ + struct mutex lock; + + /* + * This is the first inode in the inumber address space that we + * examined. When the scan wraps around back to here, the scan is + * finished. + */ + xfs_ino_t scan_start_ino; + + /* This is the inode that will be examined next. */ + xfs_ino_t cursor_ino; + + /* If nonzero and non-NULL, skip this inode when scanning. */ + xfs_ino_t skip_ino; + + /* + * This is the last inode that we've successfully scanned, either + * because the caller scanned it, or we moved the cursor past an empty + * part of the inode address space. Scan callers should only use the + * xchk_iscan_visit function to modify this. + */ + xfs_ino_t __visited_ino; + + /* Operational state of the livescan. */ + unsigned long __opstate; + + /* Give up on iterating @cursor_ino if we can't iget it by this time. */ + unsigned long __iget_deadline; + + /* Amount of time (in ms) that we will try to iget an inode. */ + unsigned int iget_timeout; + + /* Wait this many ms to retry an iget. */ + unsigned int iget_retry_delay; + + /* + * The scan grabs batches of inodes and stashes them here before + * handing them out with _iter. Unallocated inodes are set in the + * mask so that all updates to that inode are selected for live + * update propagation. + */ + xfs_ino_t __batch_ino; + xfs_inofree_t __skipped_inomask; + struct xfs_inode *__inodes[XFS_INODES_PER_CHUNK]; +}; + +/* Set if the scan has been aborted due to some event in the fs. */ +#define XCHK_ISCAN_OPSTATE_ABORTED (1) + +/* Use trylock to acquire the AGI */ +#define XCHK_ISCAN_OPSTATE_TRYLOCK_AGI (2) + +static inline bool +xchk_iscan_aborted(const struct xchk_iscan *iscan) +{ + return test_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); +} + +static inline void +xchk_iscan_abort(struct xchk_iscan *iscan) +{ + set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); +} + +static inline bool +xchk_iscan_agi_needs_trylock(const struct xchk_iscan *iscan) +{ + return test_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate); +} + +static inline void +xchk_iscan_set_agi_trylock(struct xchk_iscan *iscan) +{ + set_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate); +} + +void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout, + unsigned int iget_retry_delay, struct xchk_iscan *iscan); +void xchk_iscan_finish_early(struct xchk_iscan *iscan); +void xchk_iscan_teardown(struct xchk_iscan *iscan); + +int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp); +void xchk_iscan_iter_finish(struct xchk_iscan *iscan); + +void xchk_iscan_mark_visited(struct xchk_iscan *iscan, struct xfs_inode *ip); +bool xchk_iscan_want_live_update(struct xchk_iscan *iscan, xfs_ino_t ino); + +#endif /* __XFS_SCRUB_ISCAN_H__ */ diff --git a/fs/xfs/scrub/listxattr.c b/fs/xfs/scrub/listxattr.c new file mode 100644 index 000000000000..256ff7700c94 --- /dev/null +++ b/fs/xfs/scrub/listxattr.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_sf.h" +#include "xfs_trans.h" +#include "scrub/scrub.h" +#include "scrub/bitmap.h" +#include "scrub/dab_bitmap.h" +#include "scrub/listxattr.h" + +/* Call a function for every entry in a shortform xattr structure. */ +STATIC int +xchk_xattr_walk_sf( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + void *priv) +{ + struct xfs_attr_sf_hdr *hdr = ip->i_af.if_data; + struct xfs_attr_sf_entry *sfe; + unsigned int i; + int error; + + sfe = xfs_attr_sf_firstentry(hdr); + for (i = 0; i < hdr->count; i++) { + error = attr_fn(sc, ip, sfe->flags, sfe->nameval, sfe->namelen, + &sfe->nameval[sfe->namelen], sfe->valuelen, + priv); + if (error) + return error; + + sfe = xfs_attr_sf_nextentry(sfe); + } + + return 0; +} + +/* Call a function for every entry in this xattr leaf block. */ +STATIC int +xchk_xattr_walk_leaf_entries( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + struct xfs_buf *bp, + void *priv) +{ + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_mount *mp = sc->mp; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *entry; + unsigned int i; + int error; + + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); + + for (i = 0; i < ichdr.count; entry++, i++) { + void *value; + unsigned char *name; + unsigned int namelen, valuelen; + + if (entry->flags & XFS_ATTR_LOCAL) { + struct xfs_attr_leaf_name_local *name_loc; + + name_loc = xfs_attr3_leaf_name_local(leaf, i); + name = name_loc->nameval; + namelen = name_loc->namelen; + value = &name_loc->nameval[name_loc->namelen]; + valuelen = be16_to_cpu(name_loc->valuelen); + } else { + struct xfs_attr_leaf_name_remote *name_rmt; + + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + name = name_rmt->name; + namelen = name_rmt->namelen; + value = NULL; + valuelen = be32_to_cpu(name_rmt->valuelen); + } + + error = attr_fn(sc, ip, entry->flags, name, namelen, value, + valuelen, priv); + if (error) + return error; + + } + + return 0; +} + +/* + * Call a function for every entry in a leaf-format xattr structure. Avoid + * memory allocations for the loop detector since there's only one block. + */ +STATIC int +xchk_xattr_walk_leaf( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + void *priv) +{ + struct xfs_buf *leaf_bp; + int error; + + error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino, 0, &leaf_bp); + if (error) + return error; + + error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp, priv); + xfs_trans_brelse(sc->tp, leaf_bp); + return error; +} + +/* Find the leftmost leaf in the xattr dabtree. */ +STATIC int +xchk_xattr_find_leftmost_leaf( + struct xfs_scrub *sc, + struct xfs_inode *ip, + struct xdab_bitmap *seen_dablks, + struct xfs_buf **leaf_bpp) +{ + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_buf *bp; + xfs_failaddr_t fa; + xfs_dablk_t blkno = 0; + unsigned int expected_level = 0; + int error; + + for (;;) { + xfs_extlen_t len = 1; + uint16_t magic; + + /* Make sure we haven't seen this new block already. */ + if (xdab_bitmap_test(seen_dablks, blkno, &len)) + return -EFSCORRUPTED; + + error = xfs_da3_node_read(tp, ip, blkno, &bp, XFS_ATTR_FORK); + if (error) + return error; + + node = bp->b_addr; + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) + break; + + error = -EFSCORRUPTED; + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) + goto out_buf; + + fa = xfs_da3_node_header_check(bp, ip->i_ino); + if (fa) + goto out_buf; + + xfs_da3_node_hdr_from_disk(mp, &nodehdr, node); + + if (nodehdr.count == 0 || nodehdr.level >= XFS_DA_NODE_MAXDEPTH) + goto out_buf; + + /* Check the level from the root node. */ + if (blkno == 0) + expected_level = nodehdr.level - 1; + else if (expected_level != nodehdr.level) + goto out_buf; + else + expected_level--; + + /* Remember that we've seen this node. */ + error = xdab_bitmap_set(seen_dablks, blkno, 1); + if (error) + goto out_buf; + + /* Find the next level towards the leaves of the dabtree. */ + btree = nodehdr.btree; + blkno = be32_to_cpu(btree->before); + xfs_trans_brelse(tp, bp); + } + + error = -EFSCORRUPTED; + fa = xfs_attr3_leaf_header_check(bp, ip->i_ino); + if (fa) + goto out_buf; + + if (expected_level != 0) + goto out_buf; + + /* Remember that we've seen this leaf. */ + error = xdab_bitmap_set(seen_dablks, blkno, 1); + if (error) + goto out_buf; + + *leaf_bpp = bp; + return 0; + +out_buf: + xfs_trans_brelse(tp, bp); + return error; +} + +/* Call a function for every entry in a node-format xattr structure. */ +STATIC int +xchk_xattr_walk_node( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + xchk_xattrleaf_fn leaf_fn, + void *priv) +{ + struct xfs_attr3_icleaf_hdr leafhdr; + struct xdab_bitmap seen_dablks; + struct xfs_mount *mp = sc->mp; + struct xfs_attr_leafblock *leaf; + struct xfs_buf *leaf_bp; + int error; + + xdab_bitmap_init(&seen_dablks); + + error = xchk_xattr_find_leftmost_leaf(sc, ip, &seen_dablks, &leaf_bp); + if (error) + goto out_bitmap; + + for (;;) { + xfs_extlen_t len; + + error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp, + priv); + if (error) + goto out_leaf; + + /* Find the right sibling of this leaf block. */ + leaf = leaf_bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + if (leafhdr.forw == 0) + goto out_leaf; + + xfs_trans_brelse(sc->tp, leaf_bp); + + if (leaf_fn) { + error = leaf_fn(sc, priv); + if (error) + goto out_bitmap; + } + + /* Make sure we haven't seen this new leaf already. */ + len = 1; + if (xdab_bitmap_test(&seen_dablks, leafhdr.forw, &len)) { + error = -EFSCORRUPTED; + goto out_bitmap; + } + + error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino, + leafhdr.forw, &leaf_bp); + if (error) + goto out_bitmap; + + /* Remember that we've seen this new leaf. */ + error = xdab_bitmap_set(&seen_dablks, leafhdr.forw, 1); + if (error) + goto out_leaf; + } + +out_leaf: + xfs_trans_brelse(sc->tp, leaf_bp); +out_bitmap: + xdab_bitmap_destroy(&seen_dablks); + return error; +} + +/* + * Call a function for every extended attribute in a file. + * + * Callers must hold the ILOCK. No validation or cursor restarts allowed. + * Returns -EFSCORRUPTED on any problem, including loops in the dabtree. + */ +int +xchk_xattr_walk( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + xchk_xattrleaf_fn leaf_fn, + void *priv) +{ + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + if (!xfs_inode_hasattr(ip)) + return 0; + + if (ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) + return xchk_xattr_walk_sf(sc, ip, attr_fn, priv); + + /* attr functions require that the attr fork is loaded */ + error = xfs_iread_extents(sc->tp, ip, XFS_ATTR_FORK); + if (error) + return error; + + if (xfs_attr_is_leaf(ip)) + return xchk_xattr_walk_leaf(sc, ip, attr_fn, priv); + + return xchk_xattr_walk_node(sc, ip, attr_fn, leaf_fn, priv); +} diff --git a/fs/xfs/scrub/listxattr.h b/fs/xfs/scrub/listxattr.h new file mode 100644 index 000000000000..703cfb7b14cf --- /dev/null +++ b/fs/xfs/scrub/listxattr.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_LISTXATTR_H__ +#define __XFS_SCRUB_LISTXATTR_H__ + +typedef int (*xchk_xattr_fn)(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int attr_flags, const unsigned char *name, + unsigned int namelen, const void *value, unsigned int valuelen, + void *priv); + +typedef int (*xchk_xattrleaf_fn)(struct xfs_scrub *sc, void *priv); + +int xchk_xattr_walk(struct xfs_scrub *sc, struct xfs_inode *ip, + xchk_xattr_fn attr_fn, xchk_xattrleaf_fn leaf_fn, void *priv); + +#endif /* __XFS_SCRUB_LISTXATTR_H__ */ diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c new file mode 100644 index 000000000000..2aa14b7ab630 --- /dev/null +++ b/fs/xfs/scrub/newbt.c @@ -0,0 +1,568 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" +#include "xfs_defer.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/newbt.h" + +/* + * Estimate proper slack values for a btree that's being reloaded. + * + * Under most circumstances, we'll take whatever default loading value the + * btree bulk loading code calculates for us. However, there are some + * exceptions to this rule: + * + * (0) If someone turned one of the debug knobs. + * (1) If this is a per-AG btree and the AG has less than 10% space free. + * (2) If this is an inode btree and the FS has less than 10% space free. + + * In either case, format the new btree blocks almost completely full to + * minimize space usage. + */ +static void +xrep_newbt_estimate_slack( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_btree_bload *bload = &xnr->bload; + uint64_t free; + uint64_t sz; + + /* + * The xfs_globals values are set to -1 (i.e. take the bload defaults) + * unless someone has set them otherwise, so we just pull the values + * here. + */ + bload->leaf_slack = xfs_globals.bload_leaf_slack; + bload->node_slack = xfs_globals.bload_node_slack; + + if (sc->ops->type == ST_PERAG) { + free = sc->sa.pag->pagf_freeblks; + sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno); + } else { + free = percpu_counter_sum(&sc->mp->m_fdblocks); + sz = sc->mp->m_sb.sb_dblocks; + } + + /* No further changes if there's more than 10% free space left. */ + if (free >= div_u64(sz, 10)) + return; + + /* + * We're low on space; load the btrees as tightly as possible. Leave + * a couple of open slots in each btree block so that we don't end up + * splitting the btrees like crazy after a mount. + */ + if (bload->leaf_slack < 0) + bload->leaf_slack = 2; + if (bload->node_slack < 0) + bload->node_slack = 2; +} + +/* Initialize accounting resources for staging a new AG btree. */ +void +xrep_newbt_init_ag( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, + xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv) +{ + memset(xnr, 0, sizeof(struct xrep_newbt)); + xnr->sc = sc; + xnr->oinfo = *oinfo; /* structure copy */ + xnr->alloc_hint = alloc_hint; + xnr->resv = resv; + INIT_LIST_HEAD(&xnr->resv_list); + xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */ + xrep_newbt_estimate_slack(xnr); +} + +/* Initialize accounting resources for staging a new inode fork btree. */ +int +xrep_newbt_init_inode( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + int whichfork, + const struct xfs_owner_info *oinfo) +{ + struct xfs_ifork *ifp; + + ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); + if (!ifp) + return -ENOMEM; + + xrep_newbt_init_ag(xnr, sc, oinfo, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), + XFS_AG_RESV_NONE); + xnr->ifake.if_fork = ifp; + xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork); + return 0; +} + +/* + * Initialize accounting resources for staging a new btree. Callers are + * expected to add their own reservations (and clean them up) manually. + */ +void +xrep_newbt_init_bare( + struct xrep_newbt *xnr, + struct xfs_scrub *sc) +{ + xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK, + XFS_AG_RESV_NONE); +} + +/* + * Designate specific blocks to be used to build our new btree. @pag must be + * a passive reference. + */ +STATIC int +xrep_newbt_add_blocks( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + const struct xfs_alloc_arg *args) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xrep_newbt_resv *resv; + int error; + + resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS); + if (!resv) + return -ENOMEM; + + INIT_LIST_HEAD(&resv->list); + resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + resv->len = args->len; + resv->used = 0; + resv->pag = xfs_perag_hold(pag); + + if (args->tp) { + ASSERT(xnr->oinfo.oi_offset == 0); + + error = xfs_alloc_schedule_autoreap(args, + XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap); + if (error) + goto out_pag; + } + + list_add_tail(&resv->list, &xnr->resv_list); + return 0; +out_pag: + xfs_perag_put(resv->pag); + kfree(resv); + return error; +} + +/* + * Add an extent to the new btree reservation pool. Callers are required to + * reap this reservation manually if the repair is cancelled. @pag must be a + * passive reference. + */ +int +xrep_newbt_add_extent( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xfs_alloc_arg args = { + .tp = NULL, /* no autoreap */ + .oinfo = xnr->oinfo, + .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno), + .len = len, + .resv = xnr->resv, + }; + + return xrep_newbt_add_blocks(xnr, pag, &args); +} + +/* Don't let our allocation hint take us beyond this AG */ +static inline void +xrep_newbt_validate_ag_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); + + if (agno == sc->sa.pag->pag_agno && + xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for a new per-AG btree. */ +STATIC int +xrep_newbt_alloc_ag_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + ASSERT(sc->sa.pag != NULL); + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + xfs_agnumber_t agno; + + xrep_newbt_validate_ag_alloc_hint(xnr); + + if (xnr->alloc_vextent) + error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint); + else + error = xfs_alloc_vextent_near_bno(&args, + xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_ag_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + if (agno != sc->sa.pag->pag_agno) { + ASSERT(agno == sc->sa.pag->pag_agno); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Don't let our allocation hint take us beyond EOFS */ +static inline void +xrep_newbt_validate_file_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + + if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for our new file-based btree. */ +STATIC int +xrep_newbt_alloc_file_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + struct xfs_perag *pag; + xfs_agnumber_t agno; + + xrep_newbt_validate_file_alloc_hint(xnr); + + if (xnr->alloc_vextent) + error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint); + else + error = xfs_alloc_vextent_start_ag(&args, + xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_file_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + pag = xfs_perag_get(mp, agno); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, pag, &args); + xfs_perag_put(pag); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Allocate disk space for our new btree. */ +int +xrep_newbt_alloc_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + if (xnr->sc->ip) + return xrep_newbt_alloc_file_blocks(xnr, nr_blocks); + return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks); +} + +/* + * Free the unused part of a space extent that was reserved for a new ondisk + * structure. Returns the number of EFIs logged or a negative errno. + */ +STATIC int +xrep_newbt_free_extent( + struct xrep_newbt *xnr, + struct xrep_newbt_resv *resv, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agblock_t free_agbno = resv->agbno; + xfs_extlen_t free_aglen = resv->len; + xfs_fsblock_t fsbno; + int error; + + if (!btree_committed || resv->used == 0) { + /* + * If we're not committing a new btree or we didn't use the + * space reservation, let the existing EFI free the entire + * space extent. + */ + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, + free_agbno, free_aglen, xnr->oinfo.oi_owner); + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); + return 1; + } + + /* + * We used space and committed the btree. Cancel the autoreap, remove + * the written blocks from the reservation, and possibly log a new EFI + * to free any unused reservation space. + */ + xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap); + free_agbno += resv->used; + free_aglen -= resv->used; + + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, xnr->oinfo.oi_owner); + + ASSERT(xnr->resv != XFS_AG_RESV_AGFL); + ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); + + /* + * Use EFIs to free the reservations. This reduces the chance + * that we leak blocks if the system goes down. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); + error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, + xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD); + if (error) + return error; + + return 1; +} + +/* Free all the accounting info and disk space we reserved for a new btree. */ +STATIC int +xrep_newbt_free( + struct xrep_newbt *xnr, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + struct xrep_newbt_resv *resv, *n; + unsigned int freed = 0; + int error = 0; + + /* + * If the filesystem already went down, we can't free the blocks. Skip + * ahead to freeing the incore metadata because we can't fix anything. + */ + if (xfs_is_shutdown(sc->mp)) + goto junkit; + + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + int ret; + + ret = xrep_newbt_free_extent(xnr, resv, btree_committed); + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + if (ret < 0) { + error = ret; + goto junkit; + } + + freed += ret; + if (freed >= XREP_MAX_ITRUNCATE_EFIS) { + error = xrep_defer_finish(sc); + if (error) + goto junkit; + freed = 0; + } + } + + if (freed) + error = xrep_defer_finish(sc); + +junkit: + /* + * If we still have reservations attached to @newbt, cleanup must have + * failed and the filesystem is about to go down. Clean up the incore + * reservations and try to commit to freeing the space we used. + */ + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + } + + if (sc->ip) { + kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork); + xnr->ifake.if_fork = NULL; + } + + return error; +} + +/* + * Free all the accounting info and unused disk space allocations after + * committing a new btree. + */ +int +xrep_newbt_commit( + struct xrep_newbt *xnr) +{ + return xrep_newbt_free(xnr, true); +} + +/* + * Free all the accounting info and all of the disk space we reserved for a new + * btree that we're not going to commit. We want to try to roll things back + * cleanly for things like ENOSPC midway through allocation. + */ +void +xrep_newbt_cancel( + struct xrep_newbt *xnr) +{ + xrep_newbt_free(xnr, false); +} + +/* Feed one of the reserved btree blocks to the bulk loader. */ +int +xrep_newbt_claim_block( + struct xfs_btree_cur *cur, + struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr) +{ + struct xrep_newbt_resv *resv; + struct xfs_mount *mp = cur->bc_mp; + xfs_agblock_t agbno; + + /* + * The first item in the list should always have a free block unless + * we're completely out. + */ + resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list); + if (resv->used == resv->len) + return -ENOSPC; + + /* + * Peel off a block from the start of the reservation. We allocate + * blocks in order to place blocks on disk in increasing record or key + * order. The block reservations tend to end up on the list in + * decreasing order, which hopefully results in leaf blocks ending up + * together. + */ + agbno = resv->agbno + resv->used; + resv->used++; + + /* If we used all the blocks in this reservation, move it to the end. */ + if (resv->used == resv->len) + list_move_tail(&resv->list, &xnr->resv_list); + + trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, + xnr->oinfo.oi_owner); + + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) + ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, + agbno)); + else + ptr->s = cpu_to_be32(agbno); + + /* Relog all the EFIs. */ + return xrep_defer_finish(xnr->sc); +} + +/* How many reserved blocks are unused? */ +unsigned int +xrep_newbt_unused_blocks( + struct xrep_newbt *xnr) +{ + struct xrep_newbt_resv *resv; + unsigned int unused = 0; + + list_for_each_entry(resv, &xnr->resv_list, list) + unused += resv->len - resv->used; + return unused; +} diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h new file mode 100644 index 000000000000..3d804d31af24 --- /dev/null +++ b/fs/xfs/scrub/newbt.h @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_NEWBT_H__ +#define __XFS_SCRUB_NEWBT_H__ + +struct xfs_alloc_arg; + +struct xrep_newbt_resv { + /* Link to list of extents that we've reserved. */ + struct list_head list; + + struct xfs_perag *pag; + + /* Auto-freeing this reservation if we don't commit. */ + struct xfs_alloc_autoreap autoreap; + + /* AG block of the extent we reserved. */ + xfs_agblock_t agbno; + + /* Length of the reservation. */ + xfs_extlen_t len; + + /* How much of this reservation has been used. */ + xfs_extlen_t used; +}; + +struct xrep_newbt { + struct xfs_scrub *sc; + + /* Custom allocation function, or NULL for xfs_alloc_vextent */ + int (*alloc_vextent)(struct xfs_scrub *sc, + struct xfs_alloc_arg *args, + xfs_fsblock_t alloc_hint); + + /* List of extents that we've reserved. */ + struct list_head resv_list; + + /* Fake root for new btree. */ + union { + struct xbtree_afakeroot afake; + struct xbtree_ifakeroot ifake; + }; + + /* rmap owner of these blocks */ + struct xfs_owner_info oinfo; + + /* btree geometry for the bulk loader */ + struct xfs_btree_bload bload; + + /* Allocation hint */ + xfs_fsblock_t alloc_hint; + + /* per-ag reservation type */ + enum xfs_ag_resv_type resv; +}; + +void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc); +void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv); +int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc, + int whichfork, const struct xfs_owner_info *oinfo); +int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks); +int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len); +void xrep_newbt_cancel(struct xrep_newbt *xnr); +int xrep_newbt_commit(struct xrep_newbt *xnr); +int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr); +unsigned int xrep_newbt_unused_blocks(struct xrep_newbt *xnr); + +#endif /* __XFS_SCRUB_NEWBT_H__ */ diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c new file mode 100644 index 000000000000..80aee30886c4 --- /dev/null +++ b/fs/xfs/scrub/nlinks.c @@ -0,0 +1,1049 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ag.h" +#include "xfs_parent.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/orphanage.h" +#include "scrub/nlinks.h" +#include "scrub/trace.h" +#include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/listxattr.h" + +/* + * Live Inode Link Count Checking + * ============================== + * + * Inode link counts are "summary" metadata, in the sense that they are + * computed as the number of directory entries referencing each file on the + * filesystem. Therefore, we compute the correct link counts by creating a + * shadow link count structure and walking every inode. + */ + +/* Set us up to scrub inode link counts. */ +int +xchk_setup_nlinks( + struct xfs_scrub *sc) +{ + struct xchk_nlink_ctrs *xnc; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + if (xchk_could_repair(sc)) { + error = xrep_setup_nlinks(sc); + if (error) + return error; + } + + xnc = kvzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS); + if (!xnc) + return -ENOMEM; + xnc->xname.name = xnc->namebuf; + xnc->sc = sc; + sc->buf = xnc; + + return xchk_setup_fs(sc); +} + +/* + * Part 1: Collecting file link counts. For each file, we create a shadow link + * counting structure, then walk the entire directory tree, incrementing parent + * and child link counts for each directory entry seen. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information. + * + * Because we are scanning a live filesystem, it's possible that another thread + * will try to update the link counts for an inode that we've already scanned. + * This will cause our counts to be incorrect. Therefore, we hook all + * directory entry updates because that is when link count updates occur. By + * shadowing transaction updates in this manner, live nlink check can ensure by + * locking the inode and the shadow structure that its own copies are not out + * of date. Because the hook code runs in a different process context from the + * scrub code and the scrub state flags are not accessed atomically, failures + * in the hook code must abort the iscan and the scrubber must notice the + * aborted scan and set the incomplete flag. + * + * Note that we use jump labels and srcu notifier hooks to minimize the + * overhead when live nlinks is /not/ running. Locking order for nlink + * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock. + */ + +/* + * Add a delta to an nlink counter, clamping the value to U32_MAX. Because + * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results + * even if we lose some precision. + */ +static inline void +careful_add( + xfs_nlink_t *nlinkp, + int delta) +{ + uint64_t new_value = (uint64_t)(*nlinkp) + delta; + + BUILD_BUG_ON(XFS_MAXLINK > U32_MAX); + *nlinkp = min_t(uint64_t, new_value, U32_MAX); +} + +/* Update incore link count information. Caller must hold the nlinks lock. */ +STATIC int +xchk_nlinks_update_incore( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino, + int parents_delta, + int backrefs_delta, + int children_delta) +{ + struct xchk_nlink nl; + int error; + + if (!xnc->nlinks) + return 0; + + error = xfarray_load_sparse(xnc->nlinks, ino, &nl); + if (error) + return error; + + trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta, + backrefs_delta, children_delta); + + careful_add(&nl.parents, parents_delta); + careful_add(&nl.backrefs, backrefs_delta); + careful_add(&nl.children, children_delta); + + nl.flags |= XCHK_NLINK_WRITTEN; + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. + */ + error = -ECANCELED; + } + return error; +} + +/* + * Apply a link count change from the regular filesystem into our shadow link + * count structure based on a directory update in progress. + */ +STATIC int +xchk_nlinks_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xchk_nlink_ctrs *xnc; + int error; + + xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb); + + /* + * Ignore temporary directories being used to stage dir repairs, since + * we don't bump the link counts of the children. + */ + if (xrep_is_tempfile(p->dp)) + return NOTIFY_DONE; + + trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino, + p->delta, p->name->name, p->name->len); + + /* + * If we've already scanned @dp, update the number of parents that link + * to @ip. If @ip is a subdirectory, update the number of child links + * going out of @dp. + */ + if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) { + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta, + 0, 0); + if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode)) + error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0, + 0, p->delta); + mutex_unlock(&xnc->lock); + if (error) + goto out_abort; + } + + /* + * If @ip is a subdirectory and we've already scanned it, update the + * number of backrefs pointing to @dp. + */ + if (S_ISDIR(VFS_IC(p->ip)->i_mode) && + xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) { + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0, + p->delta, 0); + mutex_unlock(&xnc->lock); + if (error) + goto out_abort; + } + + return NOTIFY_DONE; + +out_abort: + xchk_iscan_abort(&xnc->collect_iscan); + return NOTIFY_DONE; +} + +/* Bump the observed link count for the inode referenced by this entry. */ +STATIC int +xchk_nlinks_collect_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xchk_nlink_ctrs *xnc = priv; + bool dot = false, dotdot = false; + int error; + + /* Does this name make sense? */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) { + error = -ECANCELED; + goto out_abort; + } + + if (name->len == 1 && name->name[0] == '.') + dot = true; + else if (name->len == 2 && name->name[0] == '.' && + name->name[1] == '.') + dotdot = true; + + /* Don't accept a '.' entry that points somewhere else. */ + if (dot && ino != dp->i_ino) { + error = -ECANCELED; + goto out_abort; + } + + /* Don't accept an invalid inode number. */ + if (!xfs_verify_dir_ino(sc->mp, ino)) { + error = -ECANCELED; + goto out_abort; + } + + /* Update the shadow link counts if we haven't already failed. */ + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + error = -ECANCELED; + goto out_incomplete; + } + + trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name); + + mutex_lock(&xnc->lock); + + /* + * If this is a dotdot entry, it is a back link from dp to ino. How + * we handle this depends on whether or not dp is the root directory. + * + * The root directory is its own parent, so we pretend the dotdot entry + * establishes the "parent" of the root directory. Increment the + * number of parents of the root directory. + * + * Otherwise, increment the number of backrefs pointing back to ino. + * + * If the filesystem has parent pointers, we walk the pptrs to + * determine the backref count. + */ + if (dotdot) { + if (dp == sc->mp->m_rootip) + error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); + else if (!xfs_has_parent(sc->mp)) + error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0); + else + error = 0; + if (error) + goto out_unlock; + } + + /* + * If this dirent is a forward link from dp to ino, increment the + * number of parents linking into ino. + */ + if (!dot && !dotdot) { + error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); + if (error) + goto out_unlock; + } + + /* + * If this dirent is a forward link to a subdirectory, increment the + * number of child links of dp. + */ + if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) { + error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1); + if (error) + goto out_unlock; + } + + mutex_unlock(&xnc->lock); + return 0; + +out_unlock: + mutex_unlock(&xnc->lock); +out_abort: + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(sc); + return error; +} + +/* Bump the backref count for the inode referenced by this parent pointer. */ +STATIC int +xchk_nlinks_collect_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + struct xchk_nlink_ctrs *xnc = priv; + const struct xfs_parent_rec *pptr_rec = value; + xfs_ino_t parent_ino; + int error; + + /* Update the shadow link counts if we haven't already failed. */ + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + error = -ECANCELED; + goto out_incomplete; + } + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) + return error; + + trace_xchk_nlinks_collect_pptr(sc->mp, ip, &xname, pptr_rec); + + mutex_lock(&xnc->lock); + + error = xchk_nlinks_update_incore(xnc, parent_ino, 0, 1, 0); + if (error) + goto out_unlock; + + mutex_unlock(&xnc->lock); + return 0; + +out_unlock: + mutex_unlock(&xnc->lock); + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(sc); + return error; +} + +/* Walk a directory to bump the observed link counts of the children. */ +STATIC int +xchk_nlinks_collect_dir( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = xnc->sc; + unsigned int lock_mode; + int error = 0; + + /* + * Ignore temporary directories being used to stage dir repairs, since + * we don't bump the link counts of the children. + */ + if (xrep_is_tempfile(dp)) + return 0; + + /* Prevent anyone from changing this directory while we walk it. */ + xfs_ilock(dp, XFS_IOLOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(dp); + + /* + * The dotdot entry of an unlinked directory still points to the last + * parent, but the parent no longer links to this directory. Skip the + * directory to avoid overcounting. + */ + if (VFS_I(dp)->i_nlink == 0) + goto out_unlock; + + /* + * We cannot count file links if the directory looks as though it has + * been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + goto out_abort; + } + + error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc); + if (error == -ECANCELED) { + error = 0; + goto out_unlock; + } + if (error) + goto out_abort; + + /* Walk the parent pointers to get real backref counts. */ + if (xfs_has_parent(sc->mp)) { + /* + * If the extended attributes look as though they has been + * zapped by the inode record repair code, we cannot scan for + * parent pointers. + */ + if (xchk_pptr_looks_zapped(dp)) { + error = -EBUSY; + goto out_unlock; + } + + error = xchk_xattr_walk(sc, dp, xchk_nlinks_collect_pptr, NULL, + xnc); + if (error == -ECANCELED) { + error = 0; + goto out_unlock; + } + if (error) + goto out_abort; + } + + xchk_iscan_mark_visited(&xnc->collect_iscan, dp); + goto out_unlock; + +out_abort: + xchk_set_incomplete(sc); + xchk_iscan_abort(&xnc->collect_iscan); +out_unlock: + xfs_iunlock(dp, lock_mode); + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return error; +} + +/* If this looks like a valid pointer, count it. */ +static inline int +xchk_nlinks_collect_metafile( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + if (!xfs_verify_ino(xnc->sc->mp, ino)) + return 0; + + trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino); + return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); +} + +/* Bump the link counts of metadata files rooted in the superblock. */ +STATIC int +xchk_nlinks_collect_metafiles( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_mount *mp = xnc->sc->mp; + int error = -ECANCELED; + + + if (xchk_iscan_aborted(&xnc->collect_iscan)) + goto out_incomplete; + + mutex_lock(&xnc->lock); + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino); + if (error) + goto out_abort; + mutex_unlock(&xnc->lock); + + return 0; + +out_abort: + mutex_unlock(&xnc->lock); + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(xnc->sc); + return error; +} + +/* Advance the collection scan cursor for this non-directory file. */ +static inline int +xchk_nlinks_collect_file( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + xfs_ilock(ip, XFS_IOLOCK_SHARED); + xchk_iscan_mark_visited(&xnc->collect_iscan, ip); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; +} + +/* Walk all directories and count inode links. */ +STATIC int +xchk_nlinks_collect( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_scrub *sc = xnc->sc; + struct xfs_inode *ip; + int error; + + /* Count the rt and quota files that are rooted in the superblock. */ + error = xchk_nlinks_collect_metafiles(xnc); + if (error) + return error; + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Cancel the transaction to release the log grant space while we scan + * the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. + * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done even though + * we take the IOLOCK to quiesce the file because empty transactions + * do not take sb_internal. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) { + if (S_ISDIR(VFS_I(ip)->i_mode)) + error = xchk_nlinks_collect_dir(xnc, ip); + else + error = xchk_nlinks_collect_file(xnc, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&xnc->collect_iscan); + if (error) { + xchk_set_incomplete(sc); + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Switch out for a real transaction in preparation for building a new + * tree. + */ + xchk_trans_cancel(sc); + return xchk_setup_fs(sc); +} + +/* + * Part 2: Comparing file link counters. Walk each inode and compare the link + * counts against our shadow information; and then walk each shadow link count + * structure (that wasn't covered in the first part), comparing it against the + * file. + */ + +/* Read the observed link count for comparison with the actual inode. */ +STATIC int +xchk_nlinks_comparison_read( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino, + struct xchk_nlink *obs) +{ + struct xchk_nlink nl; + int error; + + error = xfarray_load_sparse(xnc->nlinks, ino, &nl); + if (error) + return error; + + nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN); + + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. This + * shouldn't really happen outside of the collection phase. + */ + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + if (error) + return error; + + /* Copy the counters, but do not expose the internal state. */ + obs->parents = nl.parents; + obs->backrefs = nl.backrefs; + obs->children = nl.children; + obs->flags = 0; + return 0; +} + +/* Check our link count against an inode. */ +STATIC int +xchk_nlinks_compare_inode( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + struct xchk_nlink obs; + struct xfs_scrub *sc = xnc->sc; + uint64_t total_links; + unsigned int actual_nlink; + int error; + + /* + * Ignore temporary files being used to stage repairs, since we assume + * they're correct for non-directories, and the directory repair code + * doesn't bump the link counts for the children. + */ + if (xrep_is_tempfile(ip)) + return 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + mutex_lock(&xnc->lock); + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_scanlock; + } + + error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs); + if (error) + goto out_scanlock; + + /* + * If we don't have ftype to get an accurate count of the subdirectory + * entries in this directory, take advantage of the fact that on a + * consistent ftype=0 filesystem, the number of subdirectory + * backreferences (dotdot entries) pointing towards this directory + * should be equal to the number of subdirectory entries in the + * directory. + */ + if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode)) + obs.children = obs.backrefs; + + total_links = xchk_nlink_total(ip, &obs); + actual_nlink = VFS_I(ip)->i_nlink; + + trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs); + + /* + * If we found so many parents that we'd overflow i_nlink, we must flag + * this as a corruption. The VFS won't let users increase the link + * count, but it will let them decrease it. + */ + if (total_links > XFS_NLINK_PINNED) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } else if (total_links > XFS_MAXLINK) { + xchk_ino_set_warning(sc, ip->i_ino); + } + + /* Link counts should match. */ + if (total_links != actual_nlink) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) { + /* + * The collection phase ignores directories with zero link + * count, so we ignore them here too. + * + * The number of subdirectory backreferences (dotdot entries) + * pointing towards this directory should be equal to the + * number of subdirectory entries in the directory. + */ + if (obs.children != obs.backrefs) + xchk_ino_xref_set_corrupt(sc, ip->i_ino); + } else { + /* + * Non-directories and unlinked directories should not have + * back references. + */ + if (obs.backrefs != 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + /* + * Non-directories and unlinked directories should not have + * children. + */ + if (obs.children != 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } + + if (ip == sc->mp->m_rootip) { + /* + * For the root of a directory tree, both the '.' and '..' + * entries should point to the root directory. The dotdot + * entry is counted as a parent of the root /and/ a backref of + * the root directory. + */ + if (obs.parents != 1) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } else if (actual_nlink > 0) { + /* + * Linked files that are not the root directory should have at + * least one parent. + */ + if (obs.parents == 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } + +out_corrupt: + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + error = -ECANCELED; +out_scanlock: + mutex_unlock(&xnc->lock); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +/* + * Check our link count against an inode that wasn't checked previously. This + * is intended to catch directories with dangling links, though we could be + * racing with inode allocation in other threads. + */ +STATIC int +xchk_nlinks_compare_inum( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + struct xchk_nlink obs; + struct xfs_mount *mp = xnc->sc->mp; + struct xfs_trans *tp = xnc->sc->tp; + struct xfs_buf *agi_bp; + struct xfs_inode *ip; + int error; + + /* + * The first iget failed, so try again with the variant that returns + * either an incore inode or the AGI buffer. If the function returns + * EINVAL/ENOENT, it should have passed us the AGI buffer so that we + * can guarantee that the inode won't be allocated while we check for + * a zero link count in the observed link count data. + */ + error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip); + if (!error) { + /* Actually got an inode, so use the inode compare. */ + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_irele(xnc->sc, ip); + return error; + } + if (error == -ENOENT || error == -EINVAL) { + /* No inode was found. Check for zero link count below. */ + error = 0; + } + if (error) + goto out_agi; + + /* Ensure that we have protected against inode allocation/freeing. */ + if (agi_bp == NULL) { + ASSERT(agi_bp != NULL); + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_agi; + } + + mutex_lock(&xnc->lock); + error = xchk_nlinks_comparison_read(xnc, ino, &obs); + if (error) + goto out_scanlock; + + trace_xchk_nlinks_check_zero(mp, ino, &obs); + + /* + * If we can't grab the inode, the link count had better be zero. We + * still hold the AGI to prevent inode allocation/freeing. + */ + if (xchk_nlink_total(NULL, &obs) != 0) { + xchk_ino_set_corrupt(xnc->sc, ino); + error = -ECANCELED; + } + +out_scanlock: + mutex_unlock(&xnc->lock); +out_agi: + if (agi_bp) + xfs_trans_brelse(tp, agi_bp); + return error; +} + +/* + * Try to visit every inode in the filesystem to compare the link count. Move + * on if we can't grab an inode, since we'll revisit unchecked nlink records in + * the second part. + */ +static int +xchk_nlinks_compare_iter( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode **ipp) +{ + int error; + + do { + error = xchk_iscan_iter(&xnc->compare_iscan, ipp); + } while (error == -EBUSY); + + return error; +} + +/* Compare the link counts we observed against the live information. */ +STATIC int +xchk_nlinks_compare( + struct xchk_nlink_ctrs *xnc) +{ + struct xchk_nlink nl; + struct xfs_scrub *sc = xnc->sc; + struct xfs_inode *ip; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Create a new empty transaction so that we can advance the iscan + * cursor without deadlocking if the inobt has a cycle and push on the + * inactivation workqueue. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + /* + * Use the inobt to walk all allocated inodes to compare the link + * counts. Inodes skipped by _compare_iter will be tried again in the + * next phase of the scan. + */ + xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan); + while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) { + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_iscan_mark_visited(&xnc->compare_iscan, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&xnc->compare_iscan); + xchk_iscan_teardown(&xnc->compare_iscan); + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Walk all the non-null nlink observations that weren't checked in the + * previous step. + */ + mutex_lock(&xnc->lock); + while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) { + xfs_ino_t ino = cur - 1; + + if (nl.flags & XCHK_NLINK_COMPARE_SCANNED) + continue; + + mutex_unlock(&xnc->lock); + + error = xchk_nlinks_compare_inum(xnc, ino); + if (error) + return error; + + if (xchk_should_terminate(xnc->sc, &error)) + return error; + + mutex_lock(&xnc->lock); + } + mutex_unlock(&xnc->lock); + + return error; +} + +/* Tear down everything associated with a nlinks check. */ +static void +xchk_nlinks_teardown_scan( + void *priv) +{ + struct xchk_nlink_ctrs *xnc = priv; + + /* Discourage any hook functions that might be running. */ + xchk_iscan_abort(&xnc->collect_iscan); + + xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook); + + xfarray_destroy(xnc->nlinks); + xnc->nlinks = NULL; + + xchk_iscan_teardown(&xnc->collect_iscan); + mutex_destroy(&xnc->lock); + xnc->sc = NULL; +} + +/* + * Scan all inodes in the entire filesystem to generate link count data. If + * the scan is successful, the counts will be left alive for a repair. If any + * error occurs, we'll tear everything down. + */ +STATIC int +xchk_nlinks_setup_scan( + struct xfs_scrub *sc, + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_mount *mp = sc->mp; + char *descr; + unsigned long long max_inos; + xfs_agnumber_t last_agno = mp->m_sb.sb_agcount - 1; + xfs_agino_t first_agino, last_agino; + int error; + + mutex_init(&xnc->lock); + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan); + + /* + * Set up enough space to store an nlink record for the highest + * possible inode number in this system. + */ + xfs_agino_range(mp, last_agno, &first_agino, &last_agino); + max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1; + descr = xchk_xfile_descr(sc, "file link counts"); + error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos), + sizeof(struct xchk_nlink), &xnc->nlinks); + kfree(descr); + if (error) + goto out_teardown; + + /* + * Hook into the directory entry code so that we can capture updates to + * file link counts. The hook only triggers for inodes that were + * already scanned, and the scanner thread takes each inode's ILOCK, + * which means that any in-progress inode updates will finish before we + * can scan the inode. + */ + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update); + error = xfs_dir_hook_add(mp, &xnc->dhook); + if (error) + goto out_teardown; + + /* Use deferred cleanup to pass the inode link count data to repair. */ + sc->buf_cleanup = xchk_nlinks_teardown_scan; + return 0; + +out_teardown: + xchk_nlinks_teardown_scan(xnc); + return error; +} + +/* Scrub the link count of all inodes on the filesystem. */ +int +xchk_nlinks( + struct xfs_scrub *sc) +{ + struct xchk_nlink_ctrs *xnc = sc->buf; + int error = 0; + + /* Set ourselves up to check link counts on the live filesystem. */ + error = xchk_nlinks_setup_scan(sc, xnc); + if (error) + return error; + + /* Walk all inodes, picking up link count information. */ + error = xchk_nlinks_collect(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Fail fast if we're not playing with a full dataset. */ + if (xchk_iscan_aborted(&xnc->collect_iscan)) + xchk_set_incomplete(sc); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + + /* Compare link counts. */ + error = xchk_nlinks_compare(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Check one last time for an incomplete dataset. */ + if (xchk_iscan_aborted(&xnc->collect_iscan)) + xchk_set_incomplete(sc); + + return 0; +} diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h new file mode 100644 index 000000000000..b820712bfd87 --- /dev/null +++ b/fs/xfs/scrub/nlinks.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_NLINKS_H__ +#define __XFS_SCRUB_NLINKS_H__ + +/* Live link count control structure. */ +struct xchk_nlink_ctrs { + struct xfs_scrub *sc; + + /* Shadow link count data and its mutex. */ + struct xfarray *nlinks; + struct mutex lock; + + /* + * The collection step uses a separate iscan context from the compare + * step because the collection iscan coordinates live updates to the + * observation data while this scanner is running. The compare iscan + * is secondary and can be reinitialized as needed. + */ + struct xchk_iscan collect_iscan; + struct xchk_iscan compare_iscan; + + /* + * Hook into directory updates so that we can receive live updates + * from other writer threads. + */ + struct xfs_dir_hook dhook; + + /* Orphanage reparenting request. */ + struct xrep_adoption adoption; + + /* Directory entry name, plus the trailing null. */ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; +}; + +/* + * In-core link counts for a given inode in the filesystem. + * + * For an empty rootdir, the directory entries and the field to which they are + * accounted are as follows: + * + * Root directory: + * + * . points to self (root.child) + * .. points to self (root.parent) + * f1 points to a child file (f1.parent) + * d1 points to a child dir (d1.parent, root.child) + * + * Subdirectory d1: + * + * . points to self (d1.child) + * .. points to root dir (root.backref) + * f2 points to child file (f2.parent) + * f3 points to root.f1 (f1.parent) + * + * root.nlink == 3 (root.dot, root.dotdot, root.d1) + * d1.nlink == 2 (root.d1, d1.dot) + * f1.nlink == 2 (root.f1, d1.f3) + * f2.nlink == 1 (d1.f2) + */ +struct xchk_nlink { + /* Count of forward links from parent directories to this file. */ + xfs_nlink_t parents; + + /* + * Count of back links to this parent directory from child + * subdirectories. + */ + xfs_nlink_t backrefs; + + /* + * Count of forward links from this directory to all child files and + * the number of dot entries. Should be zero for non-directories. + */ + xfs_nlink_t children; + + /* Record state flags */ + unsigned int flags; +}; + +/* + * This incore link count has been written at least once. We never want to + * store an xchk_nlink that looks uninitialized. + */ +#define XCHK_NLINK_WRITTEN (1U << 0) + +/* Already checked this link count record. */ +#define XCHK_NLINK_COMPARE_SCANNED (1U << 1) + +/* Already made a repair with this link count record. */ +#define XREP_NLINK_DIRTY (1U << 2) + +/* Compute total link count, using large enough variables to detect overflow. */ +static inline uint64_t +xchk_nlink_total(struct xfs_inode *ip, const struct xchk_nlink *live) +{ + uint64_t ret = live->parents; + + /* Add one link count for the dot entry of any linked directory. */ + if (ip && S_ISDIR(VFS_I(ip)->i_mode) && VFS_I(ip)->i_nlink) + ret++; + return ret + live->children; +} + +#endif /* __XFS_SCRUB_NLINKS_H__ */ diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c new file mode 100644 index 000000000000..b3e707f47b7b --- /dev/null +++ b/fs/xfs/scrub/nlinks_repair.c @@ -0,0 +1,353 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_parent.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/orphanage.h" +#include "scrub/nlinks.h" +#include "scrub/trace.h" +#include "scrub/tempfile.h" + +/* + * Live Inode Link Count Repair + * ============================ + * + * Use the live inode link count information that we collected to replace the + * nlink values of the incore inodes. A scrub->repair cycle should have left + * the live data and hooks active, so this is safe so long as we make sure the + * inode is locked. + */ + +/* Set up to repair inode link counts. */ +int +xrep_setup_nlinks( + struct xfs_scrub *sc) +{ + return xrep_orphanage_try_create(sc); +} + +/* + * Inodes that aren't the root directory or the orphanage, have a nonzero link + * count, and no observed parents should be moved to the orphanage. + */ +static inline bool +xrep_nlinks_is_orphaned( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int actual_nlink, + const struct xchk_nlink *obs) +{ + struct xfs_mount *mp = ip->i_mount; + + if (obs->parents != 0) + return false; + if (ip == mp->m_rootip || ip == sc->orphanage) + return false; + return actual_nlink != 0; +} + +/* Remove an inode from the unlinked list. */ +STATIC int +xrep_nlinks_iunlink_remove( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag; + int error; + + pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino)); + error = xfs_iunlink_remove(sc->tp, pag, sc->ip); + xfs_perag_put(pag); + return error; +} + +/* + * Correct the link count of the given inode. Because we have to grab locks + * and resources in a certain order, it's possible that this will be a no-op. + */ +STATIC int +xrep_nlinks_repair_inode( + struct xchk_nlink_ctrs *xnc) +{ + struct xchk_nlink obs; + struct xfs_scrub *sc = xnc->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + uint64_t total_links; + uint64_t actual_nlink; + bool orphanage_available = false; + bool dirty = false; + int error; + + /* + * Ignore temporary files being used to stage repairs, since we assume + * they're correct for non-directories, and the directory repair code + * doesn't bump the link counts for the children. + */ + if (xrep_is_tempfile(ip)) + return 0; + + /* + * If the filesystem has an orphanage attached to the scrub context, + * prepare for a link count repair that could involve @ip being adopted + * by the lost+found. + */ + if (xrep_orphanage_can_adopt(sc)) { + error = xrep_orphanage_iolock_two(sc); + if (error) + return error; + + error = xrep_adoption_trans_alloc(sc, &xnc->adoption); + if (error) { + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + } else { + orphanage_available = true; + } + } + + /* + * Either there is no orphanage or we couldn't allocate resources for + * that kind of update. Let's try again with only the resources we + * need for a simple link count update, since that's much more common. + */ + if (!orphanage_available) { + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, + &sc->tp); + if (error) { + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + return error; + } + + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_trans_ijoin(sc->tp, ip, 0); + } + + mutex_lock(&xnc->lock); + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + error = -ECANCELED; + goto out_scanlock; + } + + error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs); + if (error) + goto out_scanlock; + + /* + * We're done accessing the shared scan data, so we can drop the lock. + * We still hold @ip's ILOCK, so its link count cannot change. + */ + mutex_unlock(&xnc->lock); + + total_links = xchk_nlink_total(ip, &obs); + actual_nlink = VFS_I(ip)->i_nlink; + + /* + * Non-directories cannot have directories pointing up to them. + * + * We previously set error to zero, but set it again because one static + * checker author fears that programmers will fail to maintain this + * invariant and built their tool to flag this as a security risk. A + * different tool author made their bot complain about the redundant + * store. This is a never-ending and stupid battle; both tools missed + * *actual bugs* elsewhere; and I no longer care. + */ + if (!S_ISDIR(VFS_I(ip)->i_mode) && obs.children != 0) { + trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); + error = 0; + goto out_trans; + } + + /* + * Decide if we're going to move this file to the orphanage, and fix + * up the incore link counts if we are. + */ + if (orphanage_available && + xrep_nlinks_is_orphaned(sc, ip, actual_nlink, &obs)) { + /* Figure out what name we're going to use here. */ + error = xrep_adoption_compute_name(&xnc->adoption, &xnc->xname); + if (error) + goto out_trans; + + /* + * Reattach this file to the directory tree by moving it to + * the orphanage per the adoption parameters that we already + * computed. + */ + error = xrep_adoption_move(&xnc->adoption); + if (error) + goto out_trans; + + /* + * Re-read the link counts since the reparenting will have + * updated our scan info. + */ + mutex_lock(&xnc->lock); + error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs); + mutex_unlock(&xnc->lock); + if (error) + goto out_trans; + + total_links = xchk_nlink_total(ip, &obs); + actual_nlink = VFS_I(ip)->i_nlink; + dirty = true; + } + + /* + * If this inode is linked from the directory tree and on the unlinked + * list, remove it from the unlinked list. + */ + if (total_links > 0 && xfs_inode_on_unlinked_list(ip)) { + error = xrep_nlinks_iunlink_remove(sc); + if (error) + goto out_trans; + dirty = true; + } + + /* + * If this inode is not linked from the directory tree yet not on the + * unlinked list, put it on the unlinked list. + */ + if (total_links == 0 && !xfs_inode_on_unlinked_list(ip)) { + error = xfs_iunlink(sc->tp, ip); + if (error) + goto out_trans; + dirty = true; + } + + /* Commit the new link count if it changed. */ + if (total_links != actual_nlink) { + trace_xrep_nlinks_update_inode(mp, ip, &obs); + + set_nlink(VFS_I(ip), min_t(unsigned long long, total_links, + XFS_NLINK_PINNED)); + dirty = true; + } + + if (!dirty) { + error = 0; + goto out_trans; + } + + xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE); + + error = xrep_trans_commit(sc); + goto out_unlock; + +out_scanlock: + mutex_unlock(&xnc->lock); +out_trans: + xchk_trans_cancel(sc); +out_unlock: + xchk_iunlock(sc, XFS_ILOCK_EXCL); + if (orphanage_available) { + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + } + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + return error; +} + +/* + * Try to visit every inode in the filesystem for repairs. Move on if we can't + * grab an inode, since we're still making forward progress. + */ +static int +xrep_nlinks_iter( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode **ipp) +{ + int error; + + do { + error = xchk_iscan_iter(&xnc->compare_iscan, ipp); + } while (error == -EBUSY); + + return error; +} + +/* Commit the new inode link counters. */ +int +xrep_nlinks( + struct xfs_scrub *sc) +{ + struct xchk_nlink_ctrs *xnc = sc->buf; + int error; + + /* + * We need ftype for an accurate count of the number of child + * subdirectory links. Child subdirectories with a back link (dotdot + * entry) but no forward link are moved to the orphanage, so we cannot + * repair the link count of the parent directory based on the back link + * count alone. Filesystems without ftype support are rare (old V4) so + * we just skip out here. + */ + if (!xfs_has_ftype(sc->mp)) + return -EOPNOTSUPP; + + /* + * Use the inobt to walk all allocated inodes to compare and fix the + * link counts. Retry iget every tenth of a second for up to 30 + * seconds -- even if repair misses a few inodes, we still try to fix + * as many of them as we can. + */ + xchk_iscan_start(sc, 30000, 100, &xnc->compare_iscan); + ASSERT(sc->ip == NULL); + + while ((error = xrep_nlinks_iter(xnc, &sc->ip)) == 1) { + /* + * Commit the scrub transaction so that we can create repair + * transactions with the correct reservations. + */ + xchk_trans_cancel(sc); + + error = xrep_nlinks_repair_inode(xnc); + xchk_iscan_mark_visited(&xnc->compare_iscan, sc->ip); + xchk_irele(sc, sc->ip); + sc->ip = NULL; + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + + /* + * Create a new empty transaction so that we can advance the + * iscan cursor without deadlocking if the inobt has a cycle. + * We can only push the inactivation workqueues with an empty + * transaction. + */ + error = xchk_trans_alloc_empty(sc); + if (error) + break; + } + xchk_iscan_iter_finish(&xnc->compare_iscan); + xchk_iscan_teardown(&xnc->compare_iscan); + + return error; +} diff --git a/fs/xfs/scrub/off_bitmap.h b/fs/xfs/scrub/off_bitmap.h new file mode 100644 index 000000000000..0d3f9e6c1aad --- /dev/null +++ b/fs/xfs/scrub/off_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_OFF_BITMAP_H__ +#define __XFS_SCRUB_OFF_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_fileoff_t */ + +struct xoff_bitmap { + struct xbitmap64 offbitmap; +}; + +static inline void xoff_bitmap_init(struct xoff_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->offbitmap); +} + +static inline void xoff_bitmap_destroy(struct xoff_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->offbitmap); +} + +static inline int xoff_bitmap_set(struct xoff_bitmap *bitmap, + xfs_fileoff_t off, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->offbitmap, off, len); +} + +static inline int xoff_bitmap_walk(struct xoff_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->offbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_OFF_BITMAP_H__ */ diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c new file mode 100644 index 000000000000..7148d8362db8 --- /dev/null +++ b/fs/xfs/scrub/orphanage.c @@ -0,0 +1,627 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_dir2.h" +#include "xfs_icache.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_parent.h" +#include "xfs_attr_sf.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/trace.h" +#include "scrub/orphanage.h" +#include "scrub/readdir.h" + +#include <linux/namei.h> + +/* + * The Orphanage + * ============= + * + * If the directory tree is damaged, children of that directory become + * inaccessible via that file path. If a child has no other parents, the file + * is said to be orphaned. xfs_repair fixes this situation by creating a + * orphanage directory (specifically, /lost+found) and creating a directory + * entry pointing to the orphaned file. + * + * Online repair follows this tactic by creating a root-owned /lost+found + * directory if one does not exist. If an orphan is found, it will move that + * files into orphanage. + */ + +/* Make the orphanage owned by root. */ +STATIC int +xrep_chown_orphanage( + struct xfs_scrub *sc, + struct xfs_inode *dp) +{ + struct xfs_trans *tp; + struct xfs_mount *mp = sc->mp; + struct xfs_dquot *udqp = NULL, *gdqp = NULL, *pdqp = NULL; + struct xfs_dquot *oldu = NULL, *oldg = NULL, *oldp = NULL; + struct inode *inode = VFS_I(dp); + int error; + + error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, + XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp); + if (error) + return error; + + error = xfs_trans_alloc_ichange(dp, udqp, gdqp, pdqp, true, &tp); + if (error) + goto out_dqrele; + + /* + * Always clear setuid/setgid/sticky on the orphanage since we don't + * normally want that functionality on this directory and xfs_repair + * doesn't create it this way either. Leave the other access bits + * unchanged. + */ + inode->i_mode &= ~(S_ISUID | S_ISGID | S_ISVTX); + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) { + if (XFS_IS_UQUOTA_ON(mp)) + oldu = xfs_qm_vop_chown(tp, dp, &dp->i_udquot, udqp); + inode->i_uid = GLOBAL_ROOT_UID; + } + if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) { + if (XFS_IS_GQUOTA_ON(mp)) + oldg = xfs_qm_vop_chown(tp, dp, &dp->i_gdquot, gdqp); + inode->i_gid = GLOBAL_ROOT_GID; + } + if (dp->i_projid != 0) { + if (XFS_IS_PQUOTA_ON(mp)) + oldp = xfs_qm_vop_chown(tp, dp, &dp->i_pdquot, pdqp); + dp->i_projid = 0; + } + + dp->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + XFS_STATS_INC(mp, xs_ig_attrchg); + + if (xfs_has_wsync(mp)) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp); + + xfs_qm_dqrele(oldu); + xfs_qm_dqrele(oldg); + xfs_qm_dqrele(oldp); + +out_dqrele: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + return error; +} + +#define ORPHANAGE "lost+found" + +/* Create the orphanage directory, and set sc->orphanage to it. */ +int +xrep_orphanage_create( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct dentry *root_dentry, *orphanage_dentry; + struct inode *root_inode = VFS_I(sc->mp->m_rootip); + struct inode *orphanage_inode; + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + if (xfs_is_readonly(mp)) { + sc->orphanage = NULL; + return 0; + } + + ASSERT(sc->tp == NULL); + ASSERT(sc->orphanage == NULL); + + /* Find the dentry for the root directory... */ + root_dentry = d_find_alias(root_inode); + if (!root_dentry) { + error = -EFSCORRUPTED; + goto out; + } + + /* ...which is a directory, right? */ + if (!d_is_dir(root_dentry)) { + error = -EFSCORRUPTED; + goto out_dput_root; + } + + /* Try to find the orphanage directory. */ + inode_lock_nested(root_inode, I_MUTEX_PARENT); + orphanage_dentry = lookup_one_len(ORPHANAGE, root_dentry, + strlen(ORPHANAGE)); + if (IS_ERR(orphanage_dentry)) { + error = PTR_ERR(orphanage_dentry); + goto out_unlock_root; + } + + /* + * Nothing found? Call mkdir to create the orphanage. Create the + * directory without other-user access because we're live and someone + * could have been relying partly on minimal access to a parent + * directory to control access to a file we put in here. + */ + if (d_really_is_negative(orphanage_dentry)) { + error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry, + 0750); + if (error) + goto out_dput_orphanage; + } + + /* Not a directory? Bail out. */ + if (!d_is_dir(orphanage_dentry)) { + error = -ENOTDIR; + goto out_dput_orphanage; + } + + /* + * Grab a reference to the orphanage. This /should/ succeed since + * we hold the root directory locked and therefore nobody can delete + * the orphanage. + */ + orphanage_inode = igrab(d_inode(orphanage_dentry)); + if (!orphanage_inode) { + error = -ENOENT; + goto out_dput_orphanage; + } + + /* Make sure the orphanage is owned by root. */ + error = xrep_chown_orphanage(sc, XFS_I(orphanage_inode)); + if (error) + goto out_dput_orphanage; + + /* Stash the reference for later and bail out. */ + sc->orphanage = XFS_I(orphanage_inode); + sc->orphanage_ilock_flags = 0; + +out_dput_orphanage: + dput(orphanage_dentry); +out_unlock_root: + inode_unlock(VFS_I(sc->mp->m_rootip)); +out_dput_root: + dput(root_dentry); +out: + return error; +} + +void +xrep_orphanage_ilock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + sc->orphanage_ilock_flags |= ilock_flags; + xfs_ilock(sc->orphanage, ilock_flags); +} + +bool +xrep_orphanage_ilock_nowait( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + if (xfs_ilock_nowait(sc->orphanage, ilock_flags)) { + sc->orphanage_ilock_flags |= ilock_flags; + return true; + } + + return false; +} + +void +xrep_orphanage_iunlock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + xfs_iunlock(sc->orphanage, ilock_flags); + sc->orphanage_ilock_flags &= ~ilock_flags; +} + +/* Grab the IOLOCK of the orphanage and sc->ip. */ +int +xrep_orphanage_iolock_two( + struct xfs_scrub *sc) +{ + int error = 0; + + while (true) { + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Normal XFS takes the IOLOCK before grabbing a transaction. + * Scrub holds a transaction, which means that we can't block + * on either IOLOCK. + */ + if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) { + if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + break; + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + } + delay(1); + } + + return 0; +} + +/* Release the orphanage. */ +void +xrep_orphanage_rele( + struct xfs_scrub *sc) +{ + if (!sc->orphanage) + return; + + if (sc->orphanage_ilock_flags) + xfs_iunlock(sc->orphanage, sc->orphanage_ilock_flags); + + xchk_irele(sc, sc->orphanage); + sc->orphanage = NULL; +} + +/* Adoption moves a file into /lost+found */ + +/* Can the orphanage adopt @sc->ip? */ +bool +xrep_orphanage_can_adopt( + struct xfs_scrub *sc) +{ + ASSERT(sc->ip != NULL); + + if (!sc->orphanage) + return false; + if (sc->ip == sc->orphanage) + return false; + if (xfs_internal_inum(sc->mp, sc->ip->i_ino)) + return false; + return true; +} + +/* + * Create a new transaction to send a child to the orphanage. + * + * Allocate a new transaction with sufficient disk space to handle the + * adoption, take ILOCK_EXCL of the orphanage and sc->ip, joins them to the + * transaction, and reserve quota to reparent the latter. Caller must hold the + * IOLOCK of the orphanage and sc->ip. + */ +int +xrep_adoption_trans_alloc( + struct xfs_scrub *sc, + struct xrep_adoption *adopt) +{ + struct xfs_mount *mp = sc->mp; + unsigned int child_blkres = 0; + int error; + + ASSERT(sc->tp == NULL); + ASSERT(sc->ip != NULL); + ASSERT(sc->orphanage != NULL); + ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL); + ASSERT(sc->orphanage_ilock_flags & XFS_IOLOCK_EXCL); + ASSERT(!(sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))); + ASSERT(!(sc->orphanage_ilock_flags & + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))); + + /* Compute the worst case space reservation that we need. */ + adopt->sc = sc; + adopt->orphanage_blkres = xfs_link_space_res(mp, MAXNAMELEN); + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + child_blkres = xfs_rename_space_res(mp, 0, false, + xfs_name_dotdot.len, false); + if (xfs_has_parent(mp)) + child_blkres += XFS_ADDAFORK_SPACE_RES(mp); + adopt->child_blkres = child_blkres; + + /* + * Allocate a transaction to link the child into the parent, along with + * enough disk space to handle expansion of both the orphanage and the + * dotdot entry of a child directory. + */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, + adopt->orphanage_blkres + adopt->child_blkres, 0, 0, + &sc->tp); + if (error) + return error; + + xfs_lock_two_inodes(sc->orphanage, XFS_ILOCK_EXCL, + sc->ip, XFS_ILOCK_EXCL); + sc->ilock_flags |= XFS_ILOCK_EXCL; + sc->orphanage_ilock_flags |= XFS_ILOCK_EXCL; + + xfs_trans_ijoin(sc->tp, sc->orphanage, 0); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* + * Reserve enough quota in the orphan directory to add the new name. + * Normally the orphanage should have user/group/project ids of zero + * and hence is not subject to quota enforcement, but we're allowed to + * exceed quota to reattach disconnected parts of the directory tree. + */ + error = xfs_trans_reserve_quota_nblks(sc->tp, sc->orphanage, + adopt->orphanage_blkres, 0, true); + if (error) + goto out_cancel; + + /* + * Reserve enough quota in the child directory to change dotdot. + * Here we're also allowed to exceed file quota to repair inconsistent + * metadata. + */ + if (adopt->child_blkres) { + error = xfs_trans_reserve_quota_nblks(sc->tp, sc->ip, + adopt->child_blkres, 0, true); + if (error) + goto out_cancel; + } + + return 0; +out_cancel: + xchk_trans_cancel(sc); + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + return error; +} + +/* + * Compute the xfs_name for the directory entry that we're adding to the + * orphanage. Caller must hold ILOCKs of sc->ip and the orphanage and must not + * reuse namebuf until the adoption completes or is dissolved. + */ +int +xrep_adoption_compute_name( + struct xrep_adoption *adopt, + struct xfs_name *xname) +{ + struct xfs_scrub *sc = adopt->sc; + char *namebuf = (void *)xname->name; + xfs_ino_t ino; + unsigned int incr = 0; + int error = 0; + + adopt->xname = xname; + xname->len = snprintf(namebuf, MAXNAMELEN, "%llu", sc->ip->i_ino); + xname->type = xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode); + + /* Make sure the filename is unique in the lost+found. */ + error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino); + while (error == 0 && incr < 10000) { + xname->len = snprintf(namebuf, MAXNAMELEN, "%llu.%u", + sc->ip->i_ino, ++incr); + error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino); + } + if (error == 0) { + /* We already have 10,000 entries in the orphanage? */ + return -EFSCORRUPTED; + } + + if (error != -ENOENT) + return error; + return 0; +} + +/* + * Make sure the dcache does not have a positive dentry for the name we've + * chosen. The caller should have checked with the ondisk directory, so any + * discrepancy is a sign that something is seriously wrong. + */ +static int +xrep_adoption_check_dcache( + struct xrep_adoption *adopt) +{ + struct qstr qname = QSTR_INIT(adopt->xname->name, + adopt->xname->len); + struct xfs_scrub *sc = adopt->sc; + struct dentry *d_orphanage, *d_child; + int error = 0; + + d_orphanage = d_find_alias(VFS_I(sc->orphanage)); + if (!d_orphanage) + return 0; + + d_child = d_hash_and_lookup(d_orphanage, &qname); + if (d_child) { + trace_xrep_adoption_check_child(sc->mp, d_child); + + if (d_is_positive(d_child)) { + ASSERT(d_is_negative(d_child)); + error = -EFSCORRUPTED; + } + + dput(d_child); + } + + dput(d_orphanage); + return error; +} + +/* + * Invalidate all dentries for the name that was added to the orphanage + * directory, and all dentries pointing to the child inode that was moved. + * + * There should not be any positive entries for the name, since we've + * maintained our lock on the orphanage directory. + */ +static void +xrep_adoption_zap_dcache( + struct xrep_adoption *adopt) +{ + struct qstr qname = QSTR_INIT(adopt->xname->name, + adopt->xname->len); + struct xfs_scrub *sc = adopt->sc; + struct dentry *d_orphanage, *d_child; + + /* Invalidate all dentries for the adoption name */ + d_orphanage = d_find_alias(VFS_I(sc->orphanage)); + if (!d_orphanage) + return; + + d_child = d_hash_and_lookup(d_orphanage, &qname); + while (d_child != NULL) { + trace_xrep_adoption_invalidate_child(sc->mp, d_child); + + ASSERT(d_is_negative(d_child)); + d_invalidate(d_child); + dput(d_child); + d_child = d_lookup(d_orphanage, &qname); + } + + dput(d_orphanage); + + /* Invalidate all the dentries pointing down to this file. */ + while ((d_child = d_find_alias(VFS_I(sc->ip))) != NULL) { + trace_xrep_adoption_invalidate_child(sc->mp, d_child); + + d_invalidate(d_child); + dput(d_child); + } +} + +/* + * If we have to add an attr fork ahead of a parent pointer update, how much + * space should we ask for? + */ +static inline int +xrep_adoption_attr_sizeof( + const struct xrep_adoption *adopt) +{ + return sizeof(struct xfs_attr_sf_hdr) + + xfs_attr_sf_entsize_byname(sizeof(struct xfs_parent_rec), + adopt->xname->len); +} + +/* + * Move the current file to the orphanage under the computed name. + * + * Returns with a dirty transaction so that the caller can handle any other + * work, such as fixing up unlinked lists or resetting link counts. + */ +int +xrep_adoption_move( + struct xrep_adoption *adopt) +{ + struct xfs_scrub *sc = adopt->sc; + bool isdir = S_ISDIR(VFS_I(sc->ip)->i_mode); + int error; + + trace_xrep_adoption_reparent(sc->orphanage, adopt->xname, + sc->ip->i_ino); + + error = xrep_adoption_check_dcache(adopt); + if (error) + return error; + + /* + * If this filesystem has parent pointers, ensure that the file being + * moved to the orphanage has an attribute fork. This is required + * because the parent pointer code does not itself add attr forks. + */ + if (!xfs_inode_has_attr_fork(sc->ip) && xfs_has_parent(sc->mp)) { + int sf_size = xrep_adoption_attr_sizeof(adopt); + + error = xfs_bmap_add_attrfork(sc->tp, sc->ip, sf_size, true); + if (error) + return error; + } + + /* Create the new name in the orphanage. */ + error = xfs_dir_createname(sc->tp, sc->orphanage, adopt->xname, + sc->ip->i_ino, adopt->orphanage_blkres); + if (error) + return error; + + /* + * Bump the link count of the orphanage if we just added a + * subdirectory, and update its timestamps. + */ + xfs_trans_ichgtime(sc->tp, sc->orphanage, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + if (isdir) + xfs_bumplink(sc->tp, sc->orphanage); + xfs_trans_log_inode(sc->tp, sc->orphanage, XFS_ILOG_CORE); + + /* Bump the link count of the child. */ + if (adopt->bump_child_nlink) { + xfs_bumplink(sc->tp, sc->ip); + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + } + + /* Replace the dotdot entry if the child is a subdirectory. */ + if (isdir) { + error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot, + sc->orphanage->i_ino, adopt->child_blkres); + if (error) + return error; + } + + /* Add a parent pointer from the file back to the lost+found. */ + if (xfs_has_parent(sc->mp)) { + error = xfs_parent_addname(sc->tp, &adopt->ppargs, + sc->orphanage, adopt->xname, sc->ip); + if (error) + return error; + } + + /* + * Notify dirent hooks that we moved the file to /lost+found, and + * finish all the deferred work so that we know the adoption is fully + * recorded in the log. + */ + xfs_dir_update_hook(sc->orphanage, sc->ip, 1, adopt->xname); + + /* Remove negative dentries from the lost+found's dcache */ + xrep_adoption_zap_dcache(adopt); + return 0; +} + +/* + * Roll to a clean scrub transaction so that we can release the orphanage, + * even if xrep_adoption_move was not called. + * + * Commits all the work and deferred ops attached to an adoption request and + * rolls to a clean scrub transaction. On success, returns 0 with the scrub + * context holding a clean transaction with no inodes joined. On failure, + * returns negative errno with no scrub transaction. All inode locks are + * still held after this function returns. + */ +int +xrep_adoption_trans_roll( + struct xrep_adoption *adopt) +{ + struct xfs_scrub *sc = adopt->sc; + int error; + + trace_xrep_adoption_trans_roll(sc->orphanage, sc->ip, + !!(sc->tp->t_flags & XFS_TRANS_DIRTY)); + + /* Finish all the deferred ops to commit all repairs. */ + error = xrep_defer_finish(sc); + if (error) + return error; + + /* Roll the transaction once more to detach the inodes. */ + return xfs_trans_roll(&sc->tp); +} diff --git a/fs/xfs/scrub/orphanage.h b/fs/xfs/scrub/orphanage.h new file mode 100644 index 000000000000..7c7a2e7d81db --- /dev/null +++ b/fs/xfs/scrub/orphanage.h @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_ORPHANAGE_H__ +#define __XFS_SCRUB_ORPHANAGE_H__ + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_orphanage_create(struct xfs_scrub *sc); + +/* + * If we're doing a repair, ensure that the orphanage exists and attach it to + * the scrub context. + */ +static inline int +xrep_orphanage_try_create( + struct xfs_scrub *sc) +{ + int error; + + ASSERT(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR); + + error = xrep_orphanage_create(sc); + switch (error) { + case 0: + case -ENOENT: + case -ENOTDIR: + case -ENOSPC: + /* + * If the orphanage can't be found or isn't a directory, we'll + * keep going, but we won't be able to attach the file to the + * orphanage if we can't find the parent. + */ + return 0; + } + + return error; +} + +int xrep_orphanage_iolock_two(struct xfs_scrub *sc); + +void xrep_orphanage_ilock(struct xfs_scrub *sc, unsigned int ilock_flags); +bool xrep_orphanage_ilock_nowait(struct xfs_scrub *sc, + unsigned int ilock_flags); +void xrep_orphanage_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); + +void xrep_orphanage_rele(struct xfs_scrub *sc); + +/* Information about a request to add a file to the orphanage. */ +struct xrep_adoption { + struct xfs_scrub *sc; + + /* Name used for the adoption. */ + struct xfs_name *xname; + + /* Parent pointer context tracking */ + struct xfs_parent_args ppargs; + + /* Block reservations for orphanage and child (if directory). */ + unsigned int orphanage_blkres; + unsigned int child_blkres; + + /* + * Does the caller want us to bump the child link count? This is not + * needed when reattaching files that have become disconnected but have + * nlink > 1. It is necessary when changing the directory tree + * structure. + */ + bool bump_child_nlink:1; +}; + +bool xrep_orphanage_can_adopt(struct xfs_scrub *sc); + +int xrep_adoption_trans_alloc(struct xfs_scrub *sc, + struct xrep_adoption *adopt); +int xrep_adoption_compute_name(struct xrep_adoption *adopt, + struct xfs_name *xname); +int xrep_adoption_move(struct xrep_adoption *adopt); +int xrep_adoption_trans_roll(struct xrep_adoption *adopt); +#else +struct xrep_adoption { /* empty */ }; +# define xrep_orphanage_rele(sc) ((void)0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_ORPHANAGE_H__ */ diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index e6155d86f791..91e7b51ce068 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -10,19 +10,37 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_inode.h" #include "xfs_icache.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" +#include "xfs_attr.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/repair.h" +#include "scrub/listxattr.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/trace.h" /* Set us up to scrub parents. */ int xchk_setup_parent( struct xfs_scrub *sc) { + int error; + + if (xchk_could_repair(sc)) { + error = xrep_setup_parent(sc); + if (error) + return error; + } + return xchk_setup_inode_contents(sc, 0); } @@ -143,7 +161,8 @@ xchk_parent_validate( } if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) return error; - if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) { + if (dp == sc->ip || xrep_is_tempfile(dp) || + !S_ISDIR(VFS_I(dp)->i_mode)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); goto out_rele; } @@ -156,6 +175,16 @@ xchk_parent_validate( goto out_rele; } + /* + * We cannot yet validate this parent pointer if the directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + xchk_set_incomplete(sc); + goto out_unlock; + } + /* Look for a directory entry in the parent pointing to the child. */ error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc); if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) @@ -175,6 +204,621 @@ out_rele: return error; } +/* + * Checking of Parent Pointers + * =========================== + * + * On filesystems with directory parent pointers, we check the referential + * integrity by visiting each parent pointer of a child file and checking that + * the directory referenced by the pointer actually has a dirent pointing + * forward to the child file. + */ + +/* Deferred parent pointer entry that we saved for later. */ +struct xchk_pptr { + /* Cookie for retrieval of the pptr name. */ + xfblob_cookie name_cookie; + + /* Parent pointer record. */ + struct xfs_parent_rec pptr_rec; + + /* Length of the pptr name. */ + uint8_t namelen; +}; + +struct xchk_pptrs { + struct xfs_scrub *sc; + + /* How many parent pointers did we find at the end? */ + unsigned long long pptrs_found; + + /* Parent of this directory. */ + xfs_ino_t parent_ino; + + /* Fixed-size array of xchk_pptr structures. */ + struct xfarray *pptr_entries; + + /* Blobs containing parent pointer names. */ + struct xfblob *pptr_names; + + /* Scratch buffer for scanning pptr xattrs */ + struct xfs_da_args pptr_args; + + /* If we've cycled the ILOCK, we must revalidate all deferred pptrs. */ + bool need_revalidate; + + /* Name buffer */ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; +}; + +/* Does this parent pointer match the dotdot entry? */ +STATIC int +xchk_parent_scan_dotdot( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xchk_pptrs *pp = priv; + xfs_ino_t parent_ino; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) + return error; + + if (pp->parent_ino == parent_ino) + return -ECANCELED; + + return 0; +} + +/* Look up the dotdot entry so that we can check it as we walk the pptrs. */ +STATIC int +xchk_parent_pptr_and_dotdot( + struct xchk_pptrs *pp) +{ + struct xfs_scrub *sc = pp->sc; + int error; + + /* Look up '..' */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &pp->parent_ino); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + if (!xfs_verify_dir_ino(sc->mp, pp->parent_ino)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + + /* Is this the root dir? Then '..' must point to itself. */ + if (sc->ip == sc->mp->m_rootip) { + if (sc->ip->i_ino != pp->parent_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + + /* + * If this is now an unlinked directory, the dotdot value is + * meaningless as long as it points to a valid inode. + */ + if (VFS_I(sc->ip)->i_nlink == 0) + return 0; + + if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Otherwise, walk the pptrs again, and check. */ + error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_dotdot, NULL, pp); + if (error == -ECANCELED) { + /* Found a parent pointer that matches dotdot. */ + return 0; + } + if (!error || error == -EFSCORRUPTED) { + /* Found a broken parent pointer or no match. */ + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + return error; +} + +/* + * Try to lock a parent directory for checking dirents. Returns the inode + * flags for the locks we now hold, or zero if we failed. + */ +STATIC unsigned int +xchk_parent_lock_dir( + struct xfs_scrub *sc, + struct xfs_inode *dp) +{ + if (!xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) + return 0; + + if (!xfs_ilock_nowait(dp, XFS_ILOCK_SHARED)) { + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return 0; + } + + if (!xfs_need_iread_extents(&dp->i_df)) + return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED; + + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + if (!xfs_ilock_nowait(dp, XFS_ILOCK_EXCL)) { + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return 0; + } + + return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL; +} + +/* Check the forward link (dirent) associated with this parent pointer. */ +STATIC int +xchk_parent_dirent( + struct xchk_pptrs *pp, + const struct xfs_name *xname, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = pp->sc; + xfs_ino_t child_ino; + int error; + + /* + * Use the name attached to this parent pointer to look up the + * directory entry in the alleged parent. + */ + error = xchk_dir_lookup(sc, dp, xname, &child_ino); + if (error == -ENOENT) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error)) + return error; + + /* Does the inode number match? */ + if (child_ino != sc->ip->i_ino) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + + return 0; +} + +/* Try to grab a parent directory. */ +STATIC int +xchk_parent_iget( + struct xchk_pptrs *pp, + const struct xfs_parent_rec *pptr, + struct xfs_inode **dpp) +{ + struct xfs_scrub *sc = pp->sc; + struct xfs_inode *ip; + xfs_ino_t parent_ino = be64_to_cpu(pptr->p_ino); + int error; + + /* Validate inode number. */ + error = xfs_dir_ino_validate(sc->mp, parent_ino); + if (error) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return -ECANCELED; + } + + error = xchk_iget(sc, parent_ino, &ip); + if (error == -EINVAL || error == -ENOENT) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return -ECANCELED; + } + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error)) + return error; + + /* The parent must be a directory. */ + if (!S_ISDIR(VFS_I(ip)->i_mode)) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + goto out_rele; + } + + /* Validate generation number. */ + if (VFS_I(ip)->i_generation != be32_to_cpu(pptr->p_gen)) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + goto out_rele; + } + + *dpp = ip; + return 0; +out_rele: + xchk_irele(sc, ip); + return 0; +} + +/* + * Walk an xattr of a file. If this xattr is a parent pointer, follow it up + * to a parent directory and check that the parent has a dirent pointing back + * to us. + */ +STATIC int +xchk_parent_scan_attr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + struct xchk_pptrs *pp = priv; + struct xfs_inode *dp = NULL; + const struct xfs_parent_rec *pptr_rec = value; + xfs_ino_t parent_ino; + unsigned int lockmode; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return error; + } + + /* No self-referential parent pointers. */ + if (parent_ino == sc->ip->i_ino) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return -ECANCELED; + } + + pp->pptrs_found++; + + error = xchk_parent_iget(pp, pptr_rec, &dp); + if (error) + return error; + if (!dp) + return 0; + + /* Try to lock the inode. */ + lockmode = xchk_parent_lock_dir(sc, dp); + if (!lockmode) { + struct xchk_pptr save_pp = { + .pptr_rec = *pptr_rec, /* struct copy */ + .namelen = namelen, + }; + + /* Couldn't lock the inode, so save the pptr for later. */ + trace_xchk_parent_defer(sc->ip, &xname, dp->i_ino); + + error = xfblob_storename(pp->pptr_names, &save_pp.name_cookie, + &xname); + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, + &error)) + goto out_rele; + + error = xfarray_append(pp->pptr_entries, &save_pp); + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, + &error)) + goto out_rele; + + goto out_rele; + } + + error = xchk_parent_dirent(pp, &xname, dp); + if (error) + goto out_unlock; + +out_unlock: + xfs_iunlock(dp, lockmode); +out_rele: + xchk_irele(sc, dp); + return error; +} + +/* + * Revalidate a parent pointer that we collected in the past but couldn't check + * because of lock contention. Returns 0 if the parent pointer is still valid, + * -ENOENT if it has gone away on us, or a negative errno. + */ +STATIC int +xchk_parent_revalidate_pptr( + struct xchk_pptrs *pp, + const struct xfs_name *xname, + struct xfs_parent_rec *pptr) +{ + struct xfs_scrub *sc = pp->sc; + int error; + + error = xfs_parent_lookup(sc->tp, sc->ip, xname, pptr, &pp->pptr_args); + if (error == -ENOATTR) { + /* Parent pointer went away, nothing to revalidate. */ + return -ENOENT; + } + + return error; +} + +/* + * Check a parent pointer the slow way, which means we cycle locks a bunch + * and put up with revalidation until we get it done. + */ +STATIC int +xchk_parent_slow_pptr( + struct xchk_pptrs *pp, + const struct xfs_name *xname, + struct xfs_parent_rec *pptr) +{ + struct xfs_scrub *sc = pp->sc; + struct xfs_inode *dp = NULL; + unsigned int lockmode; + int error; + + /* Check that the deferred parent pointer still exists. */ + if (pp->need_revalidate) { + error = xchk_parent_revalidate_pptr(pp, xname, pptr); + if (error == -ENOENT) + return 0; + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, + &error)) + return error; + } + + error = xchk_parent_iget(pp, pptr, &dp); + if (error) + return error; + if (!dp) + return 0; + + /* + * If we can grab both IOLOCK and ILOCK of the alleged parent, we + * can proceed with the validation. + */ + lockmode = xchk_parent_lock_dir(sc, dp); + if (lockmode) { + trace_xchk_parent_slowpath(sc->ip, xname, dp->i_ino); + goto check_dirent; + } + + /* + * We couldn't lock the parent dir. Drop all the locks and try to + * get them again, one at a time. + */ + xchk_iunlock(sc, sc->ilock_flags); + pp->need_revalidate = true; + + trace_xchk_parent_ultraslowpath(sc->ip, xname, dp->i_ino); + + error = xchk_dir_trylock_for_pptrs(sc, dp, &lockmode); + if (error) + goto out_rele; + + /* Revalidate the parent pointer now that we cycled locks. */ + error = xchk_parent_revalidate_pptr(pp, xname, pptr); + if (error == -ENOENT) { + error = 0; + goto out_unlock; + } + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error)) + goto out_unlock; + +check_dirent: + error = xchk_parent_dirent(pp, xname, dp); +out_unlock: + xfs_iunlock(dp, lockmode); +out_rele: + xchk_irele(sc, dp); + return error; +} + +/* Check all the parent pointers that we deferred the first time around. */ +STATIC int +xchk_parent_finish_slow_pptrs( + struct xchk_pptrs *pp) +{ + xfarray_idx_t array_cur; + int error; + + foreach_xfarray_idx(pp->pptr_entries, array_cur) { + struct xchk_pptr pptr; + + if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + error = xfarray_load(pp->pptr_entries, array_cur, &pptr); + if (error) + return error; + + error = xfblob_loadname(pp->pptr_names, pptr.name_cookie, + &pp->xname, pptr.namelen); + if (error) + return error; + + error = xchk_parent_slow_pptr(pp, &pp->xname, &pptr.pptr_rec); + if (error) + return error; + } + + /* Empty out both xfiles now that we've checked everything. */ + xfarray_truncate(pp->pptr_entries); + xfblob_truncate(pp->pptr_names); + return 0; +} + +/* Count the number of parent pointers. */ +STATIC int +xchk_parent_count_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xchk_pptrs *pp = priv; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, NULL, NULL); + if (error) + return error; + + pp->pptrs_found++; + return 0; +} + +/* + * Compare the number of parent pointers to the link count. For + * non-directories these should be the same. For unlinked directories the + * count should be zero; for linked directories, it should be nonzero. + */ +STATIC int +xchk_parent_count_pptrs( + struct xchk_pptrs *pp) +{ + struct xfs_scrub *sc = pp->sc; + int error; + + /* + * If we cycled the ILOCK while cross-checking parent pointers with + * dirents, then we need to recalculate the number of parent pointers. + */ + if (pp->need_revalidate) { + pp->pptrs_found = 0; + error = xchk_xattr_walk(sc, sc->ip, xchk_parent_count_pptr, + NULL, pp); + if (error == -EFSCORRUPTED) { + /* Found a bad parent pointer */ + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + if (error) + return error; + } + + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) { + if (sc->ip == sc->mp->m_rootip) + pp->pptrs_found++; + + if (VFS_I(sc->ip)->i_nlink == 0 && pp->pptrs_found > 0) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + else if (VFS_I(sc->ip)->i_nlink > 0 && + pp->pptrs_found == 0) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } else { + if (VFS_I(sc->ip)->i_nlink != pp->pptrs_found) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } + + return 0; +} + +/* Check parent pointers of a file. */ +STATIC int +xchk_parent_pptr( + struct xfs_scrub *sc) +{ + struct xchk_pptrs *pp; + char *descr; + int error; + + pp = kvzalloc(sizeof(struct xchk_pptrs), XCHK_GFP_FLAGS); + if (!pp) + return -ENOMEM; + pp->sc = sc; + pp->xname.name = pp->namebuf; + + /* + * Set up some staging memory for parent pointers that we can't check + * due to locking contention. + */ + descr = xchk_xfile_ino_descr(sc, "slow parent pointer entries"); + error = xfarray_create(descr, 0, sizeof(struct xchk_pptr), + &pp->pptr_entries); + kfree(descr); + if (error) + goto out_pp; + + descr = xchk_xfile_ino_descr(sc, "slow parent pointer names"); + error = xfblob_create(descr, &pp->pptr_names); + kfree(descr); + if (error) + goto out_entries; + + error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_attr, NULL, pp); + if (error == -ECANCELED) { + error = 0; + goto out_names; + } + if (error) + goto out_names; + + error = xchk_parent_finish_slow_pptrs(pp); + if (error == -ETIMEDOUT) { + /* Couldn't grab a lock, scrub was marked incomplete */ + error = 0; + goto out_names; + } + if (error) + goto out_names; + + if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_names; + + /* + * For subdirectories, make sure the dotdot entry references the same + * inode as the parent pointers. + * + * If we're scanning a /consistent/ directory, there should only be + * one parent pointer, and it should point to the same directory as + * the dotdot entry. + * + * However, a corrupt directory tree might feature a subdirectory with + * multiple parents. The directory loop scanner is responsible for + * correcting that kind of problem, so for now we only validate that + * the dotdot entry matches /one/ of the parents. + */ + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) { + error = xchk_parent_pptr_and_dotdot(pp); + if (error) + goto out_names; + } + + if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_names; + + /* + * Complain if the number of parent pointers doesn't match the link + * count. This could be a sign of missing parent pointers (or an + * incorrect link count). + */ + error = xchk_parent_count_pptrs(pp); + if (error) + goto out_names; + +out_names: + xfblob_destroy(pp->pptr_names); +out_entries: + xfarray_destroy(pp->pptr_entries); +out_pp: + kvfree(pp); + return error; +} + /* Scrub a parent pointer. */ int xchk_parent( @@ -184,6 +828,9 @@ xchk_parent( xfs_ino_t parent_ino; int error = 0; + if (xfs_has_parent(mp)) + return xchk_parent_pptr(sc); + /* * If we're a directory, check that the '..' link points up to * a directory that has one entry pointing to us. @@ -217,6 +864,74 @@ xchk_parent( */ error = xchk_parent_validate(sc, parent_ino); } while (error == -EAGAIN); + if (error == -EBUSY) { + /* + * We could not scan a directory, so we marked the check + * incomplete. No further error return is necessary. + */ + return 0; + } return error; } + +/* + * Decide if this file's extended attributes (and therefore its parent + * pointers) have been zapped to satisfy the inode and ifork verifiers. + * Checking and repairing should be postponed until the extended attribute + * structure is fixed. + */ +bool +xchk_pptr_looks_zapped( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + + ASSERT(xfs_has_parent(mp)); + + /* + * Temporary files that cannot be linked into the directory tree do not + * have attr forks because they cannot ever have parents. + */ + if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + return false; + + /* + * Directory tree roots do not have parents, so the expected outcome + * of a parent pointer scan is always the empty set. It's safe to scan + * them even if the attr fork was zapped. + */ + if (ip == mp->m_rootip) + return false; + + /* + * Metadata inodes are all rooted in the superblock and do not have + * any parents. Hence the attr fork will not be initialized, but + * there are no parent pointers that might have been zapped. + */ + if (xfs_is_metadata_inode(ip)) + return false; + + /* + * Linked and linkable non-rootdir files should always have an + * attribute fork because that is where parent pointers are + * stored. If the fork is absent, something is amiss. + */ + if (!xfs_inode_has_attr_fork(ip)) + return true; + + /* Repair zapped this file's attr fork a short time ago */ + if (xfs_ifork_zapped(ip, XFS_ATTR_FORK)) + return true; + + /* + * If the dinode repair found a bad attr fork, it will reset the fork + * to extents format with zero records and wait for the bmapbta + * scrubber to reconstruct the block mappings. The extended attribute + * structure always contain some content when parent pointers are + * enabled, so this is a clear sign of a zapped attr fork. + */ + return ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_af.if_nextents == 0; +} diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c new file mode 100644 index 000000000000..7b42b7f65a0b --- /dev/null +++ b/fs/xfs/scrub/parent_repair.c @@ -0,0 +1,1612 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_priv.h" +#include "xfs_trans_space.h" +#include "xfs_health.h" +#include "xfs_exchmaps.h" +#include "xfs_parent.h" +#include "xfs_attr.h" +#include "xfs_bmap.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/iscan.h" +#include "scrub/findparent.h" +#include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/orphanage.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/attr_repair.h" +#include "scrub/listxattr.h" + +/* + * Repairing The Directory Parent Pointer + * ====================================== + * + * Currently, only directories support parent pointers (in the form of '..' + * entries), so we simply scan the filesystem and update the '..' entry. + * + * Note that because the only parent pointer is the dotdot entry, we won't + * touch an unhealthy directory, since the directory repair code is perfectly + * capable of rebuilding a directory with the proper parent inode. + * + * See the section on locking issues in dir_repair.c for more information about + * conflicts with the VFS. The findparent code wll keep our incore parent + * inode up to date. + * + * If parent pointers are enabled, we instead reconstruct the parent pointer + * information by visiting every directory entry of every directory in the + * system and translating the relevant dirents into parent pointers. In this + * case, it is advantageous to stash all parent pointers created from dirents + * from a single parent file before replaying them into the temporary file. To + * save memory, the live filesystem scan reuses the findparent object. Parent + * pointer repair chooses either directory scanning or findparent, but not + * both. + * + * When salvaging completes, the remaining stashed entries are replayed to the + * temporary file. All non-parent pointer extended attributes are copied to + * the temporary file's extended attributes. An atomic file mapping exchange + * is used to commit the new xattr blocks to the file being repaired. This + * will disrupt attrmulti cursors. + */ + +/* Create a parent pointer in the tempfile. */ +#define XREP_PPTR_ADD (1) + +/* Remove a parent pointer from the tempfile. */ +#define XREP_PPTR_REMOVE (2) + +/* A stashed parent pointer update. */ +struct xrep_pptr { + /* Cookie for retrieval of the pptr name. */ + xfblob_cookie name_cookie; + + /* Parent pointer record. */ + struct xfs_parent_rec pptr_rec; + + /* Length of the pptr name. */ + uint8_t namelen; + + /* XREP_PPTR_{ADD,REMOVE} */ + uint8_t action; +}; + +/* + * Stash up to 8 pages of recovered parent pointers in pptr_recs and + * pptr_names before we write them to the temp file. + */ +#define XREP_PARENT_MAX_STASH_BYTES (PAGE_SIZE * 8) + +struct xrep_parent { + struct xfs_scrub *sc; + + /* Fixed-size array of xrep_pptr structures. */ + struct xfarray *pptr_recs; + + /* Blobs containing parent pointer names. */ + struct xfblob *pptr_names; + + /* xattr keys */ + struct xfarray *xattr_records; + + /* xattr values */ + struct xfblob *xattr_blobs; + + /* Scratch buffers for saving extended attributes */ + unsigned char *xattr_name; + void *xattr_value; + unsigned int xattr_value_sz; + + /* + * Information used to exchange the attr fork mappings, if the fs + * supports parent pointers. + */ + struct xrep_tempexch tx; + + /* + * Information used to scan the filesystem to find the inumber of the + * dotdot entry for this directory. On filesystems without parent + * pointers, we use the findparent_* functions on this object and + * access only the parent_ino field directly. + * + * When parent pointers are enabled, the directory entry scanner uses + * the iscan, hooks, and lock fields of this object directly. + * @pscan.lock coordinates access to pptr_recs, pptr_names, pptr, and + * pptr_scratch. This reduces the memory requirements of this + * structure. + * + * The lock also controls access to xattr_records and xattr_blobs(?) + */ + struct xrep_parent_scan_info pscan; + + /* Orphanage reparenting request. */ + struct xrep_adoption adoption; + + /* Directory entry name, plus the trailing null. */ + struct xfs_name xname; + unsigned char namebuf[MAXNAMELEN]; + + /* Scratch buffer for scanning pptr xattrs */ + struct xfs_da_args pptr_args; + + /* Have we seen any live updates of parent pointers recently? */ + bool saw_pptr_updates; + + /* Number of parents we found after all other repairs */ + unsigned long long parents; +}; + +struct xrep_parent_xattr { + /* Cookie for retrieval of the xattr name. */ + xfblob_cookie name_cookie; + + /* Cookie for retrieval of the xattr value. */ + xfblob_cookie value_cookie; + + /* XFS_ATTR_* flags */ + int flags; + + /* Length of the value and name. */ + uint32_t valuelen; + uint16_t namelen; +}; + +/* + * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write + * them to the temp file. + */ +#define XREP_PARENT_XATTR_MAX_STASH_BYTES (PAGE_SIZE * 8) + +/* Tear down all the incore stuff we created. */ +static void +xrep_parent_teardown( + struct xrep_parent *rp) +{ + xrep_findparent_scan_teardown(&rp->pscan); + kvfree(rp->xattr_name); + rp->xattr_name = NULL; + kvfree(rp->xattr_value); + rp->xattr_value = NULL; + if (rp->xattr_blobs) + xfblob_destroy(rp->xattr_blobs); + rp->xattr_blobs = NULL; + if (rp->xattr_records) + xfarray_destroy(rp->xattr_records); + rp->xattr_records = NULL; + if (rp->pptr_names) + xfblob_destroy(rp->pptr_names); + rp->pptr_names = NULL; + if (rp->pptr_recs) + xfarray_destroy(rp->pptr_recs); + rp->pptr_recs = NULL; +} + +/* Set up for a parent repair. */ +int +xrep_setup_parent( + struct xfs_scrub *sc) +{ + struct xrep_parent *rp; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + rp = kvzalloc(sizeof(struct xrep_parent), XCHK_GFP_FLAGS); + if (!rp) + return -ENOMEM; + rp->sc = sc; + rp->xname.name = rp->namebuf; + sc->buf = rp; + + error = xrep_tempfile_create(sc, S_IFREG); + if (error) + return error; + + return xrep_orphanage_try_create(sc); +} + +/* + * Scan all files in the filesystem for a child dirent that we can turn into + * the dotdot entry for this directory. + */ +STATIC int +xrep_parent_find_dotdot( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t ino; + unsigned int sick, checked; + int error; + + /* + * Avoid sick directories. There shouldn't be anyone else clearing the + * directory's sick status. + */ + xfs_inode_measure_sickness(sc->ip, &sick, &checked); + if (sick & XFS_SICK_INO_DIR) + return -EFSCORRUPTED; + + ino = xrep_findparent_self_reference(sc); + if (ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rp->pscan, ino); + return 0; + } + + /* + * Drop the ILOCK on this directory so that we can scan for the dotdot + * entry. Figure out who is going to be the parent of this directory, + * then retake the ILOCK so that we can salvage directory entries. + */ + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* Does the VFS dcache have an answer for us? */ + ino = xrep_findparent_from_dcache(sc); + if (ino != NULLFSINO) { + error = xrep_findparent_confirm(sc, &ino); + if (!error && ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rp->pscan, ino); + goto out_relock; + } + } + + /* Scan the entire filesystem for a parent. */ + error = xrep_findparent_scan(&rp->pscan); +out_relock: + xchk_ilock(sc, XFS_ILOCK_EXCL); + + return error; +} + +/* + * Add this stashed incore parent pointer to the temporary file. + * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and + * must not be in transaction context. + */ +STATIC int +xrep_parent_replay_update( + struct xrep_parent *rp, + const struct xfs_name *xname, + struct xrep_pptr *pptr) +{ + struct xfs_scrub *sc = rp->sc; + + switch (pptr->action) { + case XREP_PPTR_ADD: + /* Create parent pointer. */ + trace_xrep_parent_replay_parentadd(sc->tempip, xname, + &pptr->pptr_rec); + + return xfs_parent_set(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rp->pptr_args); + case XREP_PPTR_REMOVE: + /* Remove parent pointer. */ + trace_xrep_parent_replay_parentremove(sc->tempip, xname, + &pptr->pptr_rec); + + return xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rp->pptr_args); + } + + ASSERT(0); + return -EIO; +} + +/* + * Flush stashed parent pointer updates that have been recorded by the scanner. + * This is done to reduce the memory requirements of the parent pointer + * rebuild, since files can have a lot of hardlinks and the fs can be busy. + * + * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile + * IOLOCK. + */ +STATIC int +xrep_parent_replay_updates( + struct xrep_parent *rp) +{ + xfarray_idx_t array_cur; + int error; + + mutex_lock(&rp->pscan.lock); + foreach_xfarray_idx(rp->pptr_recs, array_cur) { + struct xrep_pptr pptr; + + error = xfarray_load(rp->pptr_recs, array_cur, &pptr); + if (error) + goto out_unlock; + + error = xfblob_loadname(rp->pptr_names, pptr.name_cookie, + &rp->xname, pptr.namelen); + if (error) + goto out_unlock; + rp->xname.len = pptr.namelen; + mutex_unlock(&rp->pscan.lock); + + error = xrep_parent_replay_update(rp, &rp->xname, &pptr); + if (error) + return error; + + mutex_lock(&rp->pscan.lock); + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rp->pptr_recs); + xfblob_truncate(rp->pptr_names); + mutex_unlock(&rp->pscan.lock); + return 0; +out_unlock: + mutex_unlock(&rp->pscan.lock); + return error; +} + +/* + * Remember that we want to create a parent pointer in the tempfile. These + * stashed actions will be replayed later. + */ +STATIC int +xrep_parent_stash_parentadd( + struct xrep_parent *rp, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_pptr pptr = { + .action = XREP_PPTR_ADD, + .namelen = name->len, + }; + int error; + + trace_xrep_parent_stash_parentadd(rp->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rp->pptr_recs, &pptr); +} + +/* + * Remember that we want to remove a parent pointer from the tempfile. These + * stashed actions will be replayed later. + */ +STATIC int +xrep_parent_stash_parentremove( + struct xrep_parent *rp, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_pptr pptr = { + .action = XREP_PPTR_REMOVE, + .namelen = name->len, + }; + int error; + + trace_xrep_parent_stash_parentremove(rp->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rp->pptr_recs, &pptr); +} + +/* + * Examine an entry of a directory. If this dirent leads us back to the file + * whose parent pointers we're rebuilding, add a pptr to the temporary + * directory. + */ +STATIC int +xrep_parent_scan_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_parent *rp = priv; + int error; + + /* Dirent doesn't point to this directory. */ + if (ino != rp->sc->ip->i_ino) + return 0; + + /* No weird looking names. */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) + return -EFSCORRUPTED; + + /* No mismatching ftypes. */ + if (name->type != xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode)) + return -EFSCORRUPTED; + + /* Don't pick up dot or dotdot entries; we only want child dirents. */ + if (xfs_dir2_samename(name, &xfs_name_dotdot) || + xfs_dir2_samename(name, &xfs_name_dot)) + return 0; + + /* + * Transform this dirent into a parent pointer and queue it for later + * addition to the temporary file. + */ + mutex_lock(&rp->pscan.lock); + error = xrep_parent_stash_parentadd(rp, name, dp); + mutex_unlock(&rp->pscan.lock); + return error; +} + +/* + * Decide if we want to look for dirents in this directory. Skip the file + * being repaired and any files being used to stage repairs. + */ +static inline bool +xrep_parent_want_scan( + struct xrep_parent *rp, + const struct xfs_inode *ip) +{ + return ip != rp->sc->ip && !xrep_is_tempfile(ip); +} + +/* + * Take ILOCK on a file that we want to scan. + * + * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt. + * Otherwise, take ILOCK_SHARED. + */ +static inline unsigned int +xrep_parent_scan_ilock( + struct xrep_parent *rp, + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + /* Still need to take the shared ILOCK to advance the iscan cursor. */ + if (!xrep_parent_want_scan(rp, ip)) + goto lock; + + if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) { + lock_mode = XFS_ILOCK_EXCL; + goto lock; + } + +lock: + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +/* + * Scan this file for relevant child dirents that point to the file whose + * parent pointers we're rebuilding. + */ +STATIC int +xrep_parent_scan_file( + struct xrep_parent *rp, + struct xfs_inode *ip) +{ + unsigned int lock_mode; + int error = 0; + + lock_mode = xrep_parent_scan_ilock(rp, ip); + + if (!xrep_parent_want_scan(rp, ip)) + goto scan_done; + + if (S_ISDIR(VFS_I(ip)->i_mode)) { + /* + * If the directory looks as though it has been zapped by the + * inode record repair code, we cannot scan for child dirents. + */ + if (xchk_dir_looks_zapped(ip)) { + error = -EBUSY; + goto scan_done; + } + + error = xchk_dir_walk(rp->sc, ip, xrep_parent_scan_dirent, rp); + if (error) + goto scan_done; + } + +scan_done: + xchk_iscan_mark_visited(&rp->pscan.iscan, ip); + xfs_iunlock(ip, lock_mode); + return error; +} + +/* Decide if we've stashed too much pptr data in memory. */ +static inline bool +xrep_parent_want_flush_stashed( + struct xrep_parent *rp) +{ + unsigned long long bytes; + + bytes = xfarray_bytes(rp->pptr_recs) + xfblob_bytes(rp->pptr_names); + return bytes > XREP_PARENT_MAX_STASH_BYTES; +} + +/* + * Scan all directories in the filesystem to look for dirents that we can turn + * into parent pointers. + */ +STATIC int +xrep_parent_scan_dirtree( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + struct xfs_inode *ip; + int error; + + /* + * Filesystem scans are time consuming. Drop the file ILOCK and all + * other resources for the duration of the scan and hope for the best. + * The live update hooks will keep our scan information up to date. + */ + xchk_trans_cancel(sc); + if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) + xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | + XFS_ILOCK_EXCL)); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&rp->pscan.iscan, &ip)) == 1) { + bool flush; + + error = xrep_parent_scan_file(rp, ip); + xchk_irele(sc, ip); + if (error) + break; + + /* Flush stashed pptr updates to constrain memory usage. */ + mutex_lock(&rp->pscan.lock); + flush = xrep_parent_want_flush_stashed(rp); + mutex_unlock(&rp->pscan.lock); + if (flush) { + xchk_trans_cancel(sc); + + error = xrep_tempfile_iolock_polled(sc); + if (error) + break; + + error = xrep_parent_replay_updates(rp); + xrep_tempfile_iounlock(sc); + if (error) + break; + + error = xchk_trans_alloc_empty(sc); + if (error) + break; + } + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&rp->pscan.iscan); + if (error) { + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Retake sc->ip's ILOCK now that we're done flushing stashed parent + * pointers. We end this function with an empty transaction and the + * ILOCK. + */ + xchk_ilock(rp->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Capture dirent updates being made by other threads which are relevant to the + * file being repaired. + */ +STATIC int +xrep_parent_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xrep_parent *rp; + struct xfs_scrub *sc; + int error; + + rp = container_of(nb, struct xrep_parent, pscan.dhook.dirent_hook.nb); + sc = rp->sc; + + /* + * This thread updated a dirent that points to the file that we're + * repairing, so stash the update for replay against the temporary + * file. + */ + if (p->ip->i_ino == sc->ip->i_ino && + xchk_iscan_want_live_update(&rp->pscan.iscan, p->dp->i_ino)) { + mutex_lock(&rp->pscan.lock); + if (p->delta > 0) + error = xrep_parent_stash_parentadd(rp, p->name, p->dp); + else + error = xrep_parent_stash_parentremove(rp, p->name, + p->dp); + if (!error) + rp->saw_pptr_updates = true; + mutex_unlock(&rp->pscan.lock); + if (error) + goto out_abort; + } + + return NOTIFY_DONE; +out_abort: + xchk_iscan_abort(&rp->pscan.iscan); + return NOTIFY_DONE; +} + +/* Reset a directory's dotdot entry, if needed. */ +STATIC int +xrep_parent_reset_dotdot( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t ino; + unsigned int spaceres; + int error = 0; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &ino); + if (error || ino == rp->pscan.parent_ino) + return error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + trace_xrep_parent_reset_dotdot(sc->ip, rp->pscan.parent_ino); + + /* + * Reserve more space just in case we have to expand the dir. We're + * allowed to exceed quota to repair inconsistent metadata. + */ + spaceres = xfs_rename_space_res(sc->mp, 0, false, xfs_name_dotdot.len, + false); + error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, spaceres, 0, + true); + if (error) + return error; + + error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot, + rp->pscan.parent_ino, spaceres); + if (error) + return error; + + /* + * Roll transaction to detach the inode from the transaction but retain + * ILOCK_EXCL. + */ + return xfs_trans_roll(&sc->tp); +} + +/* Pass back the parent inumber if this a parent pointer */ +STATIC int +xrep_parent_lookup_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + xfs_ino_t *inop = priv; + xfs_ino_t parent_ino; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) + return error; + + *inop = parent_ino; + return -ECANCELED; +} + +/* + * Find the first parent of the scrub target by walking parent pointers for + * the purpose of deciding if we're going to move it to the orphanage. + * We don't care if the attr fork is zapped. + */ +STATIC int +xrep_parent_lookup_pptrs( + struct xfs_scrub *sc, + xfs_ino_t *inop) +{ + int error; + + *inop = NULLFSINO; + + error = xchk_xattr_walk(sc, sc->ip, xrep_parent_lookup_pptr, NULL, + inop); + if (error && error != -ECANCELED) + return error; + return 0; +} + +/* + * Move the current file to the orphanage. + * + * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon + * successful return, the scrub transaction will have enough extra reservation + * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the + * orphanage; and both inodes will be ijoined. + */ +STATIC int +xrep_parent_move_to_orphanage( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t orig_parent, new_parent; + int error; + + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) { + /* + * We are about to drop the ILOCK on sc->ip to lock the + * orphanage and prepare for the adoption. Therefore, look up + * the old dotdot entry for sc->ip so that we can compare it + * after we re-lock sc->ip. + */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, + &orig_parent); + if (error) + return error; + } else { + /* + * We haven't dropped the ILOCK since we committed the new + * xattr structure (and hence the new parent pointer records), + * which means that the file cannot have been moved in the + * directory tree, and there are no parents. + */ + orig_parent = NULLFSINO; + } + + /* + * Drop the ILOCK on the scrub target and commit the transaction. + * Adoption computes its own resource requirements and gathers the + * necessary components. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* If we can take the orphanage's iolock then we're ready to move. */ + if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) { + xchk_iunlock(sc, sc->ilock_flags); + error = xrep_orphanage_iolock_two(sc); + if (error) + return error; + } + + /* Grab transaction and ILOCK the two files. */ + error = xrep_adoption_trans_alloc(sc, &rp->adoption); + if (error) + return error; + + error = xrep_adoption_compute_name(&rp->adoption, &rp->xname); + if (error) + return error; + + /* + * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot + * entry again. If the parent changed or the child was unlinked while + * the child directory was unlocked, we don't need to move the child to + * the orphanage after all. For a non-directory, we have to scan for + * the first parent pointer to see if one has been added. + */ + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, + &new_parent); + else + error = xrep_parent_lookup_pptrs(sc, &new_parent); + if (error) + return error; + + /* + * Attach to the orphanage if we still have a linked directory and it + * hasn't been moved. + */ + if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) { + error = xrep_adoption_move(&rp->adoption); + if (error) + return error; + } + + /* + * Launder the scrub transaction so we can drop the orphanage ILOCK + * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK. + */ + error = xrep_adoption_trans_roll(&rp->adoption); + if (error) + return error; + + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + return 0; +} + +/* Ensure that the xattr value buffer is large enough. */ +STATIC int +xrep_parent_alloc_xattr_value( + struct xrep_parent *rp, + size_t bufsize) +{ + void *new_val; + + if (rp->xattr_value_sz >= bufsize) + return 0; + + if (rp->xattr_value) { + kvfree(rp->xattr_value); + rp->xattr_value = NULL; + rp->xattr_value_sz = 0; + } + + new_val = kvmalloc(bufsize, XCHK_GFP_FLAGS); + if (!new_val) + return -ENOMEM; + + rp->xattr_value = new_val; + rp->xattr_value_sz = bufsize; + return 0; +} + +/* Retrieve the (remote) value of a non-pptr xattr. */ +STATIC int +xrep_parent_fetch_xattr_remote( + struct xrep_parent *rp, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + unsigned int valuelen) +{ + struct xfs_scrub *sc = rp->sc; + struct xfs_da_args args = { + .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK, + .geo = sc->mp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .dp = ip, + .name = name, + .namelen = namelen, + .trans = sc->tp, + .valuelen = valuelen, + .owner = ip->i_ino, + }; + int error; + + /* + * If we need a larger value buffer, try to allocate one. If that + * fails, return with -EDEADLOCK to try harder. + */ + error = xrep_parent_alloc_xattr_value(rp, valuelen); + if (error == -ENOMEM) + return -EDEADLOCK; + if (error) + return error; + + args.value = rp->xattr_value; + xfs_attr_sethash(&args); + return xfs_attr_get_ilocked(&args); +} + +/* Stash non-pptr attributes for later replay into the temporary file. */ +STATIC int +xrep_parent_stash_xattr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xrep_parent_xattr key = { + .valuelen = valuelen, + .namelen = namelen, + .flags = attr_flags & XFS_ATTR_NSP_ONDISK_MASK, + }; + struct xrep_parent *rp = priv; + int error; + + if (attr_flags & (XFS_ATTR_INCOMPLETE | XFS_ATTR_PARENT)) + return 0; + + if (!value) { + error = xrep_parent_fetch_xattr_remote(rp, ip, attr_flags, + name, namelen, valuelen); + if (error) + return error; + + value = rp->xattr_value; + } + + trace_xrep_parent_stash_xattr(rp->sc->tempip, key.flags, (void *)name, + key.namelen, key.valuelen); + + error = xfblob_store(rp->xattr_blobs, &key.name_cookie, name, + key.namelen); + if (error) + return error; + + error = xfblob_store(rp->xattr_blobs, &key.value_cookie, value, + key.valuelen); + if (error) + return error; + + return xfarray_append(rp->xattr_records, &key); +} + +/* Insert one xattr key/value. */ +STATIC int +xrep_parent_insert_xattr( + struct xrep_parent *rp, + const struct xrep_parent_xattr *key) +{ + struct xfs_da_args args = { + .dp = rp->sc->tempip, + .attr_filter = key->flags, + .namelen = key->namelen, + .valuelen = key->valuelen, + .owner = rp->sc->ip->i_ino, + .geo = rp->sc->mp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .op_flags = XFS_DA_OP_OKNOENT, + }; + int error; + + ASSERT(!(key->flags & XFS_ATTR_PARENT)); + + /* + * Grab pointers to the scrub buffer so that we can use them to insert + * attrs into the temp file. + */ + args.name = rp->xattr_name; + args.value = rp->xattr_value; + + /* + * The attribute name is stored near the end of the in-core buffer, + * though we reserve one more byte to ensure null termination. + */ + rp->xattr_name[XATTR_NAME_MAX] = 0; + + error = xfblob_load(rp->xattr_blobs, key->name_cookie, rp->xattr_name, + key->namelen); + if (error) + return error; + + error = xfblob_free(rp->xattr_blobs, key->name_cookie); + if (error) + return error; + + error = xfblob_load(rp->xattr_blobs, key->value_cookie, args.value, + key->valuelen); + if (error) + return error; + + error = xfblob_free(rp->xattr_blobs, key->value_cookie); + if (error) + return error; + + rp->xattr_name[key->namelen] = 0; + + trace_xrep_parent_insert_xattr(rp->sc->tempip, key->flags, + rp->xattr_name, key->namelen, key->valuelen); + + xfs_attr_sethash(&args); + return xfs_attr_set(&args, XFS_ATTRUPDATE_UPSERT, false); +} + +/* + * Periodically flush salvaged attributes to the temporary file. This is done + * to reduce the memory requirements of the xattr rebuild because files can + * contain millions of attributes. + */ +STATIC int +xrep_parent_flush_xattrs( + struct xrep_parent *rp) +{ + xfarray_idx_t array_cur; + int error; + + /* + * Entering this function, the scrub context has a reference to the + * inode being repaired, the temporary file, and the empty scrub + * transaction that we created for the xattr scan. We hold ILOCK_EXCL + * on the inode being repaired. + * + * To constrain kernel memory use, we occasionally flush salvaged + * xattrs from the xfarray and xfblob structures into the temporary + * file in preparation for exchanging the xattr structures at the end. + * Updating the temporary file requires a transaction, so we commit the + * scrub transaction and drop the ILOCK so that xfs_attr_set can + * allocate whatever transaction it wants. + * + * We still hold IOLOCK_EXCL on the inode being repaired, which + * prevents anyone from adding xattrs (or parent pointers) while we're + * flushing. + */ + xchk_trans_cancel(rp->sc); + xchk_iunlock(rp->sc, XFS_ILOCK_EXCL); + + /* + * Take the IOLOCK of the temporary file while we modify xattrs. This + * isn't strictly required because the temporary file is never revealed + * to userspace, but we follow the same locking rules. We still hold + * sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rp->sc); + if (error) + return error; + + /* Add all the salvaged attrs to the temporary file. */ + foreach_xfarray_idx(rp->xattr_records, array_cur) { + struct xrep_parent_xattr key; + + error = xfarray_load(rp->xattr_records, array_cur, &key); + if (error) + return error; + + error = xrep_parent_insert_xattr(rp, &key); + if (error) + return error; + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rp->xattr_records); + xfblob_truncate(rp->xattr_blobs); + + xrep_tempfile_iounlock(rp->sc); + + /* Recreate the empty transaction and relock the inode. */ + error = xchk_trans_alloc_empty(rp->sc); + if (error) + return error; + xchk_ilock(rp->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* Decide if we've stashed too much xattr data in memory. */ +static inline bool +xrep_parent_want_flush_xattrs( + struct xrep_parent *rp) +{ + unsigned long long bytes; + + bytes = xfarray_bytes(rp->xattr_records) + + xfblob_bytes(rp->xattr_blobs); + return bytes > XREP_PARENT_XATTR_MAX_STASH_BYTES; +} + +/* Flush staged attributes to the temporary file if we're over the limit. */ +STATIC int +xrep_parent_try_flush_xattrs( + struct xfs_scrub *sc, + void *priv) +{ + struct xrep_parent *rp = priv; + int error; + + if (!xrep_parent_want_flush_xattrs(rp)) + return 0; + + error = xrep_parent_flush_xattrs(rp); + if (error) + return error; + + /* + * If there were any parent pointer updates to the xattr structure + * while we dropped the ILOCK, the xattr structure is now stale. + * Signal to the attr copy process that we need to start over, but + * this time without opportunistic attr flushing. + * + * This is unlikely to happen, so we're ok with restarting the copy. + */ + mutex_lock(&rp->pscan.lock); + if (rp->saw_pptr_updates) + error = -ESTALE; + mutex_unlock(&rp->pscan.lock); + return error; +} + +/* Copy all the non-pptr extended attributes into the temporary file. */ +STATIC int +xrep_parent_copy_xattrs( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + int error; + + /* + * Clear the pptr updates flag. We hold sc->ip ILOCKed, so there + * can't be any parent pointer updates in progress. + */ + mutex_lock(&rp->pscan.lock); + rp->saw_pptr_updates = false; + mutex_unlock(&rp->pscan.lock); + + /* Copy xattrs, stopping periodically to flush the incore buffers. */ + error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr, + xrep_parent_try_flush_xattrs, rp); + if (error && error != -ESTALE) + return error; + + if (error == -ESTALE) { + /* + * The xattr copy collided with a parent pointer update. + * Restart the copy, but this time hold the ILOCK all the way + * to the end to lock out any directory parent pointer updates. + */ + error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr, + NULL, rp); + if (error) + return error; + } + + /* Flush any remaining stashed xattrs to the temporary file. */ + if (xfarray_bytes(rp->xattr_records) == 0) + return 0; + + return xrep_parent_flush_xattrs(rp); +} + +/* + * Ensure that @sc->ip and @sc->tempip both have attribute forks before we head + * into the attr fork exchange transaction. All files on a filesystem with + * parent pointers must have an attr fork because the parent pointer code does + * not itself add attribute forks. + * + * Note: Unlinkable unlinked files don't need one, but the overhead of having + * an unnecessary attr fork is not justified by the additional code complexity + * that would be needed to track that state correctly. + */ +STATIC int +xrep_parent_ensure_attr_fork( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + int error; + + error = xfs_attr_add_fork(sc->tempip, + sizeof(struct xfs_attr_sf_hdr), 1); + if (error) + return error; + return xfs_attr_add_fork(sc->ip, sizeof(struct xfs_attr_sf_hdr), 1); +} + +/* + * Finish replaying stashed parent pointer updates, allocate a transaction for + * exchanging extent mappings, and take the ILOCKs of both files before we + * commit the new attribute structure. + */ +STATIC int +xrep_parent_finalize_tempfile( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + int error; + + /* + * Repair relies on the ILOCK to quiesce all possible xattr updates. + * Replay all queued parent pointer updates into the tempfile before + * exchanging the contents, even if that means dropping the ILOCKs and + * the transaction. + */ + do { + error = xrep_parent_replay_updates(rp); + if (error) + return error; + + error = xrep_parent_ensure_attr_fork(rp); + if (error) + return error; + + error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rp->tx); + if (error) + return error; + + if (xfarray_length(rp->pptr_recs) == 0) + break; + + xchk_trans_cancel(sc); + xrep_tempfile_iunlock_both(sc); + } while (!xchk_should_terminate(sc, &error)); + return error; +} + +/* + * Replay all the stashed parent pointers into the temporary file, copy all + * the non-pptr xattrs from the file being repaired into the temporary file, + * and exchange the attr fork contents atomically. + */ +STATIC int +xrep_parent_rebuild_pptrs( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t parent_ino = NULLFSINO; + int error; + + /* + * Copy non-ppttr xattrs from the file being repaired into the + * temporary file's xattr structure. We hold sc->ip's IOLOCK, which + * prevents setxattr/removexattr calls from occurring, but renames + * update the parent pointers without holding IOLOCK. If we detect + * stale attr structures, we restart the scan but only flush at the + * end. + */ + error = xrep_parent_copy_xattrs(rp); + if (error) + return error; + + /* + * Cancel the empty transaction that we used to walk and copy attrs, + * and drop the ILOCK so that we can take the IOLOCK on the temporary + * file. We still hold sc->ip's IOLOCK. + */ + xchk_trans_cancel(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + error = xrep_tempfile_iolock_polled(sc); + if (error) + return error; + + /* + * Allocate transaction, lock inodes, and make sure that we've replayed + * all the stashed pptr updates to the tempdir. After this point, + * we're ready to exchange the attr fork mappings. + */ + error = xrep_parent_finalize_tempfile(rp); + if (error) + return error; + + /* Last chance to abort before we start committing pptr fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + if (xchk_iscan_aborted(&rp->pscan.iscan)) + return -ECANCELED; + + /* + * Exchange the attr fork contents and junk the old attr fork contents, + * which are now in the tempfile. + */ + error = xrep_xattr_swap(sc, &rp->tx); + if (error) + return error; + error = xrep_xattr_reset_tempfile_fork(sc); + if (error) + return error; + + /* + * Roll to get a transaction without any inodes joined to it. Then we + * can drop the tempfile's ILOCK and IOLOCK before doing more work on + * the scrub target file. + */ + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + xrep_tempfile_iunlock(sc); + xrep_tempfile_iounlock(sc); + + /* + * We've committed the new parent pointers. Find at least one parent + * so that we can decide if we're moving this file to the orphanage. + * For this purpose, root directories are their own parents. + */ + if (sc->ip == sc->mp->m_rootip) { + xrep_findparent_scan_found(&rp->pscan, sc->ip->i_ino); + } else { + error = xrep_parent_lookup_pptrs(sc, &parent_ino); + if (error) + return error; + if (parent_ino != NULLFSINO) + xrep_findparent_scan_found(&rp->pscan, parent_ino); + } + return 0; +} + +/* + * Commit the new parent pointer structure (currently only the dotdot entry) to + * the file that we're repairing. + */ +STATIC int +xrep_parent_rebuild_tree( + struct xrep_parent *rp) +{ + int error; + + if (xfs_has_parent(rp->sc->mp)) { + error = xrep_parent_rebuild_pptrs(rp); + if (error) + return error; + } + + if (rp->pscan.parent_ino == NULLFSINO) { + if (xrep_orphanage_can_adopt(rp->sc)) + return xrep_parent_move_to_orphanage(rp); + return -EFSCORRUPTED; + } + + if (S_ISDIR(VFS_I(rp->sc->ip)->i_mode)) + return xrep_parent_reset_dotdot(rp); + + return 0; +} + +/* Count the number of parent pointers. */ +STATIC int +xrep_parent_count_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xrep_parent *rp = priv; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, NULL, NULL); + if (error) + return error; + + rp->parents++; + return 0; +} + +/* + * After all parent pointer rebuilding and adoption activity completes, reset + * the link count of this nondirectory, having scanned the fs to rebuild all + * parent pointers. + */ +STATIC int +xrep_parent_set_nondir_nlink( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + struct xfs_inode *ip = sc->ip; + struct xfs_perag *pag; + bool joined = false; + int error; + + /* Count parent pointers so we can reset the file link count. */ + rp->parents = 0; + error = xchk_xattr_walk(sc, ip, xrep_parent_count_pptr, NULL, rp); + if (error) + return error; + + if (rp->parents > 0 && xfs_inode_on_unlinked_list(ip)) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + + /* + * The file is on the unlinked list but we found parents. + * Remove the file from the unlinked list. + */ + pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, ip->i_ino)); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xfs_iunlink_remove(sc->tp, pag, ip); + xfs_perag_put(pag); + if (error) + return error; + } else if (rp->parents == 0 && !xfs_inode_on_unlinked_list(ip)) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + + /* + * The file is not on the unlinked list but we found no + * parents. Add the file to the unlinked list. + */ + error = xfs_iunlink(sc->tp, ip); + if (error) + return error; + } + + /* Set the correct link count. */ + if (VFS_I(ip)->i_nlink != rp->parents) { + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + set_nlink(VFS_I(ip), min_t(unsigned long long, rp->parents, + XFS_NLINK_PINNED)); + } + + /* Log the inode to keep it moving forward if we dirtied anything. */ + if (joined) + xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE); + return 0; +} + +/* Set up the filesystem scan so we can look for parents. */ +STATIC int +xrep_parent_setup_scan( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + char *descr; + struct xfs_da_geometry *geo = sc->mp->m_attr_geo; + int max_len; + int error; + + if (!xfs_has_parent(sc->mp)) + return xrep_findparent_scan_start(sc, &rp->pscan); + + /* Buffers for copying non-pptr attrs to the tempfile */ + rp->xattr_name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS); + if (!rp->xattr_name) + return -ENOMEM; + + /* + * Allocate enough memory to handle loading local attr values from the + * xfblob data while flushing stashed attrs to the temporary file. + * We only realloc the buffer when salvaging remote attr values, so + * TRY_HARDER means we allocate the maximal attr value size. + */ + if (sc->flags & XCHK_TRY_HARDER) + max_len = XATTR_SIZE_MAX; + else + max_len = xfs_attr_leaf_entsize_local_max(geo->blksize); + error = xrep_parent_alloc_xattr_value(rp, max_len); + if (error) + goto out_xattr_name; + + /* Set up some staging memory for logging parent pointer updates. */ + descr = xchk_xfile_ino_descr(sc, "parent pointer entries"); + error = xfarray_create(descr, 0, sizeof(struct xrep_pptr), + &rp->pptr_recs); + kfree(descr); + if (error) + goto out_xattr_value; + + descr = xchk_xfile_ino_descr(sc, "parent pointer names"); + error = xfblob_create(descr, &rp->pptr_names); + kfree(descr); + if (error) + goto out_recs; + + /* Set up some storage for copying attrs before the mapping exchange */ + descr = xchk_xfile_ino_descr(sc, + "parent pointer retained xattr entries"); + error = xfarray_create(descr, 0, sizeof(struct xrep_parent_xattr), + &rp->xattr_records); + kfree(descr); + if (error) + goto out_names; + + descr = xchk_xfile_ino_descr(sc, + "parent pointer retained xattr values"); + error = xfblob_create(descr, &rp->xattr_blobs); + kfree(descr); + if (error) + goto out_attr_keys; + + error = __xrep_findparent_scan_start(sc, &rp->pscan, + xrep_parent_live_update); + if (error) + goto out_attr_values; + + return 0; + +out_attr_values: + xfblob_destroy(rp->xattr_blobs); + rp->xattr_blobs = NULL; +out_attr_keys: + xfarray_destroy(rp->xattr_records); + rp->xattr_records = NULL; +out_names: + xfblob_destroy(rp->pptr_names); + rp->pptr_names = NULL; +out_recs: + xfarray_destroy(rp->pptr_recs); + rp->pptr_recs = NULL; +out_xattr_value: + kvfree(rp->xattr_value); + rp->xattr_value = NULL; +out_xattr_name: + kvfree(rp->xattr_name); + rp->xattr_name = NULL; + return error; +} + +int +xrep_parent( + struct xfs_scrub *sc) +{ + struct xrep_parent *rp = sc->buf; + int error; + + /* + * When the parent pointers feature is enabled, repairs are committed + * by atomically committing a new xattr structure and reaping the old + * attr fork. Reaping requires rmap and exchange-range to be enabled. + */ + if (xfs_has_parent(sc->mp)) { + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + } + + error = xrep_parent_setup_scan(rp); + if (error) + return error; + + if (xfs_has_parent(sc->mp)) + error = xrep_parent_scan_dirtree(rp); + else + error = xrep_parent_find_dotdot(rp); + if (error) + goto out_teardown; + + /* Last chance to abort before we start committing dotdot fixes. */ + if (xchk_should_terminate(sc, &error)) + goto out_teardown; + + error = xrep_parent_rebuild_tree(rp); + if (error) + goto out_teardown; + if (xfs_has_parent(sc->mp) && !S_ISDIR(VFS_I(sc->ip)->i_mode)) { + error = xrep_parent_set_nondir_nlink(rp); + if (error) + goto out_teardown; + } + + error = xrep_defer_finish(sc); + +out_teardown: + xrep_parent_teardown(rp); + return error; +} diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 5671c8153433..183d531875ea 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -6,6 +6,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" @@ -17,9 +18,10 @@ #include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/quota.h" /* Convert a scrub type code to a DQ flag, or return 0 if error. */ -static inline xfs_dqtype_t +xfs_dqtype_t xchk_quota_to_dqtype( struct xfs_scrub *sc) { @@ -75,14 +77,70 @@ struct xchk_quota_info { xfs_dqid_t last_id; }; +/* There's a written block backing this dquot, right? */ +STATIC int +xchk_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + xfs_fileoff_t offset) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + if (!xfs_verify_fileoff(mp, offset)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (dq->q_fileoffset != offset) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps != 1) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (!xfs_verify_fsbno(mp, irec.br_startblock)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (XFS_FSB_TO_DADDR(mp, irec.br_startblock) != dq->q_blkno) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (!xfs_bmap_is_written_extent(&irec)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + return 0; +} + +/* Complain if a quota timer is incorrectly set. */ +static inline void +xchk_quota_item_timer( + struct xfs_scrub *sc, + xfs_fileoff_t offset, + const struct xfs_dquot_res *res) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } else { + if (res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } +} + /* Scrub the fields in an individual quota item. */ STATIC int xchk_quota_item( - struct xfs_dquot *dq, - xfs_dqtype_t dqtype, - void *priv) + struct xchk_quota_info *sqi, + struct xfs_dquot *dq) { - struct xchk_quota_info *sqi = priv; struct xfs_scrub *sc = sqi->sc; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -94,6 +152,17 @@ xchk_quota_item( return error; /* + * We want to validate the bmap record for the storage backing this + * dquot, so we need to lock the dquot and the quota file. For quota + * operations, the locking order is first the ILOCK and then the dquot. + * However, dqiterate gave us a locked dquot, so drop the dquot lock to + * get the ILOCK. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_SHARED); + xfs_dqlock(dq); + + /* * Except for the root dquot, the actual dquot we got must either have * the same or higher id as we saw before. */ @@ -103,6 +172,11 @@ xchk_quota_item( sqi->last_id = dq->q_id; + error = xchk_quota_item_bmap(sc, dq, offset); + xchk_iunlock(sc, XFS_ILOCK_SHARED); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error)) + return error; + /* * Warn if the hard limits are larger than the fs. * Administrators can do this, though in production this seems @@ -166,6 +240,10 @@ xchk_quota_item( dq->q_rtb.count > dq->q_rtb.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + xchk_quota_item_timer(sc, offset, &dq->q_blk); + xchk_quota_item_timer(sc, offset, &dq->q_ino); + xchk_quota_item_timer(sc, offset, &dq->q_rtb); + out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return -ECANCELED; @@ -191,7 +269,7 @@ xchk_quota_data_fork( return error; /* Check for data fork problems that apply only to quota files. */ - max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) @@ -218,9 +296,11 @@ int xchk_quota( struct xfs_scrub *sc) { - struct xchk_quota_info sqi; + struct xchk_dqiter cursor = { }; + struct xchk_quota_info sqi = { .sc = sc }; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_dquot *dq; xfs_dqtype_t dqtype; int error = 0; @@ -239,10 +319,15 @@ xchk_quota( * functions. */ xchk_iunlock(sc, sc->ilock_flags); - sqi.sc = sc; - sqi.last_id = 0; - error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi); - xchk_ilock(sc, XFS_ILOCK_EXCL); + + /* Now look for things that the quota verifiers won't complain about. */ + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xchk_quota_item(&sqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } if (error == -ECANCELED) error = 0; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, diff --git a/fs/xfs/scrub/quota.h b/fs/xfs/scrub/quota.h new file mode 100644 index 000000000000..6c7134ce2385 --- /dev/null +++ b/fs/xfs/scrub/quota.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_QUOTA_H__ +#define __XFS_SCRUB_QUOTA_H__ + +xfs_dqtype_t xchk_quota_to_dqtype(struct xfs_scrub *sc); + +/* dquot iteration code */ + +struct xchk_dqiter { + struct xfs_scrub *sc; + + /* Quota file that we're walking. */ + struct xfs_inode *quota_ip; + + /* Cached data fork mapping for the dquot. */ + struct xfs_bmbt_irec bmap; + + /* The next dquot to scan. */ + uint64_t id; + + /* Quota type (user/group/project). */ + xfs_dqtype_t dqtype; + + /* Data fork sequence number to detect stale mappings. */ + unsigned int if_seq; +}; + +void xchk_dqiter_init(struct xchk_dqiter *cursor, struct xfs_scrub *sc, + xfs_dqtype_t dqtype); +int xchk_dquot_iter(struct xchk_dqiter *cursor, struct xfs_dquot **dqpp); + +#endif /* __XFS_SCRUB_QUOTA_H__ */ diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c new file mode 100644 index 000000000000..cd51f10f2920 --- /dev/null +++ b/fs/xfs/scrub/quota_repair.c @@ -0,0 +1,568 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_dquot.h" +#include "xfs_dquot_item.h" +#include "xfs_reflink.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Quota Repair + * ============ + * + * Quota repairs are fairly simplistic; we fix everything that the dquot + * verifiers complain about, cap any counters or limits that make no sense, + * and schedule a quotacheck if we had to fix anything. We also repair any + * data fork extent records that don't apply to metadata files. + */ + +struct xrep_quota_info { + struct xfs_scrub *sc; + bool need_quotacheck; +}; + +/* + * Allocate a new block into a sparse hole in the quota file backing this + * dquot, initialize the block, and commit the whole mess. + */ +STATIC int +xrep_quota_item_fill_bmap_hole( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + struct xfs_bmbt_irec *irec) +{ + struct xfs_buf *bp; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Map a block into the file. */ + error = xfs_trans_reserve_more(sc->tp, XFS_QM_DQALLOC_SPACE_RES(mp), + 0); + if (error) + return error; + + error = xfs_bmapi_write(sc->tp, sc->ip, dq->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, + irec, &nmaps); + if (error) + return error; + + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock); + + trace_xrep_dquot_item_fill_bmap_hole(sc->mp, dq->q_type, dq->q_id); + + /* Initialize the new block. */ + error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, dq->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) + return error; + bp->b_ops = &xfs_dquot_buf_ops; + + xfs_qm_init_dquot_blk(sc->tp, dq->q_id, dq->q_type, bp); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); + + /* + * Finish the mapping transactions and roll one more time to + * disconnect sc->ip from sc->tp. + */ + error = xrep_defer_finish(sc); + if (error) + return error; + return xfs_trans_roll(&sc->tp); +} + +/* Make sure there's a written block backing this dquot */ +STATIC int +xrep_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + bool *dirty) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + struct xfs_quotainfo *qi = mp->m_quotainfo; + xfs_fileoff_t offset = dq->q_id / qi->qi_dqperchunk; + int nmaps = 1; + int error; + + /* The computed file offset should always be valid. */ + if (!xfs_verify_fileoff(mp, offset)) { + ASSERT(xfs_verify_fileoff(mp, offset)); + return -EFSCORRUPTED; + } + dq->q_fileoffset = offset; + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps < 1 || !xfs_bmap_is_real_extent(&irec)) { + /* Hole/delalloc extent; allocate a real block. */ + error = xrep_quota_item_fill_bmap_hole(sc, dq, &irec); + if (error) + return error; + } else if (irec.br_state != XFS_EXT_NORM) { + /* Unwritten extent, which we already took care of? */ + ASSERT(irec.br_state == XFS_EXT_NORM); + return -EFSCORRUPTED; + } else if (dq->q_blkno != XFS_FSB_TO_DADDR(mp, irec.br_startblock)) { + /* + * If the cached daddr is incorrect, repair probably punched a + * hole out of the quota file and filled it back in with a new + * block. Update the block mapping in the dquot. + */ + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec.br_startblock); + } + + *dirty = true; + return 0; +} + +/* Reset quota timers if incorrectly set. */ +static inline void +xrep_quota_item_timer( + struct xfs_scrub *sc, + const struct xfs_dquot_res *res, + bool *dirty) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + *dirty = true; + } else { + if (res->timer) + *dirty = true; + } +} + +/* Scrub the fields in an individual quota item. */ +STATIC int +xrep_quota_item( + struct xrep_quota_info *rqi, + struct xfs_dquot *dq) +{ + struct xfs_scrub *sc = rqi->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t fs_icount; + bool dirty = false; + int error = 0; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * We might need to fix holes in the bmap record for the storage + * backing this dquot, so we need to lock the dquot and the quota file. + * dqiterate gave us a locked dquot, so drop the dquot lock to get the + * ILOCK_EXCL. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_dqlock(dq); + + error = xrep_quota_item_bmap(sc, dq, &dirty); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + if (error) + return error; + + /* Check the limits. */ + if (dq->q_blk.softlimit > dq->q_blk.hardlimit) { + dq->q_blk.softlimit = dq->q_blk.hardlimit; + dirty = true; + } + + if (dq->q_ino.softlimit > dq->q_ino.hardlimit) { + dq->q_ino.softlimit = dq->q_ino.hardlimit; + dirty = true; + } + + if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) { + dq->q_rtb.softlimit = dq->q_rtb.hardlimit; + dirty = true; + } + + /* + * Check that usage doesn't exceed physical limits. However, on + * a reflink filesystem we're allowed to exceed physical space + * if there are no quota limits. We don't know what the real number + * is, but we can make quotacheck find out for us. + */ + if (!xfs_has_reflink(mp) && dq->q_blk.count > mp->m_sb.sb_dblocks) { + dq->q_blk.reserved -= dq->q_blk.count; + dq->q_blk.reserved += mp->m_sb.sb_dblocks; + dq->q_blk.count = mp->m_sb.sb_dblocks; + rqi->need_quotacheck = true; + dirty = true; + } + fs_icount = percpu_counter_sum(&mp->m_icount); + if (dq->q_ino.count > fs_icount) { + dq->q_ino.reserved -= dq->q_ino.count; + dq->q_ino.reserved += fs_icount; + dq->q_ino.count = fs_icount; + rqi->need_quotacheck = true; + dirty = true; + } + if (dq->q_rtb.count > mp->m_sb.sb_rblocks) { + dq->q_rtb.reserved -= dq->q_rtb.count; + dq->q_rtb.reserved += mp->m_sb.sb_rblocks; + dq->q_rtb.count = mp->m_sb.sb_rblocks; + rqi->need_quotacheck = true; + dirty = true; + } + + xrep_quota_item_timer(sc, &dq->q_blk, &dirty); + xrep_quota_item_timer(sc, &dq->q_ino, &dirty); + xrep_quota_item_timer(sc, &dq->q_rtb, &dirty); + + if (!dirty) + return 0; + + trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id); + + dq->q_flags |= XFS_DQFLAG_DIRTY; + xfs_trans_dqjoin(sc->tp, dq); + if (dq->q_id) { + xfs_qm_adjust_dqlimits(dq); + xfs_qm_adjust_dqtimers(dq); + } + xfs_trans_log_dquot(sc->tp, dq); + error = xfs_trans_roll(&sc->tp); + xfs_dqlock(dq); + return error; +} + +/* Fix a quota timer so that we can pass the verifier. */ +STATIC void +xrep_quota_fix_timer( + struct xfs_mount *mp, + const struct xfs_disk_dquot *ddq, + __be64 softlimit, + __be64 countnow, + __be32 *timer, + time64_t timelimit) +{ + uint64_t soft = be64_to_cpu(softlimit); + uint64_t count = be64_to_cpu(countnow); + time64_t new_timer; + uint32_t t; + + if (!soft || count <= soft || *timer != 0) + return; + + new_timer = xfs_dquot_set_timeout(mp, + ktime_get_real_seconds() + timelimit); + if (ddq->d_type & XFS_DQTYPE_BIGTIME) + t = xfs_dq_unix_to_bigtime(new_timer); + else + t = new_timer; + + *timer = cpu_to_be32(t); +} + +/* Fix anything the verifiers complain about. */ +STATIC int +xrep_quota_block( + struct xfs_scrub *sc, + xfs_daddr_t daddr, + xfs_dqtype_t dqtype, + xfs_dqid_t id) +{ + struct xfs_dqblk *dqblk; + struct xfs_disk_dquot *ddq; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_def_quota *defq = xfs_get_defquota(qi, dqtype); + struct xfs_buf *bp = NULL; + enum xfs_blft buftype = 0; + int i; + int error; + + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, daddr, + qi->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); + switch (error) { + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Failed verifier, retry read with no ops. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, + sc->mp->m_ddev_targp, daddr, qi->qi_dqchunklen, + 0, &bp, NULL); + if (error) + return error; + break; + case 0: + dqblk = bp->b_addr; + ddq = &dqblk[0].dd_diskdq; + + /* + * If there's nothing that would impede a dqiterate, we're + * done. + */ + if ((ddq->d_type & XFS_DQTYPE_REC_MASK) != dqtype || + id == be32_to_cpu(ddq->d_id)) { + xfs_trans_brelse(sc->tp, bp); + return 0; + } + break; + default: + return error; + } + + /* Something's wrong with the block, fix the whole thing. */ + dqblk = bp->b_addr; + bp->b_ops = &xfs_dquot_buf_ops; + for (i = 0; i < qi->qi_dqperchunk; i++, dqblk++) { + ddq = &dqblk->dd_diskdq; + + trace_xrep_disk_dquot(sc->mp, dqtype, id + i); + + ddq->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + ddq->d_version = XFS_DQUOT_VERSION; + ddq->d_type = dqtype; + ddq->d_id = cpu_to_be32(id + i); + + if (xfs_has_bigtime(sc->mp) && ddq->d_id) + ddq->d_type |= XFS_DQTYPE_BIGTIME; + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_blk_softlimit, + ddq->d_bcount, &ddq->d_btimer, + defq->blk.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_ino_softlimit, + ddq->d_icount, &ddq->d_itimer, + defq->ino.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_rtb_softlimit, + ddq->d_rtbcount, &ddq->d_rtbtimer, + defq->rtb.time); + + /* We only support v5 filesystems so always set these. */ + uuid_copy(&dqblk->dd_uuid, &sc->mp->m_sb.sb_meta_uuid); + xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + dqblk->dd_lsn = 0; + } + switch (dqtype) { + case XFS_DQTYPE_USER: + buftype = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_DQTYPE_GROUP: + buftype = XFS_BLFT_GDQUOT_BUF; + break; + case XFS_DQTYPE_PROJ: + buftype = XFS_BLFT_PDQUOT_BUF; + break; + } + xfs_trans_buf_set_type(sc->tp, bp, buftype); + xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); + return xrep_roll_trans(sc); +} + +/* + * Repair a quota file's data fork. The function returns with the inode + * joined. + */ +STATIC int +xrep_quota_data_fork( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xfs_bmbt_irec irec = { 0 }; + struct xfs_iext_cursor icur; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_ifork *ifp; + xfs_fileoff_t max_dqid_off; + xfs_fileoff_t off; + xfs_fsblock_t fsbno; + bool truncate = false; + bool joined = false; + int error = 0; + + error = xrep_metadata_inode_forks(sc); + if (error) + goto out; + + /* Check for data fork problems that apply only to quota files. */ + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + for_each_xfs_iext(ifp, &icur, &irec) { + if (isnullstartblock(irec.br_startblock)) { + error = -EFSCORRUPTED; + goto out; + } + + if (irec.br_startoff > max_dqid_off || + irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { + truncate = true; + break; + } + + /* Convert unwritten extents to real ones. */ + if (irec.br_state == XFS_EXT_UNWRITTEN) { + struct xfs_bmbt_irec nrec; + int nmap = 1; + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + error = xfs_bmapi_write(sc->tp, sc->ip, + irec.br_startoff, irec.br_blockcount, + XFS_BMAPI_CONVERT, 0, &nrec, &nmap); + if (error) + goto out; + ASSERT(nrec.br_startoff == irec.br_startoff); + ASSERT(nrec.br_blockcount == irec.br_blockcount); + + error = xfs_defer_finish(&sc->tp); + if (error) + goto out; + } + } + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + if (truncate) { + /* Erase everything after the block containing the max dquot */ + error = xfs_bunmapi_range(&sc->tp, sc->ip, 0, + max_dqid_off * sc->mp->m_sb.sb_blocksize, + XFS_MAX_FILEOFF); + if (error) + goto out; + + /* Remove all CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(sc->ip, &sc->tp, 0, + XFS_MAX_FILEOFF, true); + if (error) + goto out; + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + + /* + * Always re-log the inode so that our permanent transaction + * can keep on rolling it forward in the log. + */ + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + } + + /* Now go fix anything that fails the verifiers. */ + for_each_xfs_iext(ifp, &icur, &irec) { + for (fsbno = irec.br_startblock, off = irec.br_startoff; + fsbno < irec.br_startblock + irec.br_blockcount; + fsbno += XFS_DQUOT_CLUSTER_SIZE_FSB, + off += XFS_DQUOT_CLUSTER_SIZE_FSB) { + error = xrep_quota_block(sc, + XFS_FSB_TO_DADDR(sc->mp, fsbno), + dqtype, off * qi->qi_dqperchunk); + if (error) + goto out; + } + } + +out: + return error; +} + +/* + * Go fix anything in the quota items that we could have been mad about. Now + * that we've checked the quota inode data fork we have to drop ILOCK_EXCL to + * use the regular dquot functions. + */ +STATIC int +xrep_quota_problems( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xchk_dqiter cursor = { }; + struct xrep_quota_info rqi = { .sc = sc }; + struct xfs_dquot *dq; + int error; + + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xrep_quota_item(&rqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } + if (error) + return error; + + /* Make a quotacheck happen. */ + if (rqi.need_quotacheck) + xrep_force_quotacheck(sc, dqtype); + return 0; +} + +/* Repair all of a quota type's items. */ +int +xrep_quota( + struct xfs_scrub *sc) +{ + xfs_dqtype_t dqtype; + int error; + + dqtype = xchk_quota_to_dqtype(sc); + + /* + * Re-take the ILOCK so that we can fix any problems that we found + * with the data fork mappings, or with the dquot bufs themselves. + */ + if (!(sc->ilock_flags & XFS_ILOCK_EXCL)) + xchk_ilock(sc, XFS_ILOCK_EXCL); + error = xrep_quota_data_fork(sc, dqtype); + if (error) + return error; + + /* + * Finish deferred items and roll the transaction to unjoin the quota + * inode from transaction so that we can unlock the quota inode; we + * play only with dquots from now on. + */ + error = xrep_defer_finish(sc); + if (error) + return error; + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + xchk_iunlock(sc, sc->ilock_flags); + + /* Fix anything the dquot verifiers don't complain about. */ + error = xrep_quota_problems(sc, dqtype); + if (error) + return error; + + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c new file mode 100644 index 000000000000..c77eb2de8df7 --- /dev/null +++ b/fs/xfs/scrub/quotacheck.c @@ -0,0 +1,867 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/quota.h" +#include "scrub/quotacheck.h" +#include "scrub/trace.h" + +/* + * Live Quotacheck + * =============== + * + * Quota counters are "summary" metadata, in the sense that they are computed + * as the summation of the block usage counts for every file on the filesystem. + * Therefore, we compute the correct icount, bcount, and rtbcount values by + * creating a shadow quota counter structure and walking every inode. + */ + +/* Track the quota deltas for a dquot in a transaction. */ +struct xqcheck_dqtrx { + xfs_dqtype_t q_type; + xfs_dqid_t q_id; + + int64_t icount_delta; + + int64_t bcount_delta; + int64_t delbcnt_delta; + + int64_t rtbcount_delta; + int64_t delrtb_delta; +}; + +#define XQCHECK_MAX_NR_DQTRXS (XFS_QM_TRANS_DQTYPES * XFS_QM_TRANS_MAXDQS) + +/* + * Track the quota deltas for all dquots attached to a transaction if the + * quota deltas are being applied to an inode that we already scanned. + */ +struct xqcheck_dqacct { + struct rhash_head hash; + uintptr_t tx_id; + struct xqcheck_dqtrx dqtrx[XQCHECK_MAX_NR_DQTRXS]; + unsigned int refcount; +}; + +/* Free a shadow dquot accounting structure. */ +static void +xqcheck_dqacct_free( + void *ptr, + void *arg) +{ + struct xqcheck_dqacct *dqa = ptr; + + kfree(dqa); +} + +/* Set us up to scrub quota counters. */ +int +xchk_setup_quotacheck( + struct xfs_scrub *sc) +{ + if (!XFS_IS_QUOTA_ON(sc->mp)) + return -ENOENT; + + xchk_fsgates_enable(sc, XCHK_FSGATES_QUOTA); + + sc->buf = kzalloc(sizeof(struct xqcheck), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + return xchk_setup_fs(sc); +} + +/* + * Part 1: Collecting dquot resource usage counts. For each xfs_dquot attached + * to each inode, we create a shadow dquot, and compute the inode count and add + * the data/rt block usage from what we see. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information. + * + * Because we are scanning a live filesystem, it's possible that another thread + * will try to update the quota counters for an inode that we've already + * scanned. This will cause our counts to be incorrect. Therefore, we hook + * the live transaction code in two places: (1) when the callers update the + * per-transaction dqtrx structure to log quota counter updates; and (2) when + * transaction commit actually logs those updates to the incore dquot. By + * shadowing transaction updates in this manner, live quotacheck can ensure + * by locking the dquot and the shadow structure that its own copies are not + * out of date. Because the hook code runs in a different process context from + * the scrub code and the scrub state flags are not accessed atomically, + * failures in the hook code must abort the iscan and the scrubber must notice + * the aborted scan and set the incomplete flag. + * + * Note that we use srcu notifier hooks to minimize the overhead when live + * quotacheck is /not/ running. + */ + +/* Update an incore dquot counter information from a live update. */ +static int +xqcheck_update_incore_counts( + struct xqcheck *xqc, + struct xfarray *counts, + xfs_dqid_t id, + int64_t inodes, + int64_t nblks, + int64_t rtblks) +{ + struct xqcheck_dquot xcdq; + int error; + + error = xfarray_load_sparse(counts, id, &xcdq); + if (error) + return error; + + xcdq.flags |= XQCHECK_DQUOT_WRITTEN; + xcdq.icount += inodes; + xcdq.bcount += nblks; + xcdq.rtbcount += rtblks; + + error = xfarray_store(counts, id, &xcdq); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. + */ + error = -ECANCELED; + } + return error; +} + +/* Decide if this is the shadow dquot accounting structure for a transaction. */ +static int +xqcheck_dqacct_obj_cmpfn( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const uintptr_t *tx_idp = arg->key; + const struct xqcheck_dqacct *dqa = obj; + + if (dqa->tx_id != *tx_idp) + return 1; + return 0; +} + +static const struct rhashtable_params xqcheck_dqacct_hash_params = { + .min_size = 32, + .key_len = sizeof(uintptr_t), + .key_offset = offsetof(struct xqcheck_dqacct, tx_id), + .head_offset = offsetof(struct xqcheck_dqacct, hash), + .automatic_shrinking = true, + .obj_cmpfn = xqcheck_dqacct_obj_cmpfn, +}; + +/* Find a shadow dqtrx slot for the given dquot. */ +STATIC struct xqcheck_dqtrx * +xqcheck_get_dqtrx( + struct xqcheck_dqacct *dqa, + xfs_dqtype_t q_type, + xfs_dqid_t q_id) +{ + int i; + + for (i = 0; i < XQCHECK_MAX_NR_DQTRXS; i++) { + if (dqa->dqtrx[i].q_type == 0 || + (dqa->dqtrx[i].q_type == q_type && + dqa->dqtrx[i].q_id == q_id)) + return &dqa->dqtrx[i]; + } + + return NULL; +} + +/* + * Create and fill out a quota delta tracking structure to shadow the updates + * going on in the regular quota code. + */ +static int +xqcheck_mod_live_ino_dqtrx( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_mod_ino_dqtrx_params *p = data; + struct xqcheck *xqc; + struct xqcheck_dqacct *dqa; + struct xqcheck_dqtrx *dqtrx; + int error; + + xqc = container_of(nb, struct xqcheck, qhook.mod_hook.nb); + + /* Skip quota reservation fields. */ + switch (action) { + case XFS_TRANS_DQ_BCOUNT: + case XFS_TRANS_DQ_DELBCOUNT: + case XFS_TRANS_DQ_ICOUNT: + case XFS_TRANS_DQ_RTBCOUNT: + case XFS_TRANS_DQ_DELRTBCOUNT: + break; + default: + return NOTIFY_DONE; + } + + /* Ignore dqtrx updates for quota types we don't care about. */ + switch (p->q_type) { + case XFS_DQTYPE_USER: + if (!xqc->ucounts) + return NOTIFY_DONE; + break; + case XFS_DQTYPE_GROUP: + if (!xqc->gcounts) + return NOTIFY_DONE; + break; + case XFS_DQTYPE_PROJ: + if (!xqc->pcounts) + return NOTIFY_DONE; + break; + default: + return NOTIFY_DONE; + } + + /* Skip inodes that haven't been scanned yet. */ + if (!xchk_iscan_want_live_update(&xqc->iscan, p->ino)) + return NOTIFY_DONE; + + /* Make a shadow quota accounting tracker for this transaction. */ + mutex_lock(&xqc->lock); + dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id, + xqcheck_dqacct_hash_params); + if (!dqa) { + dqa = kzalloc(sizeof(struct xqcheck_dqacct), XCHK_GFP_FLAGS); + if (!dqa) + goto out_abort; + + dqa->tx_id = p->tx_id; + error = rhashtable_insert_fast(&xqc->shadow_dquot_acct, + &dqa->hash, xqcheck_dqacct_hash_params); + if (error) + goto out_abort; + } + + /* Find the shadow dqtrx (or an empty slot) here. */ + dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id); + if (!dqtrx) + goto out_abort; + if (dqtrx->q_type == 0) { + dqtrx->q_type = p->q_type; + dqtrx->q_id = p->q_id; + dqa->refcount++; + } + + /* Update counter */ + switch (action) { + case XFS_TRANS_DQ_BCOUNT: + dqtrx->bcount_delta += p->delta; + break; + case XFS_TRANS_DQ_DELBCOUNT: + dqtrx->delbcnt_delta += p->delta; + break; + case XFS_TRANS_DQ_ICOUNT: + dqtrx->icount_delta += p->delta; + break; + case XFS_TRANS_DQ_RTBCOUNT: + dqtrx->rtbcount_delta += p->delta; + break; + case XFS_TRANS_DQ_DELRTBCOUNT: + dqtrx->delrtb_delta += p->delta; + break; + } + + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; + +out_abort: + xchk_iscan_abort(&xqc->iscan); + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; +} + +/* + * Apply the transaction quota deltas to our shadow quota accounting info when + * the regular quota code are doing the same. + */ +static int +xqcheck_apply_live_dqtrx( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_apply_dqtrx_params *p = data; + struct xqcheck *xqc; + struct xqcheck_dqacct *dqa; + struct xqcheck_dqtrx *dqtrx; + struct xfarray *counts; + int error; + + xqc = container_of(nb, struct xqcheck, qhook.apply_hook.nb); + + /* Map the dquot type to an incore counter object. */ + switch (p->q_type) { + case XFS_DQTYPE_USER: + counts = xqc->ucounts; + break; + case XFS_DQTYPE_GROUP: + counts = xqc->gcounts; + break; + case XFS_DQTYPE_PROJ: + counts = xqc->pcounts; + break; + default: + return NOTIFY_DONE; + } + + if (xchk_iscan_aborted(&xqc->iscan) || counts == NULL) + return NOTIFY_DONE; + + /* + * Find the shadow dqtrx for this transaction and dquot, if any deltas + * need to be applied here. If not, we're finished early. + */ + mutex_lock(&xqc->lock); + dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id, + xqcheck_dqacct_hash_params); + if (!dqa) + goto out_unlock; + dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id); + if (!dqtrx || dqtrx->q_type == 0) + goto out_unlock; + + /* Update our shadow dquot if we're committing. */ + if (action == XFS_APPLY_DQTRX_COMMIT) { + error = xqcheck_update_incore_counts(xqc, counts, p->q_id, + dqtrx->icount_delta, + dqtrx->bcount_delta + dqtrx->delbcnt_delta, + dqtrx->rtbcount_delta + dqtrx->delrtb_delta); + if (error) + goto out_abort; + } + + /* Free the shadow accounting structure if that was the last user. */ + dqa->refcount--; + if (dqa->refcount == 0) { + error = rhashtable_remove_fast(&xqc->shadow_dquot_acct, + &dqa->hash, xqcheck_dqacct_hash_params); + if (error) + goto out_abort; + xqcheck_dqacct_free(dqa, NULL); + } + + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; + +out_abort: + xchk_iscan_abort(&xqc->iscan); +out_unlock: + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; +} + +/* Record this inode's quota usage in our shadow quota counter data. */ +STATIC int +xqcheck_collect_inode( + struct xqcheck *xqc, + struct xfs_inode *ip) +{ + struct xfs_trans *tp = xqc->sc->tp; + xfs_filblks_t nblks, rtblks; + uint ilock_flags = 0; + xfs_dqid_t id; + bool isreg = S_ISREG(VFS_I(ip)->i_mode); + int error = 0; + + if (xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) { + /* + * Quota files are never counted towards quota, so we do not + * need to take the lock. + */ + xchk_iscan_mark_visited(&xqc->iscan, ip); + return 0; + } + + /* Figure out the data / rt device block counts. */ + xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (isreg) + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * Read in the data fork for rt files so that _count_blocks + * can count the number of blocks allocated from the rt volume. + * Inodes do not track that separately. + */ + ilock_flags = xfs_ilock_data_map_shared(ip); + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); + if (error) + goto out_abort; + } else { + ilock_flags = XFS_ILOCK_SHARED; + xfs_ilock(ip, XFS_ILOCK_SHARED); + } + xfs_inode_count_blocks(tp, ip, &nblks, &rtblks); + + if (xchk_iscan_aborted(&xqc->iscan)) { + error = -ECANCELED; + goto out_incomplete; + } + + /* Update the shadow dquot counters. */ + mutex_lock(&xqc->lock); + if (xqc->ucounts) { + id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_USER); + error = xqcheck_update_incore_counts(xqc, xqc->ucounts, id, 1, + nblks, rtblks); + if (error) + goto out_mutex; + } + + if (xqc->gcounts) { + id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_GROUP); + error = xqcheck_update_incore_counts(xqc, xqc->gcounts, id, 1, + nblks, rtblks); + if (error) + goto out_mutex; + } + + if (xqc->pcounts) { + id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_PROJ); + error = xqcheck_update_incore_counts(xqc, xqc->pcounts, id, 1, + nblks, rtblks); + if (error) + goto out_mutex; + } + mutex_unlock(&xqc->lock); + + xchk_iscan_mark_visited(&xqc->iscan, ip); + goto out_ilock; + +out_mutex: + mutex_unlock(&xqc->lock); +out_abort: + xchk_iscan_abort(&xqc->iscan); +out_incomplete: + xchk_set_incomplete(xqc->sc); +out_ilock: + xfs_iunlock(ip, ilock_flags); + if (isreg) + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return error; +} + +/* Walk all the allocated inodes and run a quota scan on them. */ +STATIC int +xqcheck_collect_counts( + struct xqcheck *xqc) +{ + struct xfs_scrub *sc = xqc->sc; + struct xfs_inode *ip; + int error; + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Cancel the transaction to release the log grant space while we scan + * the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. + * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done without + * risk of deadlock between sb_internal and the IOLOCK (we take the + * IOLOCK to quiesce the file before scanning) because empty + * transactions do not take sb_internal. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&xqc->iscan, &ip)) == 1) { + error = xqcheck_collect_inode(xqc, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&xqc->iscan); + if (error) { + xchk_set_incomplete(sc); + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Switch out for a real transaction in preparation for building a new + * tree. + */ + xchk_trans_cancel(sc); + return xchk_setup_fs(sc); +} + +/* + * Part 2: Comparing dquot resource counters. Walk each xfs_dquot, comparing + * the resource usage counters against our shadow dquots; and then walk each + * shadow dquot (that wasn't covered in the first part), comparing it against + * the xfs_dquot. + */ + +/* + * Check the dquot data against what we observed. Caller must hold the dquot + * lock. + */ +STATIC int +xqcheck_compare_dquot( + struct xqcheck *xqc, + xfs_dqtype_t dqtype, + struct xfs_dquot *dq) +{ + struct xqcheck_dquot xcdq; + struct xfarray *counts = xqcheck_counters_for(xqc, dqtype); + int error; + + if (xchk_iscan_aborted(&xqc->iscan)) { + xchk_set_incomplete(xqc->sc); + return -ECANCELED; + } + + mutex_lock(&xqc->lock); + error = xfarray_load_sparse(counts, dq->q_id, &xcdq); + if (error) + goto out_unlock; + + if (xcdq.icount != dq->q_ino.count) + xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id); + + if (xcdq.bcount != dq->q_blk.count) + xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id); + + if (xcdq.rtbcount != dq->q_rtb.count) + xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id); + + xcdq.flags |= (XQCHECK_DQUOT_COMPARE_SCANNED | XQCHECK_DQUOT_WRITTEN); + error = xfarray_store(counts, dq->q_id, &xcdq); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. This + * should never happen outside of the collection phase. + */ + xchk_set_incomplete(xqc->sc); + error = -ECANCELED; + } + mutex_unlock(&xqc->lock); + if (error) + return error; + + if (xqc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -ECANCELED; + + return 0; + +out_unlock: + mutex_unlock(&xqc->lock); + return error; +} + +/* + * Walk all the observed dquots, and make sure there's a matching incore + * dquot and that its counts match ours. + */ +STATIC int +xqcheck_walk_observations( + struct xqcheck *xqc, + xfs_dqtype_t dqtype) +{ + struct xqcheck_dquot xcdq; + struct xfs_dquot *dq; + struct xfarray *counts = xqcheck_counters_for(xqc, dqtype); + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + int error; + + mutex_lock(&xqc->lock); + while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) { + xfs_dqid_t id = cur - 1; + + if (xcdq.flags & XQCHECK_DQUOT_COMPARE_SCANNED) + continue; + + mutex_unlock(&xqc->lock); + + error = xfs_qm_dqget(xqc->sc->mp, id, dqtype, false, &dq); + if (error == -ENOENT) { + xchk_qcheck_set_corrupt(xqc->sc, dqtype, id); + return 0; + } + if (error) + return error; + + error = xqcheck_compare_dquot(xqc, dqtype, dq); + xfs_qm_dqput(dq); + if (error) + return error; + + if (xchk_should_terminate(xqc->sc, &error)) + return error; + + mutex_lock(&xqc->lock); + } + mutex_unlock(&xqc->lock); + + return error; +} + +/* Compare the quota counters we observed against the live dquots. */ +STATIC int +xqcheck_compare_dqtype( + struct xqcheck *xqc, + xfs_dqtype_t dqtype) +{ + struct xchk_dqiter cursor = { }; + struct xfs_scrub *sc = xqc->sc; + struct xfs_dquot *dq; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* If the quota CHKD flag is cleared, we need to repair this quota. */ + if (!(xfs_quota_chkd_flag(dqtype) & sc->mp->m_qflags)) { + xchk_qcheck_set_corrupt(xqc->sc, dqtype, 0); + return 0; + } + + /* Compare what we observed against the actual dquots. */ + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xqcheck_compare_dquot(xqc, dqtype, dq); + xfs_qm_dqput(dq); + if (error) + break; + } + if (error) + return error; + + /* Walk all the observed dquots and compare to the incore ones. */ + return xqcheck_walk_observations(xqc, dqtype); +} + +/* Tear down everything associated with a quotacheck. */ +static void +xqcheck_teardown_scan( + void *priv) +{ + struct xqcheck *xqc = priv; + struct xfs_quotainfo *qi = xqc->sc->mp->m_quotainfo; + + /* Discourage any hook functions that might be running. */ + xchk_iscan_abort(&xqc->iscan); + + /* + * As noted above, the apply hook is responsible for cleaning up the + * shadow dquot accounting data when a transaction completes. The mod + * hook must be removed before the apply hook so that we don't + * mistakenly leave an active shadow account for the mod hook to get + * its hands on. No hooks should be running after these functions + * return. + */ + xfs_dqtrx_hook_del(qi, &xqc->qhook); + + if (xqc->shadow_dquot_acct.key_len) { + rhashtable_free_and_destroy(&xqc->shadow_dquot_acct, + xqcheck_dqacct_free, NULL); + xqc->shadow_dquot_acct.key_len = 0; + } + + if (xqc->pcounts) { + xfarray_destroy(xqc->pcounts); + xqc->pcounts = NULL; + } + + if (xqc->gcounts) { + xfarray_destroy(xqc->gcounts); + xqc->gcounts = NULL; + } + + if (xqc->ucounts) { + xfarray_destroy(xqc->ucounts); + xqc->ucounts = NULL; + } + + xchk_iscan_teardown(&xqc->iscan); + mutex_destroy(&xqc->lock); + xqc->sc = NULL; +} + +/* + * Scan all inodes in the entire filesystem to generate quota counter data. + * If the scan is successful, the quota data will be left alive for a repair. + * If any error occurs, we'll tear everything down. + */ +STATIC int +xqcheck_setup_scan( + struct xfs_scrub *sc, + struct xqcheck *xqc) +{ + char *descr; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + unsigned long long max_dquots = XFS_DQ_ID_MAX + 1ULL; + int error; + + ASSERT(xqc->sc == NULL); + xqc->sc = sc; + + mutex_init(&xqc->lock); + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &xqc->iscan); + + error = -ENOMEM; + if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_USER)) { + descr = xchk_xfile_descr(sc, "user dquot records"); + error = xfarray_create(descr, max_dquots, + sizeof(struct xqcheck_dquot), &xqc->ucounts); + kfree(descr); + if (error) + goto out_teardown; + } + + if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_GROUP)) { + descr = xchk_xfile_descr(sc, "group dquot records"); + error = xfarray_create(descr, max_dquots, + sizeof(struct xqcheck_dquot), &xqc->gcounts); + kfree(descr); + if (error) + goto out_teardown; + } + + if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_PROJ)) { + descr = xchk_xfile_descr(sc, "project dquot records"); + error = xfarray_create(descr, max_dquots, + sizeof(struct xqcheck_dquot), &xqc->pcounts); + kfree(descr); + if (error) + goto out_teardown; + } + + /* + * Set up hash table to map transactions to our internal shadow dqtrx + * structures. + */ + error = rhashtable_init(&xqc->shadow_dquot_acct, + &xqcheck_dqacct_hash_params); + if (error) + goto out_teardown; + + /* + * Hook into the quota code. The hook only triggers for inodes that + * were already scanned, and the scanner thread takes each inode's + * ILOCK, which means that any in-progress inode updates will finish + * before we can scan the inode. + * + * The apply hook (which removes the shadow dquot accounting struct) + * must be installed before the mod hook so that we never fail to catch + * the end of a quota update sequence and leave stale shadow data. + */ + ASSERT(sc->flags & XCHK_FSGATES_QUOTA); + xfs_dqtrx_hook_setup(&xqc->qhook, xqcheck_mod_live_ino_dqtrx, + xqcheck_apply_live_dqtrx); + + error = xfs_dqtrx_hook_add(qi, &xqc->qhook); + if (error) + goto out_teardown; + + /* Use deferred cleanup to pass the quota count data to repair. */ + sc->buf_cleanup = xqcheck_teardown_scan; + return 0; + +out_teardown: + xqcheck_teardown_scan(xqc); + return error; +} + +/* Scrub all counters for a given quota type. */ +int +xchk_quotacheck( + struct xfs_scrub *sc) +{ + struct xqcheck *xqc = sc->buf; + int error = 0; + + /* Check quota counters on the live filesystem. */ + error = xqcheck_setup_scan(sc, xqc); + if (error) + return error; + + /* Walk all inodes, picking up quota information. */ + error = xqcheck_collect_counts(xqc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Fail fast if we're not playing with a full dataset. */ + if (xchk_iscan_aborted(&xqc->iscan)) + xchk_set_incomplete(sc); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + + /* Compare quota counters. */ + if (xqc->ucounts) { + error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_USER); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + } + if (xqc->gcounts) { + error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_GROUP); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + } + if (xqc->pcounts) { + error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_PROJ); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + } + + /* Check one last time for an incomplete dataset. */ + if (xchk_iscan_aborted(&xqc->iscan)) + xchk_set_incomplete(sc); + + return 0; +} diff --git a/fs/xfs/scrub/quotacheck.h b/fs/xfs/scrub/quotacheck.h new file mode 100644 index 000000000000..4ea5f249c978 --- /dev/null +++ b/fs/xfs/scrub/quotacheck.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_QUOTACHECK_H__ +#define __XFS_SCRUB_QUOTACHECK_H__ + +/* Quota counters for live quotacheck. */ +struct xqcheck_dquot { + /* block usage count */ + int64_t bcount; + + /* inode usage count */ + int64_t icount; + + /* realtime block usage count */ + int64_t rtbcount; + + /* Record state */ + unsigned int flags; +}; + +/* + * This incore dquot record has been written at least once. We never want to + * store an xqcheck_dquot that looks uninitialized. + */ +#define XQCHECK_DQUOT_WRITTEN (1U << 0) + +/* Already checked this dquot. */ +#define XQCHECK_DQUOT_COMPARE_SCANNED (1U << 1) + +/* Already repaired this dquot. */ +#define XQCHECK_DQUOT_REPAIR_SCANNED (1U << 2) + +/* Live quotacheck control structure. */ +struct xqcheck { + struct xfs_scrub *sc; + + /* Shadow dquot counter data. */ + struct xfarray *ucounts; + struct xfarray *gcounts; + struct xfarray *pcounts; + + /* Lock protecting quotacheck count observations */ + struct mutex lock; + + struct xchk_iscan iscan; + + /* Hooks into the quota code. */ + struct xfs_dqtrx_hook qhook; + + /* Shadow quota delta tracking structure. */ + struct rhashtable shadow_dquot_acct; +}; + +/* Return the incore counter array for a given quota type. */ +static inline struct xfarray * +xqcheck_counters_for( + struct xqcheck *xqc, + xfs_dqtype_t dqtype) +{ + switch (dqtype) { + case XFS_DQTYPE_USER: + return xqc->ucounts; + case XFS_DQTYPE_GROUP: + return xqc->gcounts; + case XFS_DQTYPE_PROJ: + return xqc->pcounts; + } + + ASSERT(0); + return NULL; +} + +#endif /* __XFS_SCRUB_QUOTACHECK_H__ */ diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c new file mode 100644 index 000000000000..dd8554c755b5 --- /dev/null +++ b/fs/xfs/scrub/quotacheck_repair.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_sb.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/quota.h" +#include "scrub/quotacheck.h" +#include "scrub/trace.h" + +/* + * Live Quotacheck Repair + * ====================== + * + * Use the live quota counter information that we collected to replace the + * counter values in the incore dquots. A scrub->repair cycle should have left + * the live data and hooks active, so this is safe so long as we make sure the + * dquot is locked. + */ + +/* Commit new counters to a dquot. */ +static int +xqcheck_commit_dquot( + struct xqcheck *xqc, + xfs_dqtype_t dqtype, + struct xfs_dquot *dq) +{ + struct xqcheck_dquot xcdq; + struct xfarray *counts = xqcheck_counters_for(xqc, dqtype); + int64_t delta; + bool dirty = false; + int error = 0; + + /* Unlock the dquot just long enough to allocate a transaction. */ + xfs_dqunlock(dq); + error = xchk_trans_alloc(xqc->sc, 0); + xfs_dqlock(dq); + if (error) + return error; + + xfs_trans_dqjoin(xqc->sc->tp, dq); + + if (xchk_iscan_aborted(&xqc->iscan)) { + error = -ECANCELED; + goto out_cancel; + } + + mutex_lock(&xqc->lock); + error = xfarray_load_sparse(counts, dq->q_id, &xcdq); + if (error) + goto out_unlock; + + /* Adjust counters as needed. */ + delta = (int64_t)xcdq.icount - dq->q_ino.count; + if (delta) { + dq->q_ino.reserved += delta; + dq->q_ino.count += delta; + dirty = true; + } + + delta = (int64_t)xcdq.bcount - dq->q_blk.count; + if (delta) { + dq->q_blk.reserved += delta; + dq->q_blk.count += delta; + dirty = true; + } + + delta = (int64_t)xcdq.rtbcount - dq->q_rtb.count; + if (delta) { + dq->q_rtb.reserved += delta; + dq->q_rtb.count += delta; + dirty = true; + } + + xcdq.flags |= (XQCHECK_DQUOT_REPAIR_SCANNED | XQCHECK_DQUOT_WRITTEN); + error = xfarray_store(counts, dq->q_id, &xcdq); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the repair + * and must cancel the whole operation. This should never + * happen, but we need to catch it anyway. + */ + error = -ECANCELED; + } + mutex_unlock(&xqc->lock); + if (error || !dirty) + goto out_cancel; + + trace_xrep_quotacheck_dquot(xqc->sc->mp, dq->q_type, dq->q_id); + + /* Commit the dirty dquot to disk. */ + dq->q_flags |= XFS_DQFLAG_DIRTY; + if (dq->q_id) + xfs_qm_adjust_dqtimers(dq); + xfs_trans_log_dquot(xqc->sc->tp, dq); + + /* + * Transaction commit unlocks the dquot, so we must re-lock it so that + * the caller can put the reference (which apparently requires a locked + * dquot). + */ + error = xrep_trans_commit(xqc->sc); + xfs_dqlock(dq); + return error; + +out_unlock: + mutex_unlock(&xqc->lock); +out_cancel: + xchk_trans_cancel(xqc->sc); + + /* Re-lock the dquot so the caller can put the reference. */ + xfs_dqlock(dq); + return error; +} + +/* Commit new quota counters for a particular quota type. */ +STATIC int +xqcheck_commit_dqtype( + struct xqcheck *xqc, + unsigned int dqtype) +{ + struct xchk_dqiter cursor = { }; + struct xqcheck_dquot xcdq; + struct xfs_scrub *sc = xqc->sc; + struct xfs_mount *mp = sc->mp; + struct xfarray *counts = xqcheck_counters_for(xqc, dqtype); + struct xfs_dquot *dq; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + int error; + + /* + * Update the counters of every dquot that the quota file knows about. + */ + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xqcheck_commit_dquot(xqc, dqtype, dq); + xfs_qm_dqput(dq); + if (error) + break; + } + if (error) + return error; + + /* + * Make a second pass to deal with the dquots that we know about but + * the quota file previously did not know about. + */ + mutex_lock(&xqc->lock); + while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) { + xfs_dqid_t id = cur - 1; + + if (xcdq.flags & XQCHECK_DQUOT_REPAIR_SCANNED) + continue; + + mutex_unlock(&xqc->lock); + + /* + * Grab the dquot, allowing for dquot block allocation in a + * separate transaction. We committed the scrub transaction + * in a previous step, so we will not be creating nested + * transactions here. + */ + error = xfs_qm_dqget(mp, id, dqtype, true, &dq); + if (error) + return error; + + error = xqcheck_commit_dquot(xqc, dqtype, dq); + xfs_qm_dqput(dq); + if (error) + return error; + + mutex_lock(&xqc->lock); + } + mutex_unlock(&xqc->lock); + + return error; +} + +/* Figure out quota CHKD flags for the running quota types. */ +static inline unsigned int +xqcheck_chkd_flags( + struct xfs_mount *mp) +{ + unsigned int ret = 0; + + if (XFS_IS_UQUOTA_ON(mp)) + ret |= XFS_UQUOTA_CHKD; + if (XFS_IS_GQUOTA_ON(mp)) + ret |= XFS_GQUOTA_CHKD; + if (XFS_IS_PQUOTA_ON(mp)) + ret |= XFS_PQUOTA_CHKD; + return ret; +} + +/* Commit the new dquot counters. */ +int +xrep_quotacheck( + struct xfs_scrub *sc) +{ + struct xqcheck *xqc = sc->buf; + unsigned int qflags = xqcheck_chkd_flags(sc->mp); + int error; + + /* + * Clear the CHKD flag for the running quota types and commit the scrub + * transaction so that we can allocate new quota block mappings if we + * have to. If we crash after this point, the sb still has the CHKD + * flags cleared, so mount quotacheck will fix all of this up. + */ + xrep_update_qflags(sc, qflags, 0); + error = xrep_trans_commit(sc); + if (error) + return error; + + /* Commit the new counters to the dquots. */ + if (xqc->ucounts) { + error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_USER); + if (error) + return error; + } + if (xqc->gcounts) { + error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_GROUP); + if (error) + return error; + } + if (xqc->pcounts) { + error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_PROJ); + if (error) + return error; + } + + /* Set the CHKD flags now that we've fixed quota counts. */ + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + xrep_update_qflags(sc, 0, qflags); + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/rcbag.c b/fs/xfs/scrub/rcbag.c new file mode 100644 index 000000000000..e1e52bc20713 --- /dev/null +++ b/fs/xfs/scrub/rcbag.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_error.h" +#include "scrub/scrub.h" +#include "scrub/rcbag_btree.h" +#include "scrub/rcbag.h" +#include "scrub/trace.h" + +struct rcbag { + struct xfs_mount *mp; + struct xfbtree xfbtree; + uint64_t nr_items; +}; + +int +rcbag_init( + struct xfs_mount *mp, + struct xfs_buftarg *btp, + struct rcbag **bagp) +{ + struct rcbag *bag; + int error; + + bag = kzalloc(sizeof(struct rcbag), XCHK_GFP_FLAGS); + if (!bag) + return -ENOMEM; + + bag->nr_items = 0; + bag->mp = mp; + + error = rcbagbt_mem_init(mp, &bag->xfbtree, btp); + if (error) + goto out_bag; + + *bagp = bag; + return 0; + +out_bag: + kfree(bag); + return error; +} + +void +rcbag_free( + struct rcbag **bagp) +{ + struct rcbag *bag = *bagp; + + xfbtree_destroy(&bag->xfbtree); + kfree(bag); + *bagp = NULL; +} + +/* Track an rmap in the refcount bag. */ +int +rcbag_add( + struct rcbag *bag, + struct xfs_trans *tp, + const struct xfs_rmap_irec *rmap) +{ + struct rcbag_rec bagrec; + struct xfs_mount *mp = bag->mp; + struct xfs_btree_cur *cur; + int has; + int error; + + cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree); + error = rcbagbt_lookup_eq(cur, rmap, &has); + if (error) + goto out_cur; + + if (has) { + error = rcbagbt_get_rec(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + bagrec.rbg_refcount++; + error = rcbagbt_update(cur, &bagrec); + if (error) + goto out_cur; + } else { + bagrec.rbg_startblock = rmap->rm_startblock; + bagrec.rbg_blockcount = rmap->rm_blockcount; + bagrec.rbg_refcount = 1; + + error = rcbagbt_insert(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + } + + xfs_btree_del_cursor(cur, 0); + + error = xfbtree_trans_commit(&bag->xfbtree, tp); + if (error) + return error; + + bag->nr_items++; + return 0; + +out_cur: + xfs_btree_del_cursor(cur, error); + xfbtree_trans_cancel(&bag->xfbtree, tp); + return error; +} + +/* Return the number of records in the bag. */ +uint64_t +rcbag_count( + const struct rcbag *rcbag) +{ + return rcbag->nr_items; +} + +static inline uint32_t rcbag_rec_next_bno(const struct rcbag_rec *r) +{ + return r->rbg_startblock + r->rbg_blockcount; +} + +/* + * Find the next block where the refcount changes, given the next rmap we + * looked at and the ones we're already tracking. + */ +int +rcbag_next_edge( + struct rcbag *bag, + struct xfs_trans *tp, + const struct xfs_rmap_irec *next_rmap, + bool next_valid, + uint32_t *next_bnop) +{ + struct rcbag_rec bagrec; + struct xfs_mount *mp = bag->mp; + struct xfs_btree_cur *cur; + uint32_t next_bno = NULLAGBLOCK; + int has; + int error; + + if (next_valid) + next_bno = next_rmap->rm_startblock; + + cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree); + error = xfs_btree_goto_left_edge(cur); + if (error) + goto out_cur; + + while (true) { + error = xfs_btree_increment(cur, 0, &has); + if (error) + goto out_cur; + if (!has) + break; + + error = rcbagbt_get_rec(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + next_bno = min(next_bno, rcbag_rec_next_bno(&bagrec)); + } + + /* + * We should have found /something/ because either next_rrm is the next + * interesting rmap to look at after emitting this refcount extent, or + * there are other rmaps in rmap_bag contributing to the current + * sharing count. But if something is seriously wrong, bail out. + */ + if (next_bno == NULLAGBLOCK) { + error = -EFSCORRUPTED; + goto out_cur; + } + + xfs_btree_del_cursor(cur, 0); + + *next_bnop = next_bno; + return 0; + +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* Pop all refcount bag records that end at next_bno */ +int +rcbag_remove_ending_at( + struct rcbag *bag, + struct xfs_trans *tp, + uint32_t next_bno) +{ + struct rcbag_rec bagrec; + struct xfs_mount *mp = bag->mp; + struct xfs_btree_cur *cur; + int has; + int error; + + /* go to the right edge of the tree */ + cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree); + memset(&cur->bc_rec, 0xFF, sizeof(cur->bc_rec)); + error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &has); + if (error) + goto out_cur; + + while (true) { + error = xfs_btree_decrement(cur, 0, &has); + if (error) + goto out_cur; + if (!has) + break; + + error = rcbagbt_get_rec(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + if (rcbag_rec_next_bno(&bagrec) != next_bno) + continue; + + error = xfs_btree_delete(cur, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + bag->nr_items -= bagrec.rbg_refcount; + } + + xfs_btree_del_cursor(cur, 0); + return xfbtree_trans_commit(&bag->xfbtree, tp); +out_cur: + xfs_btree_del_cursor(cur, error); + xfbtree_trans_cancel(&bag->xfbtree, tp); + return error; +} + +/* Dump the rcbag. */ +void +rcbag_dump( + struct rcbag *bag, + struct xfs_trans *tp) +{ + struct rcbag_rec bagrec; + struct xfs_mount *mp = bag->mp; + struct xfs_btree_cur *cur; + unsigned long long nr = 0; + int has; + int error; + + cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree); + error = xfs_btree_goto_left_edge(cur); + if (error) + goto out_cur; + + while (true) { + error = xfs_btree_increment(cur, 0, &has); + if (error) + goto out_cur; + if (!has) + break; + + error = rcbagbt_get_rec(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + xfs_err(bag->mp, "[%llu]: bno 0x%x fsbcount 0x%x refcount 0x%llx\n", + nr++, + (unsigned int)bagrec.rbg_startblock, + (unsigned int)bagrec.rbg_blockcount, + (unsigned long long)bagrec.rbg_refcount); + } + +out_cur: + xfs_btree_del_cursor(cur, error); +} diff --git a/fs/xfs/scrub/rcbag.h b/fs/xfs/scrub/rcbag.h new file mode 100644 index 000000000000..e29ef788ba72 --- /dev/null +++ b/fs/xfs/scrub/rcbag.h @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RCBAG_H__ +#define __XFS_SCRUB_RCBAG_H__ + +struct xfs_mount; +struct rcbag; +struct xfs_buftarg; + +int rcbag_init(struct xfs_mount *mp, struct xfs_buftarg *btp, + struct rcbag **bagp); +void rcbag_free(struct rcbag **bagp); +int rcbag_add(struct rcbag *bag, struct xfs_trans *tp, + const struct xfs_rmap_irec *rmap); +uint64_t rcbag_count(const struct rcbag *bag); + +int rcbag_next_edge(struct rcbag *bag, struct xfs_trans *tp, + const struct xfs_rmap_irec *next_rmap, bool next_valid, + uint32_t *next_bnop); +int rcbag_remove_ending_at(struct rcbag *bag, struct xfs_trans *tp, + uint32_t next_bno); + +void rcbag_dump(struct rcbag *bag, struct xfs_trans *tp); + +#endif /* __XFS_SCRUB_RCBAG_H__ */ diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c new file mode 100644 index 000000000000..709356dc6256 --- /dev/null +++ b/fs/xfs/scrub/rcbag_btree.c @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_error.h" +#include "scrub/rcbag_btree.h" +#include "scrub/trace.h" + +static struct kmem_cache *rcbagbt_cur_cache; + +STATIC void +rcbagbt_init_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + struct rcbag_key *bag_key = (struct rcbag_key *)key; + const struct rcbag_rec *bag_rec = (const struct rcbag_rec *)rec; + + BUILD_BUG_ON(sizeof(struct rcbag_key) > sizeof(union xfs_btree_key)); + BUILD_BUG_ON(sizeof(struct rcbag_rec) > sizeof(union xfs_btree_rec)); + + bag_key->rbg_startblock = bag_rec->rbg_startblock; + bag_key->rbg_blockcount = bag_rec->rbg_blockcount; +} + +STATIC void +rcbagbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + struct rcbag_rec *bag_rec = (struct rcbag_rec *)rec; + struct rcbag_rec *bag_irec = (struct rcbag_rec *)&cur->bc_rec; + + bag_rec->rbg_startblock = bag_irec->rbg_startblock; + bag_rec->rbg_blockcount = bag_irec->rbg_blockcount; + bag_rec->rbg_refcount = bag_irec->rbg_refcount; +} + +STATIC int64_t +rcbagbt_key_diff( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) +{ + struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec; + const struct rcbag_key *kp = (const struct rcbag_key *)key; + + if (kp->rbg_startblock > rec->rbg_startblock) + return 1; + if (kp->rbg_startblock < rec->rbg_startblock) + return -1; + + if (kp->rbg_blockcount > rec->rbg_blockcount) + return 1; + if (kp->rbg_blockcount < rec->rbg_blockcount) + return -1; + + return 0; +} + +STATIC int64_t +rcbagbt_diff_two_keys( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) +{ + const struct rcbag_key *kp1 = (const struct rcbag_key *)k1; + const struct rcbag_key *kp2 = (const struct rcbag_key *)k2; + + ASSERT(mask == NULL); + + if (kp1->rbg_startblock > kp2->rbg_startblock) + return 1; + if (kp1->rbg_startblock < kp2->rbg_startblock) + return -1; + + if (kp1->rbg_blockcount > kp2->rbg_blockcount) + return 1; + if (kp1->rbg_blockcount < kp2->rbg_blockcount) + return -1; + + return 0; +} + +STATIC int +rcbagbt_keys_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) +{ + const struct rcbag_key *kp1 = (const struct rcbag_key *)k1; + const struct rcbag_key *kp2 = (const struct rcbag_key *)k2; + + if (kp1->rbg_startblock > kp2->rbg_startblock) + return 0; + if (kp1->rbg_startblock < kp2->rbg_startblock) + return 1; + + if (kp1->rbg_blockcount > kp2->rbg_blockcount) + return 0; + if (kp1->rbg_blockcount < kp2->rbg_blockcount) + return 1; + + return 0; +} + +STATIC int +rcbagbt_recs_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) +{ + const struct rcbag_rec *rp1 = (const struct rcbag_rec *)r1; + const struct rcbag_rec *rp2 = (const struct rcbag_rec *)r2; + + if (rp1->rbg_startblock > rp2->rbg_startblock) + return 0; + if (rp1->rbg_startblock < rp2->rbg_startblock) + return 1; + + if (rp1->rbg_blockcount > rp2->rbg_blockcount) + return 0; + if (rp1->rbg_blockcount < rp2->rbg_blockcount) + return 1; + + return 0; +} + +static xfs_failaddr_t +rcbagbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + unsigned int level; + unsigned int maxrecs; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + + level = be16_to_cpu(block->bb_level); + if (level >= rcbagbt_maxlevels_possible()) + return __this_address; + + maxrecs = rcbagbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0); + return xfs_btree_memblock_verify(bp, maxrecs); +} + +static void +rcbagbt_rw_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa = rcbagbt_verify(bp); + + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +/* skip crc checks on in-memory btrees to save time */ +static const struct xfs_buf_ops rcbagbt_mem_buf_ops = { + .name = "rcbagbt_mem", + .magic = { 0, cpu_to_be32(RCBAG_MAGIC) }, + .verify_read = rcbagbt_rw_verify, + .verify_write = rcbagbt_rw_verify, + .verify_struct = rcbagbt_verify, +}; + +static const struct xfs_btree_ops rcbagbt_mem_ops = { + .name = "rcbag", + .type = XFS_BTREE_TYPE_MEM, + + .rec_len = sizeof(struct rcbag_rec), + .key_len = sizeof(struct rcbag_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = 1, + .statoff = XFS_STATS_CALC_INDEX(xs_rcbag_2), + + .dup_cursor = xfbtree_dup_cursor, + .set_root = xfbtree_set_root, + .alloc_block = xfbtree_alloc_block, + .free_block = xfbtree_free_block, + .get_minrecs = xfbtree_get_minrecs, + .get_maxrecs = xfbtree_get_maxrecs, + .init_key_from_rec = rcbagbt_init_key_from_rec, + .init_rec_from_cur = rcbagbt_init_rec_from_cur, + .init_ptr_from_cur = xfbtree_init_ptr_from_cur, + .key_diff = rcbagbt_key_diff, + .buf_ops = &rcbagbt_mem_buf_ops, + .diff_two_keys = rcbagbt_diff_two_keys, + .keys_inorder = rcbagbt_keys_inorder, + .recs_inorder = rcbagbt_recs_inorder, +}; + +/* Create a cursor for an in-memory btree. */ +struct xfs_btree_cur * +rcbagbt_mem_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfbtree *xfbtree) +{ + struct xfs_btree_cur *cur; + + cur = xfs_btree_alloc_cursor(mp, tp, &rcbagbt_mem_ops, + rcbagbt_maxlevels_possible(), rcbagbt_cur_cache); + + cur->bc_mem.xfbtree = xfbtree; + cur->bc_nlevels = xfbtree->nlevels; + return cur; +} + +/* Create an in-memory refcount bag btree. */ +int +rcbagbt_mem_init( + struct xfs_mount *mp, + struct xfbtree *xfbt, + struct xfs_buftarg *btp) +{ + xfbt->owner = 0; + return xfbtree_init(mp, xfbt, btp, &rcbagbt_mem_ops); +} + +/* Calculate number of records in a refcount bag btree block. */ +static inline unsigned int +rcbagbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct rcbag_rec); + return blocklen / + (sizeof(struct rcbag_key) + sizeof(rcbag_ptr_t)); +} + +/* + * Calculate number of records in an refcount bag btree block. + */ +unsigned int +rcbagbt_maxrecs( + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) +{ + blocklen -= RCBAG_BLOCK_LEN; + return rcbagbt_block_maxrecs(blocklen, leaf); +} + +/* Compute the max possible height for refcount bag btrees. */ +unsigned int +rcbagbt_maxlevels_possible(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_space_to_height(minrecs, ULLONG_MAX); +} + +/* Calculate the refcount bag btree size for some records. */ +unsigned long long +rcbagbt_calc_size( + unsigned long long nr_records) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_calc_size(minrecs, nr_records); +} + +int __init +rcbagbt_init_cur_cache(void) +{ + rcbagbt_cur_cache = kmem_cache_create("xfs_rcbagbt_cur", + xfs_btree_cur_sizeof(rcbagbt_maxlevels_possible()), + 0, 0, NULL); + + if (!rcbagbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +rcbagbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(rcbagbt_cur_cache); + rcbagbt_cur_cache = NULL; +} + +/* Look up the refcount bag record corresponding to this reverse mapping. */ +int +rcbagbt_lookup_eq( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rmap, + int *success) +{ + struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec; + + rec->rbg_startblock = rmap->rm_startblock; + rec->rbg_blockcount = rmap->rm_blockcount; + + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, success); +} + +/* Get the data from the pointed-to record. */ +int +rcbagbt_get_rec( + struct xfs_btree_cur *cur, + struct rcbag_rec *rec, + int *has) +{ + union xfs_btree_rec *btrec; + int error; + + error = xfs_btree_get_rec(cur, &btrec, has); + if (error || !(*has)) + return error; + + memcpy(rec, btrec, sizeof(struct rcbag_rec)); + return 0; +} + +/* Update the record referred to by cur to the value given. */ +int +rcbagbt_update( + struct xfs_btree_cur *cur, + const struct rcbag_rec *rec) +{ + union xfs_btree_rec btrec; + + memcpy(&btrec, rec, sizeof(struct rcbag_rec)); + return xfs_btree_update(cur, &btrec); +} + +/* Update the record referred to by cur to the value given. */ +int +rcbagbt_insert( + struct xfs_btree_cur *cur, + const struct rcbag_rec *rec, + int *success) +{ + struct rcbag_rec *btrec = (struct rcbag_rec *)&cur->bc_rec; + + memcpy(btrec, rec, sizeof(struct rcbag_rec)); + return xfs_btree_insert(cur, success); +} diff --git a/fs/xfs/scrub/rcbag_btree.h b/fs/xfs/scrub/rcbag_btree.h new file mode 100644 index 000000000000..03cadb032552 --- /dev/null +++ b/fs/xfs/scrub/rcbag_btree.h @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RCBAG_BTREE_H__ +#define __XFS_SCRUB_RCBAG_BTREE_H__ + +#ifdef CONFIG_XFS_BTREE_IN_MEM + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +#define RCBAG_MAGIC 0x74826671 /* 'JRBG' */ + +struct rcbag_key { + uint32_t rbg_startblock; + uint32_t rbg_blockcount; +}; + +struct rcbag_rec { + uint32_t rbg_startblock; + uint32_t rbg_blockcount; + uint64_t rbg_refcount; +}; + +typedef __be64 rcbag_ptr_t; + +/* reflinks only exist on crc enabled filesystems */ +#define RCBAG_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define RCBAG_REC_ADDR(block, index) \ + ((struct rcbag_rec *) \ + ((char *)(block) + RCBAG_BLOCK_LEN + \ + (((index) - 1) * sizeof(struct rcbag_rec)))) + +#define RCBAG_KEY_ADDR(block, index) \ + ((struct rcbag_key *) \ + ((char *)(block) + RCBAG_BLOCK_LEN + \ + ((index) - 1) * sizeof(struct rcbag_key))) + +#define RCBAG_PTR_ADDR(block, index, maxrecs) \ + ((rcbag_ptr_t *) \ + ((char *)(block) + RCBAG_BLOCK_LEN + \ + (maxrecs) * sizeof(struct rcbag_key) + \ + ((index) - 1) * sizeof(rcbag_ptr_t))) + +unsigned int rcbagbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); + +unsigned long long rcbagbt_calc_size(unsigned long long nr_records); + +unsigned int rcbagbt_maxlevels_possible(void); + +int __init rcbagbt_init_cur_cache(void); +void rcbagbt_destroy_cur_cache(void); + +struct xfs_btree_cur *rcbagbt_mem_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfbtree *xfbtree); +int rcbagbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, + struct xfs_buftarg *btp); + +int rcbagbt_lookup_eq(struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rmap, int *success); +int rcbagbt_get_rec(struct xfs_btree_cur *cur, struct rcbag_rec *rec, int *has); +int rcbagbt_update(struct xfs_btree_cur *cur, const struct rcbag_rec *rec); +int rcbagbt_insert(struct xfs_btree_cur *cur, const struct rcbag_rec *rec, + int *success); + +#else +# define rcbagbt_init_cur_cache() 0 +# define rcbagbt_destroy_cur_cache() ((void)0) +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + +#endif /* __XFS_SCRUB_RCBAG_BTREE_H__ */ diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c index e51c1544be63..01c9a2dc0f2c 100644 --- a/fs/xfs/scrub/readdir.c +++ b/fs/xfs/scrub/readdir.c @@ -18,6 +18,7 @@ #include "xfs_trans.h" #include "xfs_error.h" #include "scrub/scrub.h" +#include "scrub/common.h" #include "scrub/readdir.h" /* Call a function for every entry in a shortform directory. */ @@ -36,16 +37,14 @@ xchk_dir_walk_sf( struct xfs_mount *mp = dp->i_mount; struct xfs_da_geometry *geo = mp->m_dir_geo; struct xfs_dir2_sf_entry *sfep; - struct xfs_dir2_sf_hdr *sfp; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; xfs_ino_t ino; xfs_dir2_dataptr_t dapos; unsigned int i; int error; ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); /* dot entry */ dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, @@ -101,7 +100,7 @@ xchk_dir_walk_block( unsigned int off, next_off, end; int error; - error = xfs_dir3_block_read(sc->tp, dp, &bp); + error = xfs_dir3_block_read(sc->tp, dp, dp->i_ino, &bp); if (error) return error; @@ -177,7 +176,7 @@ xchk_read_leaf_dir_buf( if (new_off > *curoff) *curoff = new_off; - return xfs_dir3_data_read(tp, dp, map.br_startoff, 0, bpp); + return xfs_dir3_data_read(tp, dp, dp->i_ino, map.br_startoff, 0, bpp); } /* Call a function for every entry in a leaf directory. */ @@ -275,32 +274,27 @@ xchk_dir_walk( .dp = dp, .geo = dp->i_mount->m_dir_geo, .trans = sc->tp, + .owner = dp->i_ino, }; - bool isblock; int error; if (xfs_is_shutdown(dp->i_mount)) return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); - ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) + switch (xfs_dir2_format(&args, &error)) { + case XFS_DIR2_FMT_SF: return xchk_dir_walk_sf(sc, dp, dirent_fn, priv); - - /* dir2 functions require that the data fork is loaded */ - error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); - if (error) - return error; - - error = xfs_dir2_isblock(&args, &isblock); - if (error) - return error; - - if (isblock) + case XFS_DIR2_FMT_BLOCK: return xchk_dir_walk_block(sc, dp, dirent_fn, priv); - - return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv); + case XFS_DIR2_FMT_LEAF: + case XFS_DIR2_FMT_NODE: + return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv); + default: + return error; + } } /* @@ -326,50 +320,102 @@ xchk_dir_lookup( .hashval = xfs_dir2_hashname(dp->i_mount, name), .whichfork = XFS_DATA_FORK, .op_flags = XFS_DA_OP_OKNOENT, + .owner = dp->i_ino, }; - bool isblock, isleaf; int error; if (xfs_is_shutdown(dp->i_mount)) return -EIO; + /* + * A temporary directory's block headers are written with the owner + * set to sc->ip, so we must switch the owner here for the lookup. + */ + if (dp == sc->tempip) + args.owner = sc->ip->i_ino; + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); - ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - error = xfs_dir2_sf_lookup(&args); - goto out_check_rval; - } + error = xfs_dir_lookup_args(&args); + if (!error) + *ino = args.inumber; + return error; +} - /* dir2 functions require that the data fork is loaded */ - error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); - if (error) - return error; +/* + * Try to grab the IOLOCK and ILOCK of sc->ip and ip, returning @ip's lock + * state. The caller may have a transaction, so we must use trylock for both + * IOLOCKs. + */ +static inline unsigned int +xchk_dir_trylock_both( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (!xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + return 0; - error = xfs_dir2_isblock(&args, &isblock); - if (error) - return error; + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + goto parent_iolock; - if (isblock) { - error = xfs_dir2_block_lookup(&args); - goto out_check_rval; - } + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + goto parent_ilock; - error = xfs_dir2_isleaf(&args, &isleaf); - if (error) - return error; + return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL; - if (isleaf) { - error = xfs_dir2_leaf_lookup(&args); - goto out_check_rval; +parent_ilock: + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); +parent_iolock: + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + return 0; +} + +/* + * Try for a limited time to grab the IOLOCK and ILOCK of both the scrub target + * (@sc->ip) and the inode at the other end (@ip) of a directory or parent + * pointer link so that we can check that link. + * + * We do not know ahead of time that the directory tree is /not/ corrupt, so we + * cannot use the "lock two inode" functions because we do not know that there + * is not a racing thread trying to take the locks in opposite order. First + * take IOLOCK_EXCL of the scrub target, and then try to take IOLOCK_SHARED + * of @ip to synchronize with the VFS. Next, take ILOCK_EXCL of the scrub + * target and @ip to synchronize with XFS. + * + * If the trylocks succeed, *lockmode will be set to the locks held for @ip; + * @sc->ilock_flags will be set for the locks held for @sc->ip; and zero will + * be returned. If not, returns -EDEADLOCK to try again; or -ETIMEDOUT if + * XCHK_TRY_HARDER was set. Returns -EINTR if the process has been killed. + */ +int +xchk_dir_trylock_for_pptrs( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int *lockmode) +{ + unsigned int nr; + int error = 0; + + ASSERT(sc->ilock_flags == 0); + + for (nr = 0; nr < HZ; nr++) { + *lockmode = xchk_dir_trylock_both(sc, ip); + if (*lockmode) + return 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + delay(1); } - error = xfs_dir2_node_lookup(&args); + if (sc->flags & XCHK_TRY_HARDER) { + xchk_set_incomplete(sc); + return -ETIMEDOUT; + } -out_check_rval: - if (error == -EEXIST) - error = 0; - if (!error) - *ino = args.inumber; - return error; + return -EDEADLOCK; } diff --git a/fs/xfs/scrub/readdir.h b/fs/xfs/scrub/readdir.h index 55787f4df123..da501877a64d 100644 --- a/fs/xfs/scrub/readdir.h +++ b/fs/xfs/scrub/readdir.h @@ -16,4 +16,7 @@ int xchk_dir_walk(struct xfs_scrub *sc, struct xfs_inode *dp, int xchk_dir_lookup(struct xfs_scrub *sc, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t *ino); +int xchk_dir_trylock_for_pptrs(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int *lockmode); + #endif /* __XFS_SCRUB_READDIR_H__ */ diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 822f5adf7f7c..53697f3c5e1b 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -20,6 +20,7 @@ #include "xfs_ialloc_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_extent_busy.h" #include "xfs_ag.h" @@ -31,11 +32,14 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_remote.h" +#include "xfs_defer.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/fsb_bitmap.h" #include "scrub/reap.h" /* @@ -73,10 +77,10 @@ * with only the same rmap owner but the block is not owned by something with * the same rmap owner, the block will be freed. * - * The caller is responsible for locking the AG headers for the entire rebuild - * operation so that nothing else can sneak in and change the AG state while - * we're not looking. We must also invalidate any buffers associated with - * @bitmap. + * The caller is responsible for locking the AG headers/inode for the entire + * rebuild operation so that nothing else can sneak in and change the incore + * state while we're not looking. We must also invalidate any buffers + * associated with @bitmap. */ /* Information about reaping extents after a repair. */ @@ -110,7 +114,7 @@ xreap_put_freelist( int error; /* Make sure there's space on the freelist. */ - error = xrep_fix_freelist(sc, true); + error = xrep_fix_freelist(sc, 0); if (error) return error; @@ -207,6 +211,48 @@ static inline void xreap_defer_finish_reset(struct xreap_state *rs) rs->force_roll = false; } +/* + * Compute the maximum length of a buffer cache scan (in units of sectors), + * given a quantity of fs blocks. + */ +xfs_daddr_t +xrep_bufscan_max_sectors( + struct xfs_mount *mp, + xfs_extlen_t fsblocks) +{ + int max_fsbs; + + /* Remote xattr values are the largest buffers that we support. */ + max_fsbs = xfs_attr3_max_rmt_blocks(mp); + + return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs)); +} + +/* + * Return an incore buffer from a sector scan, or NULL if there are no buffers + * left to return. + */ +struct xfs_buf * +xrep_bufscan_advance( + struct xfs_mount *mp, + struct xrep_bufscan *scan) +{ + scan->__sector_count += scan->daddr_step; + while (scan->__sector_count <= scan->max_sectors) { + struct xfs_buf *bp = NULL; + int error; + + error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr, + scan->__sector_count, XBF_LIVESCAN, &bp); + if (!error) + return bp; + + scan->__sector_count += scan->daddr_step; + } + + return NULL; +} + /* Try to invalidate the incore buffers for an extent that we're freeing. */ STATIC void xreap_agextent_binval( @@ -237,28 +283,15 @@ xreap_agextent_binval( * of any plausible size. */ while (bno < agbno_next) { - xfs_agblock_t fsbcount; - xfs_agblock_t max_fsbs; - - /* - * Max buffer size is the max remote xattr buffer size, which - * is one fs block larger than 64k. - */ - max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, - xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); - - for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) { - struct xfs_buf *bp = NULL; - xfs_daddr_t daddr; - int error; - - daddr = XFS_AGB_TO_DADDR(mp, agno, bno); - error = xfs_buf_incore(mp->m_ddev_targp, daddr, - XFS_FSB_TO_BB(mp, fsbcount), - XBF_LIVESCAN, &bp); - if (error) - continue; + struct xrep_bufscan scan = { + .daddr = XFS_AGB_TO_DADDR(mp, agno, bno), + .max_sectors = xrep_bufscan_max_sectors(mp, + agbno_next - bno), + .daddr_step = XFS_FSB_TO_BB(mp, 1), + }; + struct xfs_buf *bp; + while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { xfs_trans_bjoin(sc->tp, bp); xfs_trans_binval(sc->tp, bp); rs->invalidated++; @@ -377,6 +410,17 @@ xreap_agextent_iter( trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); rs->force_roll = true; + + if (rs->oinfo == &XFS_RMAP_OINFO_COW) { + /* + * If we're unmapping CoW staging extents, remove the + * records from the refcountbt, which will remove the + * rmap record as well. + */ + xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + return 0; + } + return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, *aglenp, rs->oinfo); } @@ -395,6 +439,26 @@ xreap_agextent_iter( return 0; } + /* + * If we're getting rid of CoW staging extents, use deferred work items + * to remove the refcountbt records (which removes the rmap records) + * and free the extent. We're not worried about the system going down + * here because log recovery walks the refcount btree to clean out the + * CoW staging extents. + */ + if (rs->oinfo == &XFS_RMAP_OINFO_COW) { + ASSERT(rs->resv == XFS_AG_RESV_NONE); + + xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL, + rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD); + if (error) + return error; + + rs->force_roll = true; + return 0; + } + /* Put blocks back on the AGFL one at a time. */ if (rs->resv == XFS_AG_RESV_AGFL) { ASSERT(*aglenp == 1); @@ -409,13 +473,17 @@ xreap_agextent_iter( /* * Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. + * Add a defer ops barrier every other extent to avoid stressing the + * system with large EFIs. */ - error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, - rs->resv, true); + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, + rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD); if (error) return error; rs->deferred++; + if (rs->deferred % 2 == 0) + xfs_defer_add_barrier(sc->tp); return 0; } @@ -425,13 +493,12 @@ xreap_agextent_iter( */ STATIC int xreap_agmeta_extent( - uint64_t fsbno, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xreap_state *rs = priv; struct xfs_scrub *sc = rs->sc; - xfs_agblock_t agbno = fsbno; xfs_agblock_t agbno_next = agbno + len; int error = 0; @@ -496,3 +563,488 @@ xrep_reap_agblocks( return 0; } + +/* + * Break a file metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. The extent must + * not cross an AG boundary. + */ +STATIC int +xreap_fsmeta_extent( + uint64_t fsbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + xfs_agblock_t agbno_next = agbno + len; + int error = 0; + + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); + ASSERT(sc->ip != NULL); + ASSERT(!sc->sa.pag); + + /* + * We're reaping blocks after repairing file metadata, which means that + * we have to init the xchk_ag structure ourselves. + */ + sc->sa.pag = xfs_perag_get(sc->mp, agno); + if (!sc->sa.pag) + return -EFSCORRUPTED; + + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp); + if (error) + goto out_pag; + + while (agbno < agbno_next) { + xfs_extlen_t aglen; + bool crosslinked; + + error = xreap_agextent_select(rs, agbno, agbno_next, + &crosslinked, &aglen); + if (error) + goto out_agf; + + error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked); + if (error) + goto out_agf; + + if (xreap_want_defer_finish(rs)) { + /* + * Holds the AGF buffer across the deferred chain + * processing. + */ + error = xrep_defer_finish(sc); + if (error) + goto out_agf; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + /* + * Hold the AGF buffer across the transaction roll so + * that we don't have to reattach it to the scrub + * context. + */ + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + error = xfs_trans_roll_inode(&sc->tp, sc->ip); + xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); + if (error) + goto out_agf; + xreap_reset(rs); + } + + agbno += aglen; + } + +out_agf: + xfs_trans_brelse(sc->tp, sc->sa.agf_bp); + sc->sa.agf_bp = NULL; +out_pag: + xfs_perag_put(sc->sa.pag); + sc->sa.pag = NULL; + return error; +} + +/* + * Dispose of every block of every fs metadata extent in the bitmap. + * Do not use this to dispose of the mappings in an ondisk inode fork. + */ +int +xrep_reap_fsblocks( + struct xfs_scrub *sc, + struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + + error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} + +/* + * Metadata files are not supposed to share blocks with anything else. + * If blocks are shared, we remove the reverse mapping (thus reducing the + * crosslink factor); if blocks are not shared, we also need to free them. + * + * This first step determines the longest subset of the passed-in imap + * (starting at its beginning) that is either crosslinked or not crosslinked. + * The blockcount will be adjust down as needed. + */ +STATIC int +xreap_bmapi_select( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap, + bool *crosslinked) +{ + struct xfs_owner_info oinfo; + struct xfs_btree_cur *cur; + xfs_filblks_t len = 1; + xfs_agblock_t bno; + xfs_agblock_t agbno; + xfs_agblock_t agbno_next; + int error; + + agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock); + agbno_next = agbno + imap->br_blockcount; + + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + + xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff); + error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked); + if (error) + goto out_cur; + + bno = agbno + 1; + while (bno < agbno_next) { + bool also_crosslinked; + + oinfo.oi_offset++; + error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo, + &also_crosslinked); + if (error) + goto out_cur; + + if (also_crosslinked != *crosslinked) + break; + + len++; + bno++; + } + + imap->br_blockcount = len; + trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Decide if this buffer can be joined to a transaction. This is true for most + * buffers, but there are two cases that we want to catch: large remote xattr + * value buffers are not logged and can overflow the buffer log item dirty + * bitmap size; and oversized cached buffers if things have really gone + * haywire. + */ +static inline bool +xreap_buf_loggable( + const struct xfs_buf *bp) +{ + int i; + + for (i = 0; i < bp->b_map_count; i++) { + int chunks; + int map_size; + + chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), + XFS_BLF_CHUNK); + map_size = DIV_ROUND_UP(chunks, NBWORD); + if (map_size > XFS_BLF_DATAMAP_SIZE) + return false; + } + + return true; +} + +/* + * Invalidate any buffers for this file mapping. The @imap blockcount may be + * adjusted downward if we need to roll the transaction. + */ +STATIC int +xreap_bmapi_binval( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag = sc->sa.pag; + int bmap_flags = xfs_bmapi_aflag(whichfork); + xfs_fileoff_t off; + xfs_fileoff_t max_off; + xfs_extlen_t scan_blocks; + xfs_agnumber_t agno = sc->sa.pag->pag_agno; + xfs_agblock_t bno; + xfs_agblock_t agbno; + xfs_agblock_t agbno_next; + unsigned int invalidated = 0; + int error; + + /* + * Avoid invalidating AG headers and post-EOFS blocks because we never + * own those. + */ + agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock); + agbno_next = agbno + imap->br_blockcount; + if (!xfs_verify_agbno(pag, agbno) || + !xfs_verify_agbno(pag, agbno_next - 1)) + return 0; + + /* + * Buffers for file blocks can span multiple contiguous mappings. This + * means that for each block in the mapping, there could exist an + * xfs_buf indexed by that block with any length up to the maximum + * buffer size (remote xattr values) or to the next hole in the fork. + * To set up our binval scan, first we need to figure out the location + * of the next hole. + */ + off = imap->br_startoff + imap->br_blockcount; + max_off = off + xfs_attr3_max_rmt_blocks(mp); + while (off < max_off) { + struct xfs_bmbt_irec hmap; + int nhmaps = 1; + + error = xfs_bmapi_read(ip, off, max_off - off, &hmap, + &nhmaps, bmap_flags); + if (error) + return error; + if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) { + ASSERT(0); + return -EFSCORRUPTED; + } + + if (!xfs_bmap_is_real_extent(&hmap)) + break; + + off = hmap.br_startoff + hmap.br_blockcount; + } + scan_blocks = off - imap->br_startoff; + + trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks); + + /* + * If there are incore buffers for these blocks, invalidate them. If + * we can't (try)lock the buffer we assume it's owned by someone else + * and leave it alone. The buffer cache cannot detect aliasing, so + * employ nested loops to detect incore buffers of any plausible size. + */ + while (bno < agbno_next) { + struct xrep_bufscan scan = { + .daddr = XFS_AGB_TO_DADDR(mp, agno, bno), + .max_sectors = xrep_bufscan_max_sectors(mp, + scan_blocks), + .daddr_step = XFS_FSB_TO_BB(mp, 1), + }; + struct xfs_buf *bp; + + while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { + if (xreap_buf_loggable(bp)) { + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + } else { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + invalidated++; + + /* + * Stop invalidating if we've hit the limit; we should + * still have enough reservation left to free however + * much of the mapping we've seen so far. + */ + if (invalidated > XREAP_MAX_BINVAL) { + imap->br_blockcount = agbno_next - bno; + goto out; + } + } + + bno++; + scan_blocks--; + } + +out: + trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount); + return 0; +} + +/* + * Dispose of as much of the beginning of this file fork mapping as possible. + * The number of blocks disposed of is returned in @imap->br_blockcount. + */ +STATIC int +xrep_reap_bmapi_iter( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap, + bool crosslinked) +{ + int error; + + if (crosslinked) { + /* + * If there are other rmappings, this block is cross linked and + * must not be freed. Remove the reverse mapping, leave the + * buffer cache in its possibly confused state, and move on. + * We don't want to risk discarding valid data buffers from + * anybody else who thinks they own the block, even though that + * runs the risk of stale buffer warnings in the future. + */ + trace_xreap_dispose_unmap_extent(sc->sa.pag, + XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), + imap->br_blockcount); + + /* + * Schedule removal of the mapping from the fork. We use + * deferred log intents in this function to control the exact + * sequence of metadata updates. + */ + xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + -(int64_t)imap->br_blockcount); + xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap); + return 0; + } + + /* + * If the block is not crosslinked, we can invalidate all the incore + * buffers for the extent, and then free the extent. This is a bit of + * a mess since we don't detect discontiguous buffers that are indexed + * by a block starting before the first block of the extent but overlap + * anyway. + */ + trace_xreap_dispose_free_extent(sc->sa.pag, + XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), + imap->br_blockcount); + + /* + * Invalidate as many buffers as we can, starting at the beginning of + * this mapping. If this function sets blockcount to zero, the + * transaction is full of logged buffer invalidations, so we need to + * return early so that we can roll and retry. + */ + error = xreap_bmapi_binval(sc, ip, whichfork, imap); + if (error || imap->br_blockcount == 0) + return error; + + /* + * Schedule removal of the mapping from the fork. We use deferred log + * intents in this function to control the exact sequence of metadata + * updates. + */ + xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + -(int64_t)imap->br_blockcount); + return xfs_free_extent_later(sc->tp, imap->br_startblock, + imap->br_blockcount, NULL, XFS_AG_RESV_NONE, + XFS_FREE_EXTENT_SKIP_DISCARD); +} + +/* + * Dispose of as much of this file extent as we can. Upon successful return, + * the imap will reflect the mapping that was removed from the fork. + */ +STATIC int +xreap_ifork_extent( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap) +{ + xfs_agnumber_t agno; + bool crosslinked; + int error; + + ASSERT(sc->sa.pag == NULL); + + trace_xreap_ifork_extent(sc, ip, whichfork, imap); + + agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock); + sc->sa.pag = xfs_perag_get(sc->mp, agno); + if (!sc->sa.pag) + return -EFSCORRUPTED; + + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp); + if (error) + goto out_pag; + + /* + * Decide the fate of the blocks at the beginning of the mapping, then + * update the mapping to use it with the unmap calls. + */ + error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked); + if (error) + goto out_agf; + + error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked); + if (error) + goto out_agf; + +out_agf: + xfs_trans_brelse(sc->tp, sc->sa.agf_bp); + sc->sa.agf_bp = NULL; +out_pag: + xfs_perag_put(sc->sa.pag); + sc->sa.pag = NULL; + return error; +} + +/* + * Dispose of each block mapped to the given fork of the given file. Callers + * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip. The fork + * must not have any delalloc reservations. + */ +int +xrep_reap_ifork( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork) +{ + xfs_fileoff_t off = 0; + int bmap_flags = xfs_bmapi_aflag(whichfork); + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(ip == sc->ip || ip == sc->tempip); + ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip)); + + while (off < XFS_MAX_FILEOFF) { + struct xfs_bmbt_irec imap; + int nimaps = 1; + + /* Read the next extent, skip past holes and delalloc. */ + error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap, + &nimaps, bmap_flags); + if (error) + return error; + if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) { + ASSERT(0); + return -EFSCORRUPTED; + } + + /* + * If this is a real space mapping, reap as much of it as we + * can in a single transaction. + */ + if (xfs_bmap_is_real_extent(&imap)) { + error = xreap_ifork_extent(sc, ip, whichfork, &imap); + if (error) + return error; + + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + off = imap.br_startoff + imap.br_blockcount; + } + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index fe24626af164..3f2f1775e29d 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -6,7 +6,33 @@ #ifndef __XFS_SCRUB_REAP_H__ #define __XFS_SCRUB_REAP_H__ +struct xagb_bitmap; +struct xfsb_bitmap; + int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); +int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo); +int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork); + +/* Buffer cache scan context. */ +struct xrep_bufscan { + /* Disk address for the buffers we want to scan. */ + xfs_daddr_t daddr; + + /* Maximum number of sectors to scan. */ + xfs_daddr_t max_sectors; + + /* Each round, increment the search length by this number of sectors. */ + xfs_daddr_t daddr_step; + + /* Internal scan state; initialize to zero. */ + xfs_daddr_t __sector_count; +}; + +xfs_daddr_t xrep_bufscan_max_sectors(struct xfs_mount *mp, + xfs_extlen_t fsblocks); +struct xfs_buf *xrep_bufscan_advance(struct xfs_mount *mp, + struct xrep_bufscan *scan); #endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 304ea1e1bfb0..cccf39d917a0 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -7,8 +7,10 @@ #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_rmap.h" @@ -17,6 +19,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "scrub/repair.h" /* * Set us up to scrub reference count btrees. @@ -27,6 +30,15 @@ xchk_setup_ag_refcountbt( { if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + if (xchk_could_repair(sc)) { + int error; + + error = xrep_setup_ag_refcountbt(sc); + if (error) + return error; + } + return xchk_setup_ag_btree(sc, false); } @@ -441,7 +453,7 @@ xchk_refcountbt_rec( struct xchk_refcbt_records *rrc = bs->private; xfs_refcount_btrec_to_irec(rec, &irec); - if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) { + if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -478,7 +490,7 @@ xchk_refcount_xref_rmap( struct xfs_scrub *sc, xfs_filblks_t cow_blocks) { - xfs_extlen_t refcbt_blocks = 0; + xfs_filblks_t refcbt_blocks = 0; xfs_filblks_t blocks; int error; diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c new file mode 100644 index 000000000000..a00d7ce7ae5b --- /dev/null +++ b/fs/xfs/scrub/refcount_repair.c @@ -0,0 +1,751 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_error.h" +#include "xfs_ag.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" +#include "scrub/rcbag.h" + +/* + * Rebuilding the Reference Count Btree + * ==================================== + * + * This algorithm is "borrowed" from xfs_repair. Imagine the rmap + * entries as rectangles representing extents of physical blocks, and + * that the rectangles can be laid down to allow them to overlap each + * other; then we know that we must emit a refcnt btree entry wherever + * the amount of overlap changes, i.e. the emission stimulus is + * level-triggered: + * + * - --- + * -- ----- ---- --- ------ + * -- ---- ----------- ---- --------- + * -------------------------------- ----------- + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 + * + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). + * + * Note that in the actual refcnt btree we don't store the refcount < 2 + * cases because the bnobt tells us which blocks are free; single-use + * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt + * supports storing multiple entries covering a given block we could + * theoretically dispense with the refcntbt and simply count rmaps, but + * that's inefficient in the (hot) write path, so we'll take the cost of + * the extra tree to save time. Also there's no guarantee that rmap + * will be enabled. + * + * Given an array of rmaps sorted by physical block number, a starting + * physical block (sp), a bag to hold rmaps that cover sp, and the next + * physical block where the level changes (np), we can reconstruct the + * refcount btree as follows: + * + * While there are still unprocessed rmaps in the array, + * - Set sp to the physical block (pblk) of the next unprocessed rmap. + * - Add to the bag all rmaps in the array where startblock == sp. + * - Set np to the physical block where the bag size will change. This + * is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * - Record the bag size as old_bag_size. + * + * - While the bag isn't empty, + * - Remove from the bag all rmaps where startblock + len == np. + * - Add to the bag all rmaps in the array where startblock == np. + * - If the bag size isn't old_bag_size, store the refcount entry + * (sp, np - sp, bag_size) in the refcnt btree. + * - If the bag is empty, break out of the inner loop. + * - Set old_bag_size to the bag size + * - Set sp = np. + * - Set np to the physical block where the bag size will change. + * This is the minimum of (the pblk of the next unprocessed rmap) + * and (startblock + len of each rmap in the bag). + * + * Like all the other repairers, we make a list of all the refcount + * records we need, then reinitialize the refcount btree root and + * insert all the records. + */ + +struct xrep_refc { + /* refcount extents */ + struct xfarray *refcount_records; + + /* new refcountbt information */ + struct xrep_newbt new_btree; + + /* old refcountbt blocks */ + struct xagb_bitmap old_refcountbt_blocks; + + struct xfs_scrub *sc; + + /* get_records()'s position in the refcount record array. */ + xfarray_idx_t array_cur; + + /* # of refcountbt blocks */ + xfs_extlen_t btblocks; +}; + +/* Set us up to repair refcount btrees. */ +int +xrep_setup_ag_refcountbt( + struct xfs_scrub *sc) +{ + char *descr; + int error; + + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + return error; +} + +/* Check for any obvious conflicts with this shared/CoW staging extent. */ +STATIC int +xrep_refc_check_ext( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock, + rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rc_startblock, rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record a reference count extent. */ +STATIC int +xrep_refc_stash( + struct xrep_refc *rr, + enum xfs_refc_domain domain, + xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t refcount) +{ + struct xfs_refcount_irec irec = { + .rc_startblock = agbno, + .rc_blockcount = len, + .rc_domain = domain, + }; + struct xfs_scrub *sc = rr->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount); + + error = xrep_refc_check_ext(rr->sc, &irec); + if (error) + return error; + + trace_xrep_refc_found(sc->sa.pag, &irec); + + return xfarray_append(rr->refcount_records, &irec); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_refc_stash_cow( + struct xrep_refc *rr, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1); +} + +/* Decide if an rmap could describe a shared extent. */ +static inline bool +xrep_refc_rmap_shareable( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rmap) +{ + /* AG metadata are never sharable */ + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + + /* Metadata in files are never shareable */ + if (xfs_internal_inum(mp, rmap->rm_owner)) + return false; + + /* Metadata and unwritten file blocks are not shareable. */ + if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) + return false; + + return true; +} + +/* + * Walk along the reverse mapping records until we find one that could describe + * a shared extent. + */ +STATIC int +xrep_refc_walk_rmaps( + struct xrep_refc *rr, + struct xfs_rmap_irec *rmap, + bool *have_rec) +{ + struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur; + struct xfs_mount *mp = cur->bc_mp; + int have_gt; + int error = 0; + + *have_rec = false; + + /* + * Loop through the remaining rmaps. Remember CoW staging + * extents and the refcountbt blocks from the old tree for later + * disposal. We can only share written data fork extents, so + * keep looping until we find an rmap for one. + */ + do { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + return error; + if (!have_gt) + return 0; + + error = xfs_rmap_get_rec(cur, rmap, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(mp, !have_gt)) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + + if (rmap->rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_refc_stash_cow(rr, rmap->rm_startblock, + rmap->rm_blockcount); + if (error) + return error; + } else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) { + /* refcountbt block, dump it when we're done. */ + rr->btblocks += rmap->rm_blockcount; + error = xagb_bitmap_set(&rr->old_refcountbt_blocks, + rmap->rm_startblock, + rmap->rm_blockcount); + if (error) + return error; + } + } while (!xrep_refc_rmap_shareable(mp, rmap)); + + *have_rec = true; + return 0; +} + +static inline uint32_t +xrep_refc_encode_startblock( + const struct xfs_refcount_irec *irec) +{ + uint32_t start; + + start = irec->rc_startblock & ~XFS_REFC_COWFLAG; + if (irec->rc_domain == XFS_REFC_DOMAIN_COW) + start |= XFS_REFC_COWFLAG; + + return start; +} + +/* Sort in the same order as the ondisk records. */ +static int +xrep_refc_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_refcount_irec *ap = a; + const struct xfs_refcount_irec *bp = b; + uint32_t sa, sb; + + sa = xrep_refc_encode_startblock(ap); + sb = xrep_refc_encode_startblock(bp); + + if (sa > sb) + return 1; + if (sa < sb) + return -1; + return 0; +} + +/* + * Sort the refcount extents by startblock or else the btree records will be in + * the wrong order. Make sure the records do not overlap in physical space. + */ +STATIC int +xrep_refc_sort_records( + struct xrep_refc *rr) +{ + struct xfs_refcount_irec irec; + xfarray_idx_t cur; + enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rr->refcount_records, cur) { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfarray_load(rr->refcount_records, cur, &irec); + if (error) + return error; + + if (dom == XFS_REFC_DOMAIN_SHARED && + irec.rc_domain == XFS_REFC_DOMAIN_COW) { + dom = irec.rc_domain; + next_agbno = 0; + } + + if (dom != irec.rc_domain) + return -EFSCORRUPTED; + if (irec.rc_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = irec.rc_startblock + irec.rc_blockcount; + } + + return error; +} + +/* + * Walk forward through the rmap btree to collect all rmaps starting at + * @bno in @rmap_bag. These represent the file(s) that share ownership of + * the current block. Upon return, the rmap cursor points to the last record + * satisfying the startblock constraint. + */ +static int +xrep_refc_push_rmaps_at( + struct xrep_refc *rr, + struct rcbag *rcstack, + xfs_agblock_t bno, + struct xfs_rmap_irec *rmap, + bool *have) +{ + struct xfs_scrub *sc = rr->sc; + int have_gt; + int error; + + while (*have && rmap->rm_startblock == bno) { + error = rcbag_add(rcstack, rr->sc->tp, rmap); + if (error) + return error; + + error = xrep_refc_walk_rmaps(rr, rmap, have); + if (error) + return error; + } + + error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) { + xfs_btree_mark_sick(sc->sa.rmap_cur); + return -EFSCORRUPTED; + } + + return 0; +} + +/* Iterate all the rmap records to generate reference count data. */ +STATIC int +xrep_refc_find_refcounts( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct rcbag *rcstack; + uint64_t old_stack_height; + xfs_agblock_t sbno; + xfs_agblock_t cbno; + xfs_agblock_t nbno; + bool have; + int error; + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Set up a bag to store all the rmap records that we're tracking to + * generate a reference count record. If the size of the bag exceeds + * MAXREFCOUNT, we clamp rc_refcount. + */ + error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); + if (error) + goto out_cur; + + /* Start the rmapbt cursor to the left of all records. */ + error = xfs_btree_goto_left_edge(sc->sa.rmap_cur); + if (error) + goto out_bag; + + /* Process reverse mappings into refcount data. */ + while (xfs_btree_has_more_records(sc->sa.rmap_cur)) { + struct xfs_rmap_irec rmap; + + /* Push all rmaps with pblk == sbno onto the stack */ + error = xrep_refc_walk_rmaps(rr, &rmap, &have); + if (error) + goto out_bag; + if (!have) + break; + sbno = cbno = rmap.rm_startblock; + error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap, + &have); + if (error) + goto out_bag; + + /* Set nbno to the bno of the next refcount change */ + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + old_stack_height = rcbag_count(rcstack); + + /* While stack isn't empty... */ + while (rcbag_count(rcstack) > 0) { + /* Pop all rmaps that end at nbno */ + error = rcbag_remove_ending_at(rcstack, sc->tp, nbno); + if (error) + goto out_bag; + + /* Push array items that start at nbno */ + error = xrep_refc_walk_rmaps(rr, &rmap, &have); + if (error) + goto out_bag; + if (have) { + error = xrep_refc_push_rmaps_at(rr, rcstack, + nbno, &rmap, &have); + if (error) + goto out_bag; + } + + /* Emit refcount if necessary */ + ASSERT(nbno > cbno); + if (rcbag_count(rcstack) != old_stack_height) { + if (old_stack_height > 1) { + error = xrep_refc_stash(rr, + XFS_REFC_DOMAIN_SHARED, + cbno, nbno - cbno, + old_stack_height); + if (error) + goto out_bag; + } + cbno = nbno; + } + + /* Stack empty, go find the next rmap */ + if (rcbag_count(rcstack) == 0) + break; + old_stack_height = rcbag_count(rcstack); + sbno = nbno; + + /* Set nbno to the bno of the next refcount change */ + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, + &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + } + } + + ASSERT(rcbag_count(rcstack) == 0); +out_bag: + rcbag_free(&rcstack); +out_cur: + xchk_ag_btcur_free(&sc->sa); + return error; +} + +/* Retrieve refcountbt data for bulk load. */ +STATIC int +xrep_refc_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + struct xrep_refc *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(rr->refcount_records, rr->array_cur++, + irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_refc_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_refc *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Update the AGF counters. */ +STATIC int +xrep_refc_reset_counters( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the refcountbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_refcount_level = pag->pagf_refcount_level; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected refcount information to stage a new refcount btree. If + * this is successful we'll return with the new btree root information logged + * to the repair transaction but not yet committed. + */ +STATIC int +xrep_refc_build_new_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *refc_cur; + struct xfs_perag *pag = sc->sa.pag; + xfs_fsblock_t fsbno; + int error; + + error = xrep_refc_sort_records(rr); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp)); + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno, + XFS_AG_RESV_METADATA); + rr->new_btree.bload.get_records = xrep_refc_get_records; + rr->new_btree.bload.claim_block = xrep_refc_claim_block; + + /* Compute how many blocks we'll need. */ + refc_cur = xfs_refcountbt_init_cursor(sc->mp, NULL, NULL, pag); + xfs_btree_stage_afakeroot(refc_cur, &rr->new_btree.afake); + error = xfs_btree_bload_compute_geometry(refc_cur, + &rr->new_btree.bload, + xfarray_length(rr->refcount_records)); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the incore btree height so + * that we don't trip the verifiers when writing the new btree blocks + * to disk. + */ + pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height; + + /* Add all observed refcount records. */ + rr->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr); + if (error) + goto err_level; + + /* + * Install the new btree in the AG header. After this point the old + * btree is no longer accessible and the new tree is live. + */ + xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(refc_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_refc_reset_counters(rr); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_level: + pag->pagf_repair_refcount_level = 0; +err_cur: + xfs_btree_del_cursor(refc_cur, error); +err_newbt: + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_refc_remove_old_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + int error; + + /* Free the old refcountbt blocks if they're not in use. */ + error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks, + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + if (error) + return error; + + /* + * Now that we've zapped all the old refcountbt blocks we can turn off + * the alternate height mechanism and reset the per-AG space + * reservations. + */ + pag->pagf_repair_refcount_level = 0; + sc->flags |= XREP_RESET_PERAG_RESV; + return 0; +} + +/* Rebuild the refcount btree. */ +int +xrep_refcountbt( + struct xfs_scrub *sc) +{ + struct xrep_refc *rr; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + rr->sc = sc; + + /* Set up enough storage to handle one refcount record per block. */ + descr = xchk_xfile_ag_descr(sc, "reference count records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks, + sizeof(struct xfs_refcount_irec), + &rr->refcount_records); + kfree(descr); + if (error) + goto out_rr; + + /* Collect all reference counts. */ + xagb_bitmap_init(&rr->old_refcountbt_blocks); + error = xrep_refc_find_refcounts(rr); + if (error) + goto out_bitmap; + + /* Rebuild the refcount information. */ + error = xrep_refc_build_new_tree(rr); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_refc_remove_old_tree(rr); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&rr->old_refcountbt_blocks); + xfarray_destroy(rr->refcount_records); +out_rr: + kfree(rr); + return error; +} diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1b8b5439f2d7..155bbaaa496e 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -27,12 +27,23 @@ #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_defer.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_reflink.h" +#include "xfs_health.h" +#include "xfs_buf_mem.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_dir2.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" #include "scrub/stats.h" +#include "scrub/xfile.h" +#include "scrub/attr_repair.h" /* * Attempt to repair some metadata, if the metadata is corrupt and userspace @@ -176,6 +187,16 @@ xrep_roll_ag_trans( return 0; } +/* Roll the scrub transaction, holding the primary metadata locked. */ +int +xrep_roll_trans( + struct xfs_scrub *sc) +{ + if (!sc->ip) + return xrep_roll_ag_trans(sc); + return xfs_trans_roll_inode(&sc->tp, sc->ip); +} + /* Finish all deferred work attached to the repair transaction. */ int xrep_defer_finish( @@ -274,7 +295,7 @@ xrep_calc_ag_resblks( icount = pag->pagi_count; } else { /* Try to get the actual counters from disk. */ - error = xfs_ialloc_read_agi(pag, NULL, &bp); + error = xfs_ialloc_read_agi(pag, NULL, 0, &bp); if (!error) { icount = pag->pagi_count; xfs_buf_relse(bp); @@ -387,7 +408,7 @@ xrep_calc_ag_resblks( int xrep_fix_freelist( struct xfs_scrub *sc, - bool can_shrink) + int alloc_flags) { struct xfs_alloc_arg args = {0}; @@ -397,8 +418,7 @@ xrep_fix_freelist( args.alignment = 1; args.pag = sc->sa.pag; - return xfs_alloc_fix_freelist(&args, - can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); + return xfs_alloc_fix_freelist(&args, alloc_flags); } /* @@ -673,6 +693,45 @@ xrep_find_ag_btree_roots( return error; } +#ifdef CONFIG_XFS_QUOTA +/* Update some quota flags in the superblock. */ +void +xrep_update_qflags( + struct xfs_scrub *sc, + unsigned int clear_flags, + unsigned int set_flags) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); + if ((mp->m_qflags & clear_flags) == 0 && + (mp->m_qflags & set_flags) == set_flags) + goto no_update; + + mp->m_qflags &= ~clear_flags; + mp->m_qflags |= set_flags; + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags &= ~clear_flags; + mp->m_sb.sb_qflags |= set_flags; + spin_unlock(&mp->m_sb_lock); + + /* + * Update the quota flags in the ondisk superblock without touching + * the summary counters. We have not quiesced inode chunk allocation, + * so we cannot coordinate with updates to the icount and ifree percpu + * counters. + */ + bp = xfs_trans_getsb(sc->tp); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); + xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1); + +no_update: + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); +} + /* Force a quotacheck the next time we mount. */ void xrep_force_quotacheck( @@ -685,13 +744,7 @@ xrep_force_quotacheck( if (!(flag & sc->mp->m_qflags)) return; - mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); - sc->mp->m_qflags &= ~flag; - spin_lock(&sc->mp->m_sb_lock); - sc->mp->m_sb.sb_qflags &= ~flag; - spin_unlock(&sc->mp->m_sb_lock); - xfs_log_sb(sc->tp); - mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); + xrep_update_qflags(sc, flag, 0); } /* @@ -699,10 +752,10 @@ xrep_force_quotacheck( * * This function ensures that the appropriate dquots are attached to an inode. * We cannot allow the dquot code to allocate an on-disk dquot block here - * because we're already in transaction context with the inode locked. The - * on-disk dquot should already exist anyway. If the quota code signals - * corruption or missing quota information, schedule quotacheck, which will - * repair corruptions in the quota metadata. + * because we're already in transaction context. The on-disk dquot should + * already exist anyway. If the quota code signals corruption or missing quota + * information, schedule quotacheck, which will repair corruptions in the quota + * metadata. */ int xrep_ino_dqattach( @@ -710,7 +763,10 @@ xrep_ino_dqattach( { int error; - error = xfs_qm_dqattach_locked(sc->ip, false); + ASSERT(sc->tp != NULL); + ASSERT(sc->ip != NULL); + + error = xfs_qm_dqattach(sc->ip); switch (error) { case -EFSBADCRC: case -EFSCORRUPTED: @@ -734,3 +790,419 @@ xrep_ino_dqattach( return error; } +#endif /* CONFIG_XFS_QUOTA */ + +/* + * Ensure that the inode being repaired is ready to handle a certain number of + * extents, or return EFSCORRUPTED. Caller must hold the ILOCK of the inode + * being repaired and have joined it to the scrub transaction. + */ +int +xrep_ino_ensure_extent_count( + struct xfs_scrub *sc, + int whichfork, + xfs_extnum_t nextents) +{ + xfs_extnum_t max_extents; + bool inode_has_nrext64; + + inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip); + max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork); + if (nextents <= max_extents) + return 0; + if (inode_has_nrext64) + return -EFSCORRUPTED; + if (!xfs_has_large_extent_counts(sc->mp)) + return -EFSCORRUPTED; + + max_extents = xfs_iext_max_nextents(true, whichfork); + if (nextents > max_extents) + return -EFSCORRUPTED; + + sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return 0; +} + +/* + * Initialize all the btree cursors for an AG repair except for the btree that + * we're rebuilding. + */ +void +xrep_ag_btcur_init( + struct xfs_scrub *sc, + struct xchk_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + + /* Set up a bnobt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT && + sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) { + sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); + sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); + } + + /* Set up a inobt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT && + sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) { + sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, + sa->agi_bp); + if (xfs_has_finobt(mp)) + sa->fino_cur = xfs_finobt_init_cursor(sc->sa.pag, + sc->tp, sa->agi_bp); + } + + /* Set up a rmapbt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT && + xfs_has_rmapbt(mp)) + sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); + + /* Set up a refcountbt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT && + xfs_has_reflink(mp)) + sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, + sa->agf_bp, sc->sa.pag); +} + +/* + * Reinitialize the in-core AG state after a repair by rereading the AGF + * buffer. We had better get the same AGF buffer as the one that's attached + * to the scrub context. + */ +int +xrep_reinit_pagf( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agf(pag)); + + clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp); + if (error) + return error; + + if (bp != sc->sa.agf_bp) { + ASSERT(bp == sc->sa.agf_bp); + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Reinitialize the in-core AG state after a repair by rereading the AGI + * buffer. We had better get the same AGI buffer as the one that's attached + * to the scrub context. + */ +int +xrep_reinit_pagi( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agi(pag)); + + clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); + error = xfs_ialloc_read_agi(pag, sc->tp, 0, &bp); + if (error) + return error; + + if (bp != sc->sa.agi_bp) { + ASSERT(bp == sc->sa.agi_bp); + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Given an active reference to a perag structure, load AG headers and cursors. + * This should only be called to scan an AG while repairing file-based metadata. + */ +int +xrep_ag_init( + struct xfs_scrub *sc, + struct xfs_perag *pag, + struct xchk_ag *sa) +{ + int error; + + ASSERT(!sa->pag); + + error = xfs_ialloc_read_agi(pag, sc->tp, 0, &sa->agi_bp); + if (error) + return error; + + error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp); + if (error) + return error; + + /* Grab our own passive reference from the caller's ref. */ + sa->pag = xfs_perag_hold(pag); + xrep_ag_btcur_init(sc, sa); + return 0; +} + +/* Reinitialize the per-AG block reservation for the AG we just fixed. */ +int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + int error; + + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(sc->sa.pag != NULL); + ASSERT(sc->ops->type == ST_PERAG); + ASSERT(sc->tp); + + sc->flags &= ~XREP_RESET_PERAG_RESV; + xfs_ag_resv_free(sc->sa.pag); + error = xfs_ag_resv_init(sc->sa.pag, sc->tp); + if (error == -ENOSPC) { + xfs_err(sc->mp, +"Insufficient free space to reset per-AG reservation for AG %u after repair.", + sc->sa.pag->pag_agno); + error = 0; + } + + return error; +} + +/* Decide if we are going to call the repair function for a scrub type. */ +bool +xrep_will_attempt( + struct xfs_scrub *sc) +{ + /* Userspace asked us to rebuild the structure regardless. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) + return true; + + /* Let debug users force us into the repair routines. */ + if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + return true; + + /* Metadata is corrupt or failed cross-referencing. */ + if (xchk_needs_repair(sc->sm)) + return true; + + return false; +} + +/* Try to fix some part of a metadata inode by calling another scrubber. */ +STATIC int +xrep_metadata_inode_subtype( + struct xfs_scrub *sc, + unsigned int scrub_type) +{ + struct xfs_scrub_subord *sub; + int error; + + /* + * Let's see if the inode needs repair. Use a subordinate scrub context + * to call the scrub and repair functions so that we can hang on to the + * resources that we already acquired instead of using the standard + * setup/teardown routines. + */ + sub = xchk_scrub_create_subord(sc, scrub_type); + error = sub->sc.ops->scrub(&sub->sc); + if (error) + goto out; + if (!xrep_will_attempt(&sub->sc)) + goto out; + + /* + * Repair some part of the inode. This will potentially join the inode + * to the transaction. + */ + error = sub->sc.ops->repair(&sub->sc); + if (error) + goto out; + + /* + * Finish all deferred intent items and then roll the transaction so + * that the inode will not be joined to the transaction when we exit + * the function. + */ + error = xfs_defer_finish(&sub->sc.tp); + if (error) + goto out; + error = xfs_trans_roll(&sub->sc.tp); + if (error) + goto out; + + /* + * Clear the corruption flags and re-check the metadata that we just + * repaired. + */ + sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + error = sub->sc.ops->scrub(&sub->sc); + if (error) + goto out; + + /* If corruption persists, the repair has failed. */ + if (xchk_needs_repair(sub->sc.sm)) { + error = -EFSCORRUPTED; + goto out; + } +out: + xchk_scrub_free_subord(sub); + return error; +} + +/* + * Repair the ondisk forks of a metadata inode. The caller must ensure that + * sc->ip points to the metadata inode and the ILOCK is held on that inode. + * The inode must not be joined to the transaction before the call, and will + * not be afterwards. + */ +int +xrep_metadata_inode_forks( + struct xfs_scrub *sc) +{ + bool dirty = false; + int error; + + /* Repair the inode record and the data fork. */ + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); + if (error) + return error; + + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); + if (error) + return error; + + /* Make sure the attr fork looks ok before we delete it. */ + if (xfs_inode_hasattr(sc->ip)) { + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); + if (error) + return error; + } + + /* Clear the reflink flag since metadata never shares. */ + if (xfs_is_reflink_inode(sc->ip)) { + dirty = true; + xfs_trans_ijoin(sc->tp, sc->ip, 0); + error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); + if (error) + return error; + } + + /* Clear the attr forks since metadata shouldn't have that. */ + if (xfs_inode_hasattr(sc->ip)) { + if (!dirty) { + dirty = true; + xfs_trans_ijoin(sc->tp, sc->ip, 0); + } + error = xrep_xattr_reset_fork(sc); + if (error) + return error; + } + + /* + * If we modified the inode, roll the transaction but don't rejoin the + * inode to the new transaction because xrep_bmap_data can do that. + */ + if (dirty) { + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + dirty = false; + } + + return 0; +} + +/* + * Set up an in-memory buffer cache so that we can use the xfbtree. Allocating + * a shmem file might take loks, so we cannot be in transaction context. Park + * our resources in the scrub context and let the teardown function take care + * of them at the right time. + */ +int +xrep_setup_xfbtree( + struct xfs_scrub *sc, + const char *descr) +{ + ASSERT(sc->tp == NULL); + + return xmbuf_alloc(sc->mp, descr, &sc->xmbtp); +} + +/* + * Create a dummy transaction for use in a live update hook function. This + * function MUST NOT be called from regular repair code because the current + * process' transaction is saved via the cookie. + */ +int +xrep_trans_alloc_hook_dummy( + struct xfs_mount *mp, + void **cookiep, + struct xfs_trans **tpp) +{ + int error; + + *cookiep = current->journal_info; + current->journal_info = NULL; + + error = xfs_trans_alloc_empty(mp, tpp); + if (!error) + return 0; + + current->journal_info = *cookiep; + *cookiep = NULL; + return error; +} + +/* Cancel a dummy transaction used by a live update hook function. */ +void +xrep_trans_cancel_hook_dummy( + void **cookiep, + struct xfs_trans *tp) +{ + xfs_trans_cancel(tp); + current->journal_info = *cookiep; + *cookiep = NULL; +} + +/* + * See if this buffer can pass the given ->verify_struct() function. + * + * If the buffer already has ops attached and they're not the ones that were + * passed in, we reject the buffer. Otherwise, we perform the structure test + * (note that we do not check CRCs) and return the outcome of the test. The + * buffer ops and error state are left unchanged. + */ +bool +xrep_buf_verify_struct( + struct xfs_buf *bp, + const struct xfs_buf_ops *ops) +{ + const struct xfs_buf_ops *old_ops = bp->b_ops; + xfs_failaddr_t fa; + int old_error; + + if (old_ops) { + if (old_ops != ops) + return false; + } + + old_error = bp->b_error; + bp->b_ops = ops; + fa = bp->b_ops->verify_struct(bp); + bp->b_ops = old_ops; + bp->b_error = old_error; + + return fa == NULL; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 60d2a9ae5f2e..96180176c582 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -28,17 +28,30 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) /* Repair helpers */ int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); +bool xrep_will_attempt(struct xfs_scrub *sc); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); +int xrep_roll_trans(struct xfs_scrub *sc); int xrep_defer_finish(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, enum xfs_ag_resv_type type); xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc); +static inline int +xrep_trans_commit( + struct xfs_scrub *sc) +{ + int error = xfs_trans_commit(sc->tp); + + sc->tp = NULL; + return error; +} + struct xbitmap; struct xagb_bitmap; +struct xfsb_bitmap; -int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); +int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags); struct xrep_find_ag_btree { /* in: rmap owner of the btree we're looking for */ @@ -57,8 +70,47 @@ struct xrep_find_ag_btree { int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); + +#ifdef CONFIG_XFS_QUOTA +void xrep_update_qflags(struct xfs_scrub *sc, unsigned int clear_flags, + unsigned int set_flags); void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); +#else +# define xrep_force_quotacheck(sc, type) ((void)0) +# define xrep_ino_dqattach(sc) (0) +#endif /* CONFIG_XFS_QUOTA */ + +int xrep_setup_xfbtree(struct xfs_scrub *sc, const char *descr); + +int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork, + xfs_extnum_t nextents); +int xrep_reset_perag_resv(struct xfs_scrub *sc); +int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten); +int xrep_metadata_inode_forks(struct xfs_scrub *sc); +int xrep_setup_ag_rmapbt(struct xfs_scrub *sc); +int xrep_setup_ag_refcountbt(struct xfs_scrub *sc); +int xrep_setup_xattr(struct xfs_scrub *sc); +int xrep_setup_directory(struct xfs_scrub *sc); +int xrep_setup_parent(struct xfs_scrub *sc); +int xrep_setup_nlinks(struct xfs_scrub *sc); +int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks); +int xrep_setup_dirtree(struct xfs_scrub *sc); + +/* Repair setup functions */ +int xrep_setup_ag_allocbt(struct xfs_scrub *sc); + +struct xfs_imap; +int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap); + +void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); +int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag, + struct xchk_ag *sa); + +/* Metadata revalidators */ + +int xrep_revalidate_allocbt(struct xfs_scrub *sc); +int xrep_revalidate_iallocbt(struct xfs_scrub *sc); /* Metadata repairers */ @@ -67,9 +119,61 @@ int xrep_superblock(struct xfs_scrub *sc); int xrep_agf(struct xfs_scrub *sc); int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); +int xrep_allocbt(struct xfs_scrub *sc); +int xrep_iallocbt(struct xfs_scrub *sc); +int xrep_rmapbt(struct xfs_scrub *sc); +int xrep_refcountbt(struct xfs_scrub *sc); +int xrep_inode(struct xfs_scrub *sc); +int xrep_bmap_data(struct xfs_scrub *sc); +int xrep_bmap_attr(struct xfs_scrub *sc); +int xrep_bmap_cow(struct xfs_scrub *sc); +int xrep_nlinks(struct xfs_scrub *sc); +int xrep_fscounters(struct xfs_scrub *sc); +int xrep_xattr(struct xfs_scrub *sc); +int xrep_directory(struct xfs_scrub *sc); +int xrep_parent(struct xfs_scrub *sc); +int xrep_symlink(struct xfs_scrub *sc); +int xrep_dirtree(struct xfs_scrub *sc); + +#ifdef CONFIG_XFS_RT +int xrep_rtbitmap(struct xfs_scrub *sc); +int xrep_rtsummary(struct xfs_scrub *sc); +#else +# define xrep_rtbitmap xrep_notsupported +# define xrep_rtsummary xrep_notsupported +#endif /* CONFIG_XFS_RT */ + +#ifdef CONFIG_XFS_QUOTA +int xrep_quota(struct xfs_scrub *sc); +int xrep_quotacheck(struct xfs_scrub *sc); +#else +# define xrep_quota xrep_notsupported +# define xrep_quotacheck xrep_notsupported +#endif /* CONFIG_XFS_QUOTA */ + +int xrep_reinit_pagf(struct xfs_scrub *sc); +int xrep_reinit_pagi(struct xfs_scrub *sc); + +int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep, + struct xfs_trans **tpp); +void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp); + +bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops); #else +#define xrep_ino_dqattach(sc) (0) + +/* + * When online repair is not built into the kernel, we still want to attempt + * the repair so that the stub xrep_attempt below will return EOPNOTSUPP. + */ +static inline bool xrep_will_attempt(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + xchk_needs_repair(sc->sm); +} + static inline int xrep_attempt( struct xfs_scrub *sc, @@ -87,11 +191,67 @@ xrep_calc_ag_resblks( return 0; } +static inline int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(0); + return -EOPNOTSUPP; +} + +/* repair setup functions for no-repair */ +static inline int +xrep_setup_nothing( + struct xfs_scrub *sc) +{ + return 0; +} +#define xrep_setup_ag_allocbt xrep_setup_nothing +#define xrep_setup_ag_rmapbt xrep_setup_nothing +#define xrep_setup_ag_refcountbt xrep_setup_nothing +#define xrep_setup_xattr xrep_setup_nothing +#define xrep_setup_directory xrep_setup_nothing +#define xrep_setup_parent xrep_setup_nothing +#define xrep_setup_nlinks xrep_setup_nothing +#define xrep_setup_dirtree xrep_setup_nothing + +#define xrep_setup_inode(sc, imap) ((void)0) + +static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x) +{ + return 0; +} + +#define xrep_revalidate_allocbt (NULL) +#define xrep_revalidate_iallocbt (NULL) + #define xrep_probe xrep_notsupported #define xrep_superblock xrep_notsupported #define xrep_agf xrep_notsupported #define xrep_agfl xrep_notsupported #define xrep_agi xrep_notsupported +#define xrep_allocbt xrep_notsupported +#define xrep_iallocbt xrep_notsupported +#define xrep_rmapbt xrep_notsupported +#define xrep_refcountbt xrep_notsupported +#define xrep_inode xrep_notsupported +#define xrep_bmap_data xrep_notsupported +#define xrep_bmap_attr xrep_notsupported +#define xrep_bmap_cow xrep_notsupported +#define xrep_rtbitmap xrep_notsupported +#define xrep_quota xrep_notsupported +#define xrep_quotacheck xrep_notsupported +#define xrep_nlinks xrep_notsupported +#define xrep_fscounters xrep_notsupported +#define xrep_rtsummary xrep_notsupported +#define xrep_xattr xrep_notsupported +#define xrep_directory xrep_notsupported +#define xrep_parent xrep_notsupported +#define xrep_symlink xrep_notsupported +#define xrep_dirtree xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index d29a26ecddd6..ba5bbc3fb754 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -24,6 +24,8 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/repair.h" /* * Set us up to scrub reverse mapping btrees. @@ -35,6 +37,14 @@ xchk_setup_ag_rmapbt( if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + if (xchk_could_repair(sc)) { + int error; + + error = xrep_setup_ag_rmapbt(sc); + if (error) + return error; + } + return xchk_setup_ag_btree(sc, false); } @@ -348,7 +358,7 @@ xchk_rmapbt_rec( struct xfs_rmap_irec irec; if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || - xfs_rmap_check_irec(bs->cur, &irec) != NULL) { + xfs_rmap_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -411,8 +421,8 @@ xchk_rmapbt_walk_ag_metadata( /* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */ cur = sc->sa.bno_cur; if (!cur) - cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); + cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); if (cur != sc->sa.bno_cur) xfs_btree_del_cursor(cur, error); @@ -421,8 +431,8 @@ xchk_rmapbt_walk_ag_metadata( cur = sc->sa.cnt_cur; if (!cur) - cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); if (cur != sc->sa.cnt_cur) xfs_btree_del_cursor(cur, error); @@ -446,8 +456,7 @@ xchk_rmapbt_walk_ag_metadata( /* OWN_INOBT: inobt, finobt */ cur = sc->sa.ino_cur; if (!cur) - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp, - XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp); error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); if (cur != sc->sa.ino_cur) xfs_btree_del_cursor(cur, error); @@ -457,8 +466,8 @@ xchk_rmapbt_walk_ag_metadata( if (xfs_has_finobt(sc->mp)) { cur = sc->sa.fino_cur; if (!cur) - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, - sc->sa.agi_bp, XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, + sc->sa.agi_bp); error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); if (cur != sc->sa.fino_cur) xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c new file mode 100644 index 000000000000..e8080eba37d2 --- /dev/null +++ b/fs/xfs/scrub/rmap_repair.c @@ -0,0 +1,1675 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Reverse Mapping Btree Repair + * ============================ + * + * This is the most involved of all the AG space btree rebuilds. Everywhere + * else in XFS we lock inodes and then AG data structures, but generating the + * list of rmap records requires that we be able to scan both block mapping + * btrees of every inode in the filesystem to see if it owns any extents in + * this AG. We can't tolerate any inode updates while we do this, so we + * freeze the filesystem to lock everyone else out, and grant ourselves + * special privileges to run transactions with regular background reclamation + * turned off. + * + * We also have to be very careful not to allow inode reclaim to start a + * transaction because all transactions (other than our own) will block. + * Deferred inode inactivation helps us out there. + * + * I) Reverse mappings for all non-space metadata and file data are collected + * according to the following algorithm: + * + * 1. For each fork of each inode: + * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary. + * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate + * bmaps into rmap records (see 1.1.4). Set bits in BMBIT for each btree + * block. + * 1.3. If the incore extent map is loaded but the fork is in btree format, + * just visit the bmbt blocks to set the corresponding BMBIT areas. + * 1.4. From the incore extent map, accumulate each bmap that falls into our + * target AG. Remember, multiple bmap records can map to a single rmap + * record, so we cannot simply emit rmap records 1:1. + * 1.5. Emit rmap records for each extent in BMBIT and free it. + * 2. Create bitmaps INOBIT and ICHUNKBIT. + * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT, + * and set bits in INOBIT for each btree block. If the inobt has no records + * at all, we must be careful to record its root in INOBIT. + * 4. For each block in the finobt, set the corresponding INOBIT area. + * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them. + * 6. Create bitmaps REFCBIT and COWBIT. + * 7. For each CoW staging extent in the refcountbt, set the corresponding + * areas in COWBIT. + * 8. For each block in the refcountbt, set the corresponding REFCBIT area. + * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them. + * A. Emit rmap for the AG headers. + * B. Emit rmap for the log, if there is one. + * + * II) The rmapbt shape and space metadata rmaps are computed as follows: + * + * 1. Count the rmaps collected in the previous step. (= NR) + * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB) + * 3. Reserve RMB blocks through the newbt using the allocator in normap mode. + * 4. Create bitmap AGBIT. + * 5. For each reservation in the newbt, set the corresponding areas in AGBIT. + * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT. + * 7. Count the extents in AGBIT. (= AGNR) + * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB') + * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB', + * and clear AGBIT. Go to step 5. + * A. Emit rmaps for each extent in AGBIT. + * + * III) The rmapbt is constructed and set in place as follows: + * + * 1. Sort the rmap records. + * 2. Bulk load the rmaps. + * + * IV) Reap the old btree blocks. + * + * 1. Create a bitmap OLDRMBIT. + * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT. + * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT. + * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are + * the parts of the AG that the rmap didn't find during its scan of the + * primary metadata and aren't known to be in the free space, which implies + * that they were the old rmapbt blocks. + * 5. Commit. + * + * We use the 'xrep_rmap' prefix for all the rmap functions. + */ + +/* Context for collecting rmaps */ +struct xrep_rmap { + /* new rmapbt information */ + struct xrep_newbt new_btree; + + /* lock for the xfbtree and xfile */ + struct mutex lock; + + /* rmap records generated from primary metadata */ + struct xfbtree rmap_btree; + + struct xfs_scrub *sc; + + /* in-memory btree cursor for the xfs_btree_bload iteration */ + struct xfs_btree_cur *mcur; + + /* Hooks into rmap update code. */ + struct xfs_rmap_hook rhook; + + /* inode scan cursor */ + struct xchk_iscan iscan; + + /* Number of non-freespace records found. */ + unsigned long long nr_records; + + /* bnobt/cntbt contribution to btreeblks */ + xfs_agblock_t freesp_btblocks; + + /* old agf_rmap_blocks counter */ + unsigned int old_rmapbt_fsbcount; +}; + +/* Set us up to repair reverse mapping btrees. */ +int +xrep_setup_ag_rmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rmap *rr; + char *descr; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP); + + descr = xchk_xfile_ag_descr(sc, "reverse mapping records"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + if (error) + return error; + + rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + + rr->sc = sc; + sc->buf = rr; + return 0; +} + +/* Make sure there's nothing funny about this mapping. */ +STATIC int +xrep_rmap_check_mapping( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Store a reverse-mapping record. */ +static inline int +xrep_rmap_stash( + struct xrep_rmap *rr, + xfs_agblock_t startblock, + xfs_extlen_t blockcount, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + struct xfs_rmap_irec rmap = { + .rm_startblock = startblock, + .rm_blockcount = blockcount, + .rm_owner = owner, + .rm_offset = offset, + .rm_flags = flags, + }; + struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *mcur; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + if (xchk_iscan_aborted(&rr->iscan)) + return -EFSCORRUPTED; + + trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap); + + mutex_lock(&rr->lock); + mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree); + error = xfs_rmap_map_raw(mcur, &rmap); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_cancel; + + error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp); + if (error) + goto out_abort; + + mutex_unlock(&rr->lock); + return 0; + +out_cancel: + xfbtree_trans_cancel(&rr->rmap_btree, sc->tp); +out_abort: + xchk_iscan_abort(&rr->iscan); + mutex_unlock(&rr->lock); + return error; +} + +struct xrep_rmap_stash_run { + struct xrep_rmap *rr; + uint64_t owner; + unsigned int rmap_flags; +}; + +static int +xrep_rmap_stash_run( + uint32_t start, + uint32_t len, + void *priv) +{ + struct xrep_rmap_stash_run *rsr = priv; + struct xrep_rmap *rr = rsr->rr; + + return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags); +} + +/* + * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure + * that the ranges are in units of FS blocks. + */ +STATIC int +xrep_rmap_stash_bitmap( + struct xrep_rmap *rr, + struct xagb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xrep_rmap_stash_run rsr = { + .rr = rr, + .owner = oinfo->oi_owner, + .rmap_flags = 0, + }; + + if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) + rsr.rmap_flags |= XFS_RMAP_ATTR_FORK; + if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) + rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK; + + return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr); +} + +/* Section (I): Finding all file and bmbt extents. */ + +/* Context for accumulating rmaps for an inode fork. */ +struct xrep_rmap_ifork { + /* + * Accumulate rmap data here to turn multiple adjacent bmaps into a + * single rmap. + */ + struct xfs_rmap_irec accum; + + /* Bitmap of bmbt blocks in this AG. */ + struct xagb_bitmap bmbt_blocks; + + struct xrep_rmap *rr; + + /* Which inode fork? */ + int whichfork; +}; + +/* Stash an rmap that we accumulated while walking an inode fork. */ +STATIC int +xrep_rmap_stash_accumulated( + struct xrep_rmap_ifork *rf) +{ + if (rf->accum.rm_blockcount == 0) + return 0; + + return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock, + rf->accum.rm_blockcount, rf->accum.rm_owner, + rf->accum.rm_offset, rf->accum.rm_flags); +} + +/* Accumulate a bmbt record. */ +STATIC int +xrep_rmap_visit_bmbt( + struct xfs_btree_cur *cur, + struct xfs_bmbt_irec *rec, + void *priv) +{ + struct xrep_rmap_ifork *rf = priv; + struct xfs_mount *mp = rf->rr->sc->mp; + struct xfs_rmap_irec *accum = &rf->accum; + xfs_agblock_t agbno; + unsigned int rmap_flags = 0; + int error; + + if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) != + rf->rr->sc->sa.pag->pag_agno) + return 0; + + agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock); + if (rf->whichfork == XFS_ATTR_FORK) + rmap_flags |= XFS_RMAP_ATTR_FORK; + if (rec->br_state == XFS_EXT_UNWRITTEN) + rmap_flags |= XFS_RMAP_UNWRITTEN; + + /* If this bmap is adjacent to the previous one, just add it. */ + if (accum->rm_blockcount > 0 && + rec->br_startoff == accum->rm_offset + accum->rm_blockcount && + agbno == accum->rm_startblock + accum->rm_blockcount && + rmap_flags == accum->rm_flags) { + accum->rm_blockcount += rec->br_blockcount; + return 0; + } + + /* Otherwise stash the old rmap and start accumulating a new one. */ + error = xrep_rmap_stash_accumulated(rf); + if (error) + return error; + + accum->rm_startblock = agbno; + accum->rm_blockcount = rec->br_blockcount; + accum->rm_offset = rec->br_startoff; + accum->rm_flags = rmap_flags; + return 0; +} + +/* Add a btree block to the bitmap. */ +STATIC int +xrep_rmap_visit_iroot_btree_block( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xrep_rmap_ifork *rf = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno) + return 0; + + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1); +} + +/* + * Iterate a metadata btree rooted in an inode to collect rmap records for + * anything in this fork that matches the AG. + */ +STATIC int +xrep_rmap_scan_iroot_btree( + struct xrep_rmap_ifork *rf, + struct xfs_btree_cur *cur) +{ + struct xfs_owner_info oinfo; + struct xrep_rmap *rr = rf->rr; + int error; + + xagb_bitmap_init(&rf->bmbt_blocks); + + /* Record all the blocks in the btree itself. */ + error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block, + XFS_BTREE_VISIT_ALL, rf); + if (error) + goto out; + + /* Emit rmaps for the btree blocks. */ + xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork); + error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo); + if (error) + goto out; + + /* Stash any remaining accumulated rmaps. */ + error = xrep_rmap_stash_accumulated(rf); +out: + xagb_bitmap_destroy(&rf->bmbt_blocks); + return error; +} + +/* + * Iterate the block mapping btree to collect rmap records for anything in this + * fork that matches the AG. Sets @mappings_done to true if we've scanned the + * block mappings in this fork. + */ +STATIC int +xrep_rmap_scan_bmbt( + struct xrep_rmap_ifork *rf, + struct xfs_inode *ip, + bool *mappings_done) +{ + struct xrep_rmap *rr = rf->rr; + struct xfs_btree_cur *cur; + struct xfs_ifork *ifp; + int error; + + *mappings_done = false; + ifp = xfs_ifork_ptr(ip, rf->whichfork); + cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork); + + if (!xfs_ifork_is_realtime(ip, rf->whichfork) && + xfs_need_iread_extents(ifp)) { + /* + * If the incore extent cache isn't loaded, scan the bmbt for + * mapping records. This avoids loading the incore extent + * tree, which will increase memory pressure at a time when + * we're trying to run as quickly as we possibly can. Ignore + * realtime extents. + */ + error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf); + if (error) + goto out_cur; + + *mappings_done = true; + } + + /* Scan for the bmbt blocks, which always live on the data device. */ + error = xrep_rmap_scan_iroot_btree(rf, cur); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Iterate the in-core extent cache to collect rmap records for anything in + * this fork that matches the AG. + */ +STATIC int +xrep_rmap_scan_iext( + struct xrep_rmap_ifork *rf, + struct xfs_ifork *ifp) +{ + struct xfs_bmbt_irec rec; + struct xfs_iext_cursor icur; + int error; + + for_each_xfs_iext(ifp, &icur, &rec) { + if (isnullstartblock(rec.br_startblock)) + continue; + error = xrep_rmap_visit_bmbt(NULL, &rec, rf); + if (error) + return error; + } + + return xrep_rmap_stash_accumulated(rf); +} + +/* Find all the extents from a given AG in an inode fork. */ +STATIC int +xrep_rmap_scan_ifork( + struct xrep_rmap *rr, + struct xfs_inode *ip, + int whichfork) +{ + struct xrep_rmap_ifork rf = { + .accum = { .rm_owner = ip->i_ino, }, + .rr = rr, + .whichfork = whichfork, + }; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + int error = 0; + + if (!ifp) + return 0; + + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + bool mappings_done; + + /* + * Scan the bmap btree for data device mappings. This includes + * the btree blocks themselves, even if this is a realtime + * file. + */ + error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done); + if (error || mappings_done) + return error; + } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { + return 0; + } + + /* Scan incore extent cache if this isn't a realtime file. */ + if (xfs_ifork_is_realtime(ip, whichfork)) + return 0; + + return xrep_rmap_scan_iext(&rf, ifp); +} + +/* + * Take ILOCK on a file that we want to scan. + * + * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded + * attr bmbt. Otherwise, take ILOCK_SHARED. + */ +static inline unsigned int +xrep_rmap_scan_ilock( + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + if (xfs_need_iread_extents(&ip->i_df)) { + lock_mode = XFS_ILOCK_EXCL; + goto lock; + } + + if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) + lock_mode = XFS_ILOCK_EXCL; + +lock: + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +/* Record reverse mappings for a file. */ +STATIC int +xrep_rmap_scan_inode( + struct xrep_rmap *rr, + struct xfs_inode *ip) +{ + unsigned int lock_mode = xrep_rmap_scan_ilock(ip); + int error; + + /* Check the data fork. */ + error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + + /* Check the attr fork. */ + error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK); + if (error) + goto out_unlock; + + /* COW fork extents are "owned" by the refcount btree. */ + + xchk_iscan_mark_visited(&rr->iscan, ip); +out_unlock: + xfs_iunlock(ip, lock_mode); + return error; +} + +/* Section (I): Find all AG metadata extents except for free space metadata. */ + +struct xrep_rmap_inodes { + struct xrep_rmap *rr; + struct xagb_bitmap inobt_blocks; /* INOBIT */ + struct xagb_bitmap ichunk_blocks; /* ICHUNKBIT */ +}; + +/* Record inode btree rmaps. */ +STATIC int +xrep_rmap_walk_inobt( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_inobt_rec_incore irec; + struct xrep_rmap_inodes *ri = priv; + struct xfs_mount *mp = cur->bc_mp; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agino_t agino; + xfs_agino_t iperhole; + unsigned int i; + int error; + + /* Record the inobt blocks. */ + error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur); + if (error) + return error; + + xfs_inobt_btrec_to_irec(mp, rec, &irec); + if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL) + return -EFSCORRUPTED; + + agino = irec.ir_startino; + + /* Record a non-sparse inode chunk. */ + if (!xfs_inobt_issparse(irec.ir_holemask)) { + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + aglen = max_t(xfs_extlen_t, 1, + XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock); + + return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen); + } + + /* Iterate each chunk. */ + iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock, + XFS_INODES_PER_HOLEMASK_BIT); + aglen = iperhole / mp->m_sb.sb_inopblock; + for (i = 0, agino = irec.ir_startino; + i < XFS_INOBT_HOLEMASK_BITS; + i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) { + /* Skip holes. */ + if (irec.ir_holemask & (1 << i)) + continue; + + /* Record the inode chunk otherwise. */ + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen); + if (error) + return error; + } + + return 0; +} + +/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */ +STATIC int +xrep_rmap_find_inode_rmaps( + struct xrep_rmap *rr) +{ + struct xrep_rmap_inodes ri = { + .rr = rr, + }; + struct xfs_scrub *sc = rr->sc; + int error; + + xagb_bitmap_init(&ri.inobt_blocks); + xagb_bitmap_init(&ri.ichunk_blocks); + + /* + * Iterate every record in the inobt so we can capture all the inode + * chunks and the blocks in the inobt itself. + */ + error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri); + if (error) + goto out_bitmap; + + /* + * Note that if there are zero records in the inobt then query_all does + * nothing and we have to account the empty inobt root manually. + */ + if (xagb_bitmap_empty(&ri.ichunk_blocks)) { + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + + error = xagb_bitmap_set(&ri.inobt_blocks, + be32_to_cpu(agi->agi_root), 1); + if (error) + goto out_bitmap; + } + + /* Scan the finobt too. */ + if (xfs_has_finobt(sc->mp)) { + error = xagb_bitmap_set_btblocks(&ri.inobt_blocks, + sc->sa.fino_cur); + if (error) + goto out_bitmap; + } + + /* Generate rmaps for everything. */ + error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks, + &XFS_RMAP_OINFO_INOBT); + if (error) + goto out_bitmap; + error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks, + &XFS_RMAP_OINFO_INODES); + +out_bitmap: + xagb_bitmap_destroy(&ri.inobt_blocks); + xagb_bitmap_destroy(&ri.ichunk_blocks); + return error; +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_rmap_walk_cowblocks( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + + if (!xfs_refcount_check_domain(irec) || + irec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount); +} + +/* + * Collect rmaps for the blocks containing the refcount btree, and all CoW + * staging extents. + */ +STATIC int +xrep_rmap_find_refcount_rmaps( + struct xrep_rmap *rr) +{ + struct xagb_bitmap refcountbt_blocks; /* REFCBIT */ + struct xagb_bitmap cow_blocks; /* COWBIT */ + struct xfs_refcount_irec low = { + .rc_startblock = 0, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_refcount_irec high = { + .rc_startblock = -1U, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_scrub *sc = rr->sc; + int error; + + if (!xfs_has_reflink(sc->mp)) + return 0; + + xagb_bitmap_init(&refcountbt_blocks); + xagb_bitmap_init(&cow_blocks); + + /* refcountbt */ + error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur); + if (error) + goto out_bitmap; + + /* Collect rmaps for CoW staging extents. */ + error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high, + xrep_rmap_walk_cowblocks, &cow_blocks); + if (error) + goto out_bitmap; + + /* Generate rmaps for everything. */ + error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW); + if (error) + goto out_bitmap; + error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks, + &XFS_RMAP_OINFO_REFC); + +out_bitmap: + xagb_bitmap_destroy(&cow_blocks); + xagb_bitmap_destroy(&refcountbt_blocks); + return error; +} + +/* Generate rmaps for the AG headers (AGI/AGF/AGFL) */ +STATIC int +xrep_rmap_find_agheader_rmaps( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + + /* Create a record for the AG sb->agfl. */ + return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp), + XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1, + XFS_RMAP_OWN_FS, 0, 0); +} + +/* Generate rmaps for the log, if it's in this AG. */ +STATIC int +xrep_rmap_find_log_rmaps( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + + if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno)) + return 0; + + return xrep_rmap_stash(rr, + XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart), + sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0); +} + +/* Check and count all the records that we gathered. */ +STATIC int +xrep_rmap_check_record( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rmap *rr = priv; + int error; + + error = xrep_rmap_check_mapping(rr->sc, rec); + if (error) + return error; + + rr->nr_records++; + return 0; +} + +/* + * Generate all the reverse-mappings for this AG, a list of the old rmapbt + * blocks, and the new btreeblks count. Figure out if we have enough free + * space to reconstruct the inode btrees. The caller must clean up the lists + * if anything goes wrong. This implements section (I) above. + */ +STATIC int +xrep_rmap_find_rmaps( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xchk_ag *sa = &sc->sa; + struct xfs_inode *ip; + struct xfs_btree_cur *mcur; + int error; + + /* Find all the per-AG metadata. */ + xrep_ag_btcur_init(sc, &sc->sa); + + error = xrep_rmap_find_inode_rmaps(rr); + if (error) + goto end_agscan; + + error = xrep_rmap_find_refcount_rmaps(rr); + if (error) + goto end_agscan; + + error = xrep_rmap_find_agheader_rmaps(rr); + if (error) + goto end_agscan; + + error = xrep_rmap_find_log_rmaps(rr); +end_agscan: + xchk_ag_btcur_free(&sc->sa); + if (error) + return error; + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Unlock the AG header buffers and cancel the transaction to release + * the log grant space while we scan the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. + * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done even though + * we take the IOLOCK to quiesce the file because empty transactions + * do not take sb_internal. + */ + sa->agf_bp = NULL; + sa->agi_bp = NULL; + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + /* Iterate all AGs for inodes rmaps. */ + while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) { + error = xrep_rmap_scan_inode(rr, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&rr->iscan); + if (error) + return error; + + /* + * Switch out for a real transaction and lock the AG headers in + * preparation for building a new tree. + */ + xchk_trans_cancel(sc); + error = xchk_setup_fs(sc); + if (error) + return error; + error = xchk_perag_drain_and_lock(sc); + if (error) + return error; + + /* + * If a hook failed to update the in-memory btree, we lack the data to + * continue the repair. + */ + if (xchk_iscan_aborted(&rr->iscan)) + return -EFSCORRUPTED; + + /* + * Now that we have everything locked again, we need to count the + * number of rmap records stashed in the btree. This should reflect + * all actively-owned space in the filesystem. At the same time, check + * all our records before we start building a new btree, which requires + * a bnobt cursor. + */ + mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree); + sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + + rr->nr_records = 0; + error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr); + + xfs_btree_del_cursor(sc->sa.bno_cur, error); + sc->sa.bno_cur = NULL; + xfs_btree_del_cursor(mcur, error); + + return error; +} + +/* Section (II): Reserving space for new rmapbt and setting free space bitmap */ + +struct xrep_rmap_agfl { + struct xagb_bitmap *bitmap; + xfs_agnumber_t agno; +}; + +/* Add an AGFL block to the rmap list. */ +STATIC int +xrep_rmap_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xrep_rmap_agfl *ra = priv; + + return xagb_bitmap_set(ra->bitmap, agbno, 1); +} + +/* + * Run one round of reserving space for the new rmapbt and recomputing the + * number of blocks needed to store the previously observed rmapbt records and + * the ones we'll create for the free space metadata. When we don't need more + * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to + * true. + */ +STATIC int +xrep_rmap_try_reserve( + struct xrep_rmap *rr, + struct xfs_btree_cur *rmap_cur, + struct xagb_bitmap *freesp_blocks, + uint64_t *blocks_reserved, + bool *done) +{ + struct xrep_rmap_agfl ra = { + .bitmap = freesp_blocks, + .agno = rr->sc->sa.pag->pag_agno, + }; + struct xfs_scrub *sc = rr->sc; + struct xrep_newbt_resv *resv, *n; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_buf *agfl_bp; + uint64_t nr_blocks; /* RMB */ + uint64_t freesp_records; + int error; + + /* + * We're going to recompute new_btree.bload.nr_blocks at the end of + * this function to reflect however many btree blocks we need to store + * all the rmap records (including the ones that reflect the changes we + * made to support the new rmapbt blocks), so we save the old value + * here so we can decide if we've reserved enough blocks. + */ + nr_blocks = rr->new_btree.bload.nr_blocks; + + /* + * Make sure we've reserved enough space for the new btree. This can + * change the shape of the free space btrees, which can cause secondary + * interactions with the rmap records because all three space btrees + * have the same rmap owner. We'll account for all that below. + */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + nr_blocks - *blocks_reserved); + if (error) + return error; + + *blocks_reserved = rr->new_btree.bload.nr_blocks; + + /* Clear everything in the bitmap. */ + xagb_bitmap_destroy(freesp_blocks); + + /* Set all the bnobt blocks in the bitmap. */ + sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur); + xfs_btree_del_cursor(sc->sa.bno_cur, error); + sc->sa.bno_cur = NULL; + if (error) + return error; + + /* Set all the cntbt blocks in the bitmap. */ + sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur); + xfs_btree_del_cursor(sc->sa.cnt_cur, error); + sc->sa.cnt_cur = NULL; + if (error) + return error; + + /* Record our new btreeblks value. */ + rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2; + + /* Set all the new rmapbt blocks in the bitmap. */ + list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) { + error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len); + if (error) + return error; + } + + /* Set all the AGFL blocks in the bitmap. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + return error; + + error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra); + if (error) + return error; + + /* Count the extents in the bitmap. */ + freesp_records = xagb_bitmap_count_set_regions(freesp_blocks); + + /* Compute how many blocks we'll need for all the rmaps. */ + error = xfs_btree_bload_compute_geometry(rmap_cur, + &rr->new_btree.bload, rr->nr_records + freesp_records); + if (error) + return error; + + /* We're done when we don't need more blocks. */ + *done = nr_blocks >= rr->new_btree.bload.nr_blocks; + return 0; +} + +/* + * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for + * the free space metadata. This implements section (II) above. + */ +STATIC int +xrep_rmap_reserve_space( + struct xrep_rmap *rr, + struct xfs_btree_cur *rmap_cur) +{ + struct xagb_bitmap freesp_blocks; /* AGBIT */ + uint64_t blocks_reserved = 0; + bool done = false; + int error; + + /* Compute how many blocks we'll need for the rmaps collected so far. */ + error = xfs_btree_bload_compute_geometry(rmap_cur, + &rr->new_btree.bload, rr->nr_records); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(rr->sc, &error)) + return error; + + xagb_bitmap_init(&freesp_blocks); + + /* + * Iteratively reserve space for the new rmapbt and recompute the + * number of blocks needed to store the previously observed rmapbt + * records and the ones we'll create for the free space metadata. + * Finish when we don't need more blocks. + */ + do { + error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks, + &blocks_reserved, &done); + if (error) + goto out_bitmap; + } while (!done); + + /* Emit rmaps for everything in the free space bitmap. */ + xrep_ag_btcur_init(rr->sc, &rr->sc->sa); + error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG); + xchk_ag_btcur_free(&rr->sc->sa); + +out_bitmap: + xagb_bitmap_destroy(&freesp_blocks); + return error; +} + +/* Section (III): Building the new rmap btree. */ + +/* Update the AGF counters. */ +STATIC int +xrep_rmap_reset_counters( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + xfs_agblock_t rmap_btblocks; + + /* + * The AGF header contains extra information related to the reverse + * mapping btree, so we must update those fields here. + */ + rmap_btblocks = rr->new_btree.afake.af_blocks - 1; + agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks); + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS); + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the rmapbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_rmap_level = pag->pagf_rmap_level; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* Retrieve rmapbt data for bulk load. */ +STATIC int +xrep_rmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xrep_rmap *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + int stat = 0; + + error = xfs_btree_increment(rr->mcur, 0, &stat); + if (error) + return error; + if (!stat) + return -EFSCORRUPTED; + + error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat); + if (error) + return error; + if (!stat) + return -EFSCORRUPTED; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_rmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_rmap *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Custom allocation function for new rmap btrees. */ +STATIC int +xrep_rmap_alloc_vextent( + struct xfs_scrub *sc, + struct xfs_alloc_arg *args, + xfs_fsblock_t alloc_hint) +{ + int error; + + /* + * We don't want an rmap update on the allocation, since we iteratively + * compute the OWN_AG records /after/ allocating blocks for the records + * that we already know we need to store. Therefore, fix the freelist + * with the NORMAP flag set so that we don't also try to create an rmap + * for new AGFL blocks. + */ + error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP); + if (error) + return error; + + /* + * If xrep_fix_freelist fixed the freelist by moving blocks from the + * free space btrees or by removing blocks from the AGFL and queueing + * an EFI to free the block, the transaction will be dirty. This + * second case is of interest to us. + * + * Later on, we will need to compare gaps in the new recordset against + * the block usage of all OWN_AG owners in order to free the old + * btree's blocks, which means that we can't have EFIs for former AGFL + * blocks attached to the repair transaction when we commit the new + * btree. + * + * xrep_newbt_alloc_blocks guarantees this for us by calling + * xrep_defer_finish to commit anything that fix_freelist may have + * added to the transaction. + */ + return xfs_alloc_vextent_near_bno(args, alloc_hint); +} + + +/* Count the records in this btree. */ +STATIC int +xrep_rmap_count_records( + struct xfs_btree_cur *cur, + unsigned long long *nr) +{ + int running = 1; + int error; + + *nr = 0; + + error = xfs_btree_goto_left_edge(cur); + if (error) + return error; + + while (running && !(error = xfs_btree_increment(cur, 0, &running))) { + if (running) + (*nr)++; + } + + return error; +} +/* + * Use the collected rmap information to stage a new rmap btree. If this is + * successful we'll return with the new btree root information logged to the + * repair transaction but not yet committed. This implements section (III) + * above. + */ +STATIC int +xrep_rmap_build_new_tree( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_btree_cur *rmap_cur; + xfs_fsblock_t fsbno; + int error; + + /* + * Preserve the old rmapbt block count so that we can adjust the + * per-AG rmapbt reservation after we commit the new btree root and + * want to dispose of the old btree blocks. + */ + rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks); + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. The new blocks are accounted to the + * rmapbt per-AG reservation, which we will adjust further after + * committing the new btree. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp)); + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE, + fsbno, XFS_AG_RESV_RMAPBT); + rr->new_btree.bload.get_records = xrep_rmap_get_records; + rr->new_btree.bload.claim_block = xrep_rmap_claim_block; + rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent; + rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag); + xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake); + + /* + * Initialize @rr->new_btree, reserve space for the new rmapbt, + * and compute OWN_AG rmaps. + */ + error = xrep_rmap_reserve_space(rr, rmap_cur); + if (error) + goto err_cur; + + /* + * Count the rmapbt records again, because the space reservation + * for the rmapbt itself probably added more records to the btree. + */ + rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, + &rr->rmap_btree); + + error = xrep_rmap_count_records(rr->mcur, &rr->nr_records); + if (error) + goto err_mcur; + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the incore btree height so + * that we don't trip the verifiers when writing the new btree blocks + * to disk. + */ + pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height; + + /* + * Move the cursor to the left edge of the tree so that the first + * increment in ->get_records positions us at the first record. + */ + error = xfs_btree_goto_left_edge(rr->mcur); + if (error) + goto err_level; + + /* Add all observed rmap records. */ + error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr); + if (error) + goto err_level; + + /* + * Install the new btree in the AG header. After this point the old + * btree is no longer accessible and the new tree is live. + */ + xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(rmap_cur, 0); + xfs_btree_del_cursor(rr->mcur, 0); + rr->mcur = NULL; + + /* + * Now that we've written the new btree to disk, we don't need to keep + * updating the in-memory btree. Abort the scan to stop live updates. + */ + xchk_iscan_abort(&rr->iscan); + + /* + * The newly committed rmap recordset includes mappings for the blocks + * that we reserved to build the new btree. If there is excess space + * reservation to be freed, the corresponding rmap records must also be + * removed. + */ + rr->new_btree.oinfo = XFS_RMAP_OINFO_AG; + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_rmap_reset_counters(rr); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_level: + pag->pagf_repair_rmap_level = 0; +err_mcur: + xfs_btree_del_cursor(rr->mcur, error); +err_cur: + xfs_btree_del_cursor(rmap_cur, error); +err_newbt: + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* Section (IV): Reaping the old btree. */ + +struct xrep_rmap_find_gaps { + struct xagb_bitmap rmap_gaps; + xfs_agblock_t next_agbno; +}; + +/* Subtract each free extent in the bnobt from the rmap gaps. */ +STATIC int +xrep_rmap_find_freesp( + struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *rec, + void *priv) +{ + struct xrep_rmap_find_gaps *rfg = priv; + + return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock, + rec->ar_blockcount); +} + +/* Record the free space we find, as part of cleaning out the btree. */ +STATIC int +xrep_rmap_find_gaps( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rmap_find_gaps *rfg = priv; + int error; + + if (rec->rm_startblock > rfg->next_agbno) { + error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno, + rec->rm_startblock - rfg->next_agbno); + if (error) + return error; + } + + rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* + * Reap the old rmapbt blocks. Now that the rmapbt is fully rebuilt, we make + * a list of gaps in the rmap records and a list of the extents mentioned in + * the bnobt. Any block that's in the new rmapbt gap list but not mentioned + * in the bnobt is a block from the old rmapbt and can be removed. + */ +STATIC int +xrep_rmap_remove_old_tree( + struct xrep_rmap *rr) +{ + struct xrep_rmap_find_gaps rfg = { + .next_agbno = 0, + }; + struct xfs_scrub *sc = rr->sc; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_btree_cur *mcur; + xfs_agblock_t agend; + int error; + + xagb_bitmap_init(&rfg.rmap_gaps); + + /* Compute free space from the new rmapbt. */ + mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree); + + error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_bitmap; + + /* Insert a record for space between the last rmap and EOAG. */ + agend = be32_to_cpu(agf->agf_length); + if (rfg.next_agbno < agend) { + error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno, + agend - rfg.next_agbno); + if (error) + goto out_bitmap; + } + + /* Compute free space from the existing bnobt. */ + sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp, + &rfg); + xfs_btree_del_cursor(sc->sa.bno_cur, error); + sc->sa.bno_cur = NULL; + if (error) + goto out_bitmap; + + /* + * Free the "free" blocks that the new rmapbt knows about but the bnobt + * doesn't--these are the old rmapbt blocks. Credit the old rmapbt + * block usage count back to the per-AG rmapbt reservation (and not + * fdblocks, since the rmap btree lives in free space) to keep the + * reservation and free space accounting correct. + */ + error = xrep_reap_agblocks(sc, &rfg.rmap_gaps, + &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT); + if (error) + goto out_bitmap; + + /* + * Now that we've zapped all the old rmapbt blocks we can turn off + * the alternate height mechanism and reset the per-AG space + * reservation. + */ + pag->pagf_repair_rmap_level = 0; + sc->flags |= XREP_RESET_PERAG_RESV; +out_bitmap: + xagb_bitmap_destroy(&rfg.rmap_gaps); + return error; +} + +static inline bool +xrep_rmapbt_want_live_update( + struct xchk_iscan *iscan, + const struct xfs_owner_info *oi) +{ + if (xchk_iscan_aborted(iscan)) + return false; + + /* + * Before unlocking the AG header to perform the inode scan, we + * recorded reverse mappings for all AG metadata except for the OWN_AG + * metadata. IOWs, the in-memory btree knows about the AG headers, the + * two inode btrees, the CoW staging extents, and the refcount btrees. + * For these types of metadata, we need to record the live updates in + * the in-memory rmap btree. + * + * However, we do not scan the free space btrees or the AGFL until we + * have re-locked the AGF and are ready to reserve space for the new + * rmap btree, so we do not want live updates for OWN_AG metadata. + */ + if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner)) + return oi->oi_owner != XFS_RMAP_OWN_AG; + + /* Ignore updates to files that the scanner hasn't visited yet. */ + return xchk_iscan_want_live_update(iscan, oi->oi_owner); +} + +/* + * Apply a rmapbt update from the regular filesystem into our shadow btree. + * We're running from the thread that owns the AGF buffer and is generating + * the update, so we must be careful about which parts of the struct xrep_rmap + * that we change. + */ +static int +xrep_rmapbt_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_rmap_update_params *p = data; + struct xrep_rmap *rr; + struct xfs_mount *mp; + struct xfs_btree_cur *mcur; + struct xfs_trans *tp; + void *txcookie; + int error; + + rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb); + mp = rr->sc->mp; + + if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo)) + goto out_unlock; + + trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p); + + error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); + if (error) + goto out_abort; + + mutex_lock(&rr->lock); + mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree); + error = __xfs_rmap_finish_intent(mcur, action, p->startblock, + p->blockcount, &p->oinfo, p->unwritten); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_cancel; + + error = xfbtree_trans_commit(&rr->rmap_btree, tp); + if (error) + goto out_cancel; + + xrep_trans_cancel_hook_dummy(&txcookie, tp); + mutex_unlock(&rr->lock); + return NOTIFY_DONE; + +out_cancel: + xfbtree_trans_cancel(&rr->rmap_btree, tp); + xrep_trans_cancel_hook_dummy(&txcookie, tp); +out_abort: + mutex_unlock(&rr->lock); + xchk_iscan_abort(&rr->iscan); +out_unlock: + return NOTIFY_DONE; +} + +/* Set up the filesystem scan components. */ +STATIC int +xrep_rmap_setup_scan( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + int error; + + mutex_init(&rr->lock); + + /* Set up in-memory rmap btree */ + error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp, + sc->sa.pag->pag_agno); + if (error) + goto out_mutex; + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &rr->iscan); + + /* + * Hook into live rmap operations so that we can update our in-memory + * btree to reflect live changes on the filesystem. Since we drop the + * AGF buffer to scan all the inodes, we need this piece to avoid + * installing a stale btree. + */ + ASSERT(sc->flags & XCHK_FSGATES_RMAP); + xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update); + error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook); + if (error) + goto out_iscan; + return 0; + +out_iscan: + xchk_iscan_teardown(&rr->iscan); + xfbtree_destroy(&rr->rmap_btree); +out_mutex: + mutex_destroy(&rr->lock); + return error; +} + +/* Tear down scan components. */ +STATIC void +xrep_rmap_teardown( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + + xchk_iscan_abort(&rr->iscan); + xfs_rmap_hook_del(sc->sa.pag, &rr->rhook); + xchk_iscan_teardown(&rr->iscan); + xfbtree_destroy(&rr->rmap_btree); + mutex_destroy(&rr->lock); +} + +/* Repair the rmap btree for some AG. */ +int +xrep_rmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rmap *rr = sc->buf; + int error; + + error = xrep_rmap_setup_scan(rr); + if (error) + return error; + + /* + * Collect rmaps for everything in this AG that isn't space metadata. + * These rmaps won't change even as we try to allocate blocks. + */ + error = xrep_rmap_find_rmaps(rr); + if (error) + goto out_records; + + /* Rebuild the rmap information. */ + error = xrep_rmap_build_new_tree(rr); + if (error) + goto out_records; + + /* Kill the old tree. */ + error = xrep_rmap_remove_old_tree(rr); + if (error) + goto out_records; + +out_records: + xrep_rmap_teardown(rr); + return error; +} diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 0f574a1d2cb1..46583517377f 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -14,18 +14,34 @@ #include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bit.h" #include "xfs_sb.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/rtbitmap.h" /* Set us up with the realtime metadata locked. */ int xchk_setup_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb; int error; - error = xchk_trans_alloc(sc, 0); + rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS); + if (!rtb) + return -ENOMEM; + sc->buf = rtb; + + if (xchk_could_repair(sc)) { + error = xrep_setup_rtbitmap(sc, rtb); + if (error) + return error; + } + + error = xchk_trans_alloc(sc, rtb->resblks); if (error) return error; @@ -33,7 +49,22 @@ xchk_setup_rtbitmap( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + + /* + * Now that we've locked the rtbitmap, we can't race with growfsrt + * trying to expand the bitmap or change the size of the rt volume. + * Hence it is safe to compute and check the geometry values. + */ + if (mp->m_sb.sb_rblocks) { + rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rtb->rextslog = xfs_compute_rextslog(rtb->rextents); + rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents); + } return 0; } @@ -49,12 +80,12 @@ xchk_rtbitmap_rec( { struct xfs_scrub *sc = priv; xfs_rtblock_t startblock; - xfs_rtblock_t blockcount; + xfs_filblks_t blockcount; - startblock = rec->ar_startext * mp->m_sb.sb_rextsize; - blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; + startblock = xfs_rtx_to_rtb(mp, rec->ar_startext); + blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount); - if (!xfs_verify_rtext(mp, startblock, blockcount)) + if (!xfs_verify_rtbext(mp, startblock, blockcount)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } @@ -64,21 +95,30 @@ STATIC int xchk_rtbitmap_check_extents( struct xfs_scrub *sc) { - struct xfs_mount *mp = sc->mp; struct xfs_bmbt_irec map; - xfs_rtblock_t off; - int nmap; + struct xfs_iext_cursor icur; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; int error = 0; - for (off = 0; off < mp->m_sb.sb_rbmblocks;) { + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } + + while (off < endoff) { + int nmap = 1; + if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) break; /* Make sure we have a written extent. */ - nmap = 1; - error = xfs_bmapi_read(mp->m_rbmip, off, - mp->m_sb.sb_rbmblocks - off, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) break; @@ -99,12 +139,48 @@ int xchk_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb = sc->buf; int error; - /* Is the size of the rtbitmap correct? */ - if (sc->mp->m_rbmip->i_disk_size != - XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) { - xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino); + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rtb->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* Is sb_rextslog correct? */ + if (mp->m_sb.sb_rextslog != rtb->rextslog) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is sb_rbmblocks large enough to handle the current rt volume? In no + * case can we exceed 4bn bitmap blocks since the super field is a u32. + */ + if (rtb->rbmblocks > U32_MAX) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* The bitmap file length must be aligned to an fsblock. */ + if (mp->m_rbmip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is the bitmap file itself large enough to handle the rt volume? + * growfsrt expands the bitmap file before updating sb_rextents, so the + * file can be larger than sb_rbmblocks. + */ + if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); return 0; } @@ -117,38 +193,33 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc); + error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; -out: - return error; + return 0; } /* xref check that the extent is not free in the rtbitmap */ void xchk_xref_is_used_rt_space( struct xfs_scrub *sc, - xfs_rtblock_t fsbno, + xfs_rtblock_t rtbno, xfs_extlen_t len) { - xfs_rtblock_t startext; - xfs_rtblock_t endext; - xfs_rtblock_t extcount; + xfs_rtxnum_t startext; + xfs_rtxnum_t endext; bool is_free; int error; if (xchk_skip_xref(sc->sm)) return; - startext = fsbno; - endext = fsbno + len - 1; - do_div(startext, sc->mp->m_sb.sb_rextsize); - do_div(endext, sc->mp->m_sb.sb_rextsize); - extcount = endext - startext + 1; + startext = xfs_rtb_to_rtx(sc->mp, rtbno); + endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1); xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, - &is_free); + error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, + endext - startext + 1, &is_free); if (!xchk_should_check_xref(sc, &error, NULL)) goto out_unlock; if (is_free) diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h new file mode 100644 index 000000000000..85304ff019e1 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RTBITMAP_H__ +#define __XFS_SCRUB_RTBITMAP_H__ + +struct xchk_rtbitmap { + uint64_t rextents; + uint64_t rbmblocks; + unsigned int rextslog; + unsigned int resblks; +}; + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb); +#else +# define xrep_setup_rtbitmap(sc, rtb) (0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_RTBITMAP_H__ */ diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c new file mode 100644 index 000000000000..0fef98e9f834 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/rtbitmap.h" + +/* Set up to repair the realtime bitmap file metadata. */ +int +xrep_setup_rtbitmap( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + + /* + * Reserve enough blocks to write out a completely new bmbt for a + * maximally fragmented bitmap file. We do not hold the rtbitmap + * ILOCK yet, so this is entirely speculative. + */ + blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + rtb->resblks += blocks; + return 0; +} + +/* + * Make sure that the given range of the data fork of the realtime file is + * mapped to written blocks. The caller must ensure that the inode is joined + * to the transaction. + */ +STATIC int +xrep_rtbitmap_data_mappings( + struct xfs_scrub *sc, + xfs_filblks_t len) +{ + struct xfs_bmbt_irec map; + xfs_fileoff_t off = 0; + int error; + + ASSERT(sc->ip != NULL); + + while (off < len) { + int nmaps = 1; + + /* + * If we have a real extent mapping this block then we're + * in ok shape. + */ + error = xfs_bmapi_read(sc->ip, off, len - off, &map, &nmaps, + XFS_DATA_FORK); + if (error) + return error; + if (nmaps == 0) { + ASSERT(nmaps != 0); + return -EFSCORRUPTED; + } + + /* + * Written extents are ok. Holes are not filled because we + * do not know the freespace information. + */ + if (xfs_bmap_is_written_extent(&map) || + map.br_startblock == HOLESTARTBLOCK) { + off = map.br_startoff + map.br_blockcount; + continue; + } + + /* + * If we find a delalloc reservation then something is very + * very wrong. Bail out. + */ + if (map.br_startblock == DELAYSTARTBLOCK) + return -EFSCORRUPTED; + + /* Make sure we're really converting an unwritten extent. */ + if (map.br_state != XFS_EXT_UNWRITTEN) { + ASSERT(map.br_state == XFS_EXT_UNWRITTEN); + return -EFSCORRUPTED; + } + + /* Make sure this block has a real zeroed extent mapped. */ + nmaps = 1; + error = xfs_bmapi_write(sc->tp, sc->ip, map.br_startoff, + map.br_blockcount, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, + 0, &map, &nmaps); + if (error) + return error; + + /* Commit new extent and all deferred work. */ + error = xrep_defer_finish(sc); + if (error) + return error; + + off = map.br_startoff + map.br_blockcount; + } + + return 0; +} + +/* Fix broken rt volume geometry. */ +STATIC int +xrep_rtbitmap_geometry( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + + /* Superblock fields */ + if (mp->m_sb.sb_rextents != rtb->rextents) + xfs_trans_mod_sb(sc->tp, XFS_TRANS_SB_REXTENTS, + rtb->rextents - mp->m_sb.sb_rextents); + + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + rtb->rbmblocks - mp->m_sb.sb_rbmblocks); + + if (mp->m_sb.sb_rextslog != rtb->rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + rtb->rextslog - mp->m_sb.sb_rextslog); + + /* Fix broken isize */ + sc->ip->i_disk_size = roundup_64(sc->ip->i_disk_size, + mp->m_sb.sb_blocksize); + + if (sc->ip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) + sc->ip->i_disk_size = XFS_FSB_TO_B(mp, rtb->rbmblocks); + + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return xrep_roll_trans(sc); +} + +/* Repair the realtime bitmap file metadata. */ +int +xrep_rtbitmap( + struct xfs_scrub *sc) +{ + struct xchk_rtbitmap *rtb = sc->buf; + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + int error; + + /* Impossibly large rtbitmap means we can't touch the filesystem. */ + if (rtb->rbmblocks > U32_MAX) + return 0; + + /* + * If the size of the rt bitmap file is larger than what we reserved, + * figure out if we need to adjust the block reservation in the + * transaction. + */ + blocks = xfs_bmbt_calc_size(mp, rtb->rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + if (blocks > rtb->resblks) { + error = xfs_trans_reserve_more(sc->tp, blocks, 0); + if (error) + return error; + + rtb->resblks += blocks; + } + + /* Fix inode core and forks. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Ensure no unwritten extents. */ + error = xrep_rtbitmap_data_mappings(sc, rtb->rbmblocks); + if (error) + return error; + + /* Fix inconsistent bitmap geometry */ + return xrep_rtbitmap_geometry(sc, rtb); +} diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 7676718dac72..7c7366c98338 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -17,10 +17,14 @@ #include "xfs_bit.h" #include "xfs_bmap.h" #include "xfs_sb.h" +#include "xfs_exchmaps.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/xfile.h" +#include "scrub/repair.h" +#include "scrub/tempexch.h" +#include "scrub/rtsummary.h" /* * Realtime Summary @@ -39,31 +43,44 @@ xchk_setup_rtsummary( { struct xfs_mount *mp = sc->mp; char *descr; + struct xchk_rtsummary *rts; int error; + rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize), + XCHK_GFP_FLAGS); + if (!rts) + return -ENOMEM; + sc->buf = rts; + + if (xchk_could_repair(sc)) { + error = xrep_setup_rtsummary(sc, rts); + if (error) + return error; + } + /* * Create an xfile to construct a new rtsummary file. The xfile allows * us to avoid pinning kernel memory for this purpose. */ descr = xchk_xfile_descr(sc, "realtime summary file"); - error = xfile_create(descr, mp->m_rsumsize, &sc->xfile); + error = xfile_create(descr, XFS_FSB_TO_B(mp, mp->m_rsumblocks), + &sc->xfile); kfree(descr); if (error) return error; - error = xchk_trans_alloc(sc, 0); + error = xchk_trans_alloc(sc, rts->resblks); if (error) return error; - /* Allocate a memory buffer for the summary comparison. */ - sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS); - if (!sc->buf) - return -ENOMEM; - error = xchk_install_live_inode(sc, mp->m_rsumip); if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + /* * Locking order requires us to take the rtbitmap first. We must be * careful to unlock it ourselves when we are done with the rtbitmap @@ -72,44 +89,69 @@ xchk_setup_rtsummary( */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + + /* + * Now that we've locked the rtbitmap and rtsummary, we can't race with + * growfsrt trying to expand the summary or change the size of the rt + * volume. Hence it is safe to compute and check the geometry values. + */ + if (mp->m_sb.sb_rblocks) { + int rextslog; + + rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rextslog = xfs_compute_rextslog(rts->rextents); + rts->rsumlevels = rextslog + 1; + rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents); + rts->rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels, + rts->rbmblocks); + } return 0; } /* Helper functions to record suminfo words in an xfile. */ -typedef unsigned int xchk_rtsumoff_t; - static inline int xfsum_load( struct xfs_scrub *sc, - xchk_rtsumoff_t sumoff, - xfs_suminfo_t *info) + xfs_rtsumoff_t sumoff, + union xfs_suminfo_raw *rawinfo) { - return xfile_obj_load(sc->xfile, info, sizeof(xfs_suminfo_t), + return xfile_load(sc->xfile, rawinfo, + sizeof(union xfs_suminfo_raw), sumoff << XFS_WORDLOG); } static inline int xfsum_store( struct xfs_scrub *sc, - xchk_rtsumoff_t sumoff, - const xfs_suminfo_t info) + xfs_rtsumoff_t sumoff, + const union xfs_suminfo_raw rawinfo) { - return xfile_obj_store(sc->xfile, &info, sizeof(xfs_suminfo_t), + return xfile_store(sc->xfile, &rawinfo, + sizeof(union xfs_suminfo_raw), sumoff << XFS_WORDLOG); } -static inline int +inline int xfsum_copyout( struct xfs_scrub *sc, - xchk_rtsumoff_t sumoff, - xfs_suminfo_t *info, + xfs_rtsumoff_t sumoff, + union xfs_suminfo_raw *rawinfo, unsigned int nr_words) { - return xfile_obj_load(sc->xfile, info, nr_words << XFS_WORDLOG, + return xfile_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG, sumoff << XFS_WORDLOG); } +static inline xfs_suminfo_t +xchk_rtsum_inc( + struct xfs_mount *mp, + union xfs_suminfo_raw *v) +{ + v->old += 1; + return v->old; +} + /* Update the summary file to reflect the free extent that we've accumulated. */ STATIC int xchk_rtsum_record_free( @@ -122,23 +164,24 @@ xchk_rtsum_record_free( xfs_fileoff_t rbmoff; xfs_rtblock_t rtbno; xfs_filblks_t rtlen; - xchk_rtsumoff_t offs; + xfs_rtsumoff_t offs; unsigned int lenlog; - xfs_suminfo_t v = 0; + union xfs_suminfo_raw v; + xfs_suminfo_t value; int error = 0; if (xchk_should_terminate(sc, &error)) return error; /* Compute the relevant location in the rtsum file. */ - rbmoff = XFS_BITTOBLOCK(mp, rec->ar_startext); - lenlog = XFS_RTBLOCKLOG(rec->ar_extcount); - offs = XFS_SUMOFFS(mp, lenlog, rbmoff); + rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext); + lenlog = xfs_highbit64(rec->ar_extcount); + offs = xfs_rtsumoffs(mp, lenlog, rbmoff); - rtbno = rec->ar_startext * mp->m_sb.sb_rextsize; - rtlen = rec->ar_extcount * mp->m_sb.sb_rextsize; + rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); + rtlen = xfs_rtx_to_rtb(mp, rec->ar_extcount); - if (!xfs_verify_rtext(mp, rtbno, rtlen)) { + if (!xfs_verify_rtbext(mp, rtbno, rtlen)) { xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); return -EFSCORRUPTED; } @@ -148,9 +191,9 @@ xchk_rtsum_record_free( if (error) return error; - v++; + value = xchk_rtsum_inc(sc->mp, &v); trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount, - lenlog, offs, v); + lenlog, offs, value); return xfsum_store(sc, offs, v); } @@ -161,12 +204,11 @@ xchk_rtsum_compute( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; - unsigned long long rtbmp_bytes; + unsigned long long rtbmp_blocks; /* If the bitmap size doesn't match the computed size, bail. */ - rtbmp_bytes = howmany_64(mp->m_sb.sb_rextents, NBBY); - if (roundup_64(rtbmp_bytes, mp->m_sb.sb_blocksize) != - mp->m_rbmip->i_disk_size) + rtbmp_blocks = xfs_rtbitmap_blockcount(mp, mp->m_sb.sb_rextents); + if (XFS_FSB_TO_B(mp, rtbmp_blocks) != mp->m_rbmip->i_disk_size) return -EFSCORRUPTED; return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free, @@ -178,15 +220,29 @@ STATIC int xchk_rtsum_compare( struct xfs_scrub *sc) { - struct xfs_mount *mp = sc->mp; - struct xfs_buf *bp; struct xfs_bmbt_irec map; - xfs_fileoff_t off; - xchk_rtsumoff_t sumoff = 0; - int nmap; + struct xfs_iext_cursor icur; + + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct xchk_rtsummary *rts = sc->buf; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; + xfs_rtsumoff_t sumoff = 0; + int error = 0; + + rts->args.mp = sc->mp; + rts->args.tp = sc->tp; + + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } - for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) { - int error = 0; + while (off < endoff) { + int nmap = 1; if (xchk_should_terminate(sc, &error)) return error; @@ -194,8 +250,7 @@ xchk_rtsum_compare( return 0; /* Make sure we have a written extent. */ - nmap = 1; - error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; @@ -205,23 +260,33 @@ xchk_rtsum_compare( return 0; } + off += map.br_blockcount; + } + + for (off = 0; off < endoff; off++) { + union xfs_suminfo_raw *ondisk_info; + /* Read a block's worth of ondisk rtsummary file. */ - error = xfs_rtbuf_get(mp, sc->tp, off, 1, &bp); + error = xfs_rtsummary_read_buf(&rts->args, off); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; /* Read a block's worth of computed rtsummary file. */ - error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize); + error = xfsum_copyout(sc, sumoff, rts->words, mp->m_blockwsize); if (error) { - xfs_trans_brelse(sc->tp, bp); + xfs_rtbuf_cache_relse(&rts->args); return error; } - if (memcmp(bp->b_addr, sc->buf, - mp->m_blockwsize << XFS_WORDLOG) != 0) + ondisk_info = xfs_rsumblock_infoptr(&rts->args, 0); + if (memcmp(ondisk_info, rts->words, + mp->m_blockwsize << XFS_WORDLOG) != 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + xfs_rtbuf_cache_relse(&rts->args); + return error; + } - xfs_trans_brelse(sc->tp, bp); + xfs_rtbuf_cache_relse(&rts->args); sumoff += mp->m_blockwsize; } @@ -234,8 +299,43 @@ xchk_rtsummary( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; + struct xchk_rtsummary *rts = sc->buf; int error = 0; + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rts->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + goto out_rbm; + } + + /* Is m_rsumlevels correct? */ + if (mp->m_rsumlevels != rts->rsumlevels) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* Is m_rsumsize correct? */ + if (mp->m_rsumblocks != rts->rsumblocks) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* The summary file length must be aligned to an fsblock. */ + if (mp->m_rsumip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* + * Is the summary file itself large enough to handle the rt volume? + * growfsrt expands the summary file before updating sb_rextents, so + * the file can be larger than rsumsize. + */ + if (mp->m_rsumip->i_disk_size < XFS_FSB_TO_B(mp, rts->rsumblocks)) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + /* Invoke the fork scrubber. */ error = xchk_metadata_inode_forks(sc); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) @@ -259,7 +359,12 @@ xchk_rtsummary( error = xchk_rtsum_compare(sc); out_rbm: - /* Unlock the rtbitmap since we're done with it. */ + /* + * Unlock the rtbitmap since we're done with it. All other writers of + * the rt free space metadata grab the bitmap and summary ILOCKs in + * that order, so we're still protected against allocation activities + * even if we continue on to the repair function. + */ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); return error; } diff --git a/fs/xfs/scrub/rtsummary.h b/fs/xfs/scrub/rtsummary.h new file mode 100644 index 000000000000..e44b04cb6e2d --- /dev/null +++ b/fs/xfs/scrub/rtsummary.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RTSUMMARY_H__ +#define __XFS_SCRUB_RTSUMMARY_H__ + +struct xchk_rtsummary { +#ifdef CONFIG_XFS_ONLINE_REPAIR + struct xrep_tempexch tempexch; +#endif + struct xfs_rtalloc_args args; + + uint64_t rextents; + uint64_t rbmblocks; + xfs_filblks_t rsumblocks; + unsigned int rsumlevels; + unsigned int resblks; + + /* suminfo position of xfile as we write buffers to disk. */ + xfs_rtsumoff_t prep_wordoff; + + /* Memory buffer for the summary comparison. */ + union xfs_suminfo_raw words[]; +}; + +int xfsum_copyout(struct xfs_scrub *sc, xfs_rtsumoff_t sumoff, + union xfs_suminfo_raw *rawinfo, unsigned int nr_words); + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_setup_rtsummary(struct xfs_scrub *sc, struct xchk_rtsummary *rts); +#else +# define xrep_setup_rtsummary(sc, rts) (0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_RTSUMMARY_H__ */ diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c new file mode 100644 index 000000000000..7deeb948cb70 --- /dev/null +++ b/fs/xfs/scrub/rtsummary_repair.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_exchmaps.h" +#include "xfs_rtbitmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/reap.h" +#include "scrub/xfile.h" +#include "scrub/rtsummary.h" + +/* Set us up to repair the rtsummary file. */ +int +xrep_setup_rtsummary( + struct xfs_scrub *sc, + struct xchk_rtsummary *rts) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks; + int error; + + error = xrep_tempfile_create(sc, S_IFREG); + if (error) + return error; + + /* + * If we're doing a repair, we reserve enough blocks to write out a + * completely new summary file, plus twice as many blocks as we would + * need if we can only allocate one block per data fork mapping. This + * should cover the preallocation of the temporary file and exchanging + * the extent mappings. + * + * We cannot use xfs_exchmaps_estimate because we have not yet + * constructed the replacement rtsummary and therefore do not know how + * many extents it will use. By the time we do, we will have a dirty + * transaction (which we cannot drop because we cannot drop the + * rtsummary ILOCK) and cannot ask for more reservation. + */ + blocks = mp->m_rsumblocks; + blocks += xfs_bmbt_calc_size(mp, blocks) * 2; + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + rts->resblks += blocks; + return 0; +} + +static int +xrep_rtsummary_prep_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp, + void *data) +{ + struct xchk_rtsummary *rts = data; + struct xfs_mount *mp = sc->mp; + union xfs_suminfo_raw *ondisk; + int error; + + rts->args.mp = sc->mp; + rts->args.tp = sc->tp; + rts->args.sumbp = bp; + ondisk = xfs_rsumblock_infoptr(&rts->args, 0); + rts->args.sumbp = NULL; + + bp->b_ops = &xfs_rtbuf_ops; + + error = xfsum_copyout(sc, rts->prep_wordoff, ondisk, mp->m_blockwsize); + if (error) + return error; + + rts->prep_wordoff += mp->m_blockwsize; + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTSUMMARY_BUF); + return 0; +} + +/* Repair the realtime summary. */ +int +xrep_rtsummary( + struct xfs_scrub *sc) +{ + struct xchk_rtsummary *rts = sc->buf; + struct xfs_mount *mp = sc->mp; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(mp)) + return -EOPNOTSUPP; + + /* Walk away if we disagree on the size of the rt bitmap. */ + if (rts->rbmblocks != mp->m_sb.sb_rbmblocks) + return 0; + + /* Make sure any problems with the fork are fixed. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + /* + * Try to take ILOCK_EXCL of the temporary file. We had better be the + * only ones holding onto this inode, but we can't block while holding + * the rtsummary file's ILOCK_EXCL. + */ + while (!xrep_tempfile_ilock_nowait(sc)) { + if (xchk_should_terminate(sc, &error)) + return error; + delay(1); + } + + /* Make sure we have space allocated for the entire summary file. */ + xfs_trans_ijoin(sc->tp, sc->ip, 0); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + error = xrep_tempfile_prealloc(sc, 0, rts->rsumblocks); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* Copy the rtsummary file that we generated. */ + error = xrep_tempfile_copyin(sc, 0, rts->rsumblocks, + xrep_rtsummary_prep_buf, rts); + if (error) + return error; + error = xrep_tempfile_set_isize(sc, XFS_FSB_TO_B(mp, rts->rsumblocks)); + if (error) + return error; + + /* + * Now exchange the contents. Nothing in repair uses the temporary + * buffer, so we can reuse it for the tempfile exchrange information. + */ + error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, &rts->tempexch); + if (error) + return error; + + error = xrep_tempexch_contents(sc, &rts->tempexch); + if (error) + return error; + + /* Reset incore state and blow out the summary cache. */ + if (mp->m_rsum_cache) + memset(mp->m_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks); + + mp->m_rsumlevels = rts->rsumlevels; + mp->m_rsumblocks = rts->rsumblocks; + + /* Free the old rtsummary blocks if they're not in use. */ + return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 4849efcaa33a..5c266d2842db 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -14,9 +14,14 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_qm.h" -#include "xfs_errortag.h" -#include "xfs_error.h" #include "xfs_scrub.h" +#include "xfs_buf_mem.h" +#include "xfs_rmap.h" +#include "xfs_exchrange.h" +#include "xfs_exchmaps.h" +#include "xfs_dir2.h" +#include "xfs_parent.h" +#include "xfs_icache.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -24,6 +29,8 @@ #include "scrub/health.h" #include "scrub/stats.h" #include "scrub/xfile.h" +#include "scrub/tempfile.h" +#include "scrub/orphanage.h" /* * Online Scrub and Repair @@ -142,6 +149,18 @@ xchk_probe( if (xchk_should_terminate(sc, &error)) return error; + /* + * If the caller is probing to see if repair works but repair isn't + * built into the kernel, return EOPNOTSUPP because that's the signal + * that userspace expects. If online repair is built in, set the + * CORRUPT flag (without any of the usual tracing/logging) to force us + * into xrep_probe. + */ + if (xchk_could_repair(sc)) { + if (!IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)) + return -EOPNOTSUPP; + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + } return 0; } @@ -159,9 +178,51 @@ xchk_fsgates_disable( if (sc->flags & XCHK_FSGATES_DRAIN) xfs_drain_wait_disable(); + if (sc->flags & XCHK_FSGATES_QUOTA) + xfs_dqtrx_hook_disable(); + + if (sc->flags & XCHK_FSGATES_DIRENTS) + xfs_dir_hook_disable(); + + if (sc->flags & XCHK_FSGATES_RMAP) + xfs_rmap_hook_disable(); + sc->flags &= ~XCHK_FSGATES_ALL; } +/* Free the resources associated with a scrub subtype. */ +void +xchk_scrub_free_subord( + struct xfs_scrub_subord *sub) +{ + struct xfs_scrub *sc = sub->parent_sc; + + ASSERT(sc->ip == sub->sc.ip); + ASSERT(sc->orphanage == sub->sc.orphanage); + ASSERT(sc->tempip == sub->sc.tempip); + + sc->sm->sm_type = sub->old_smtype; + sc->sm->sm_flags = sub->old_smflags | + (sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT); + sc->tp = sub->sc.tp; + + if (sub->sc.buf) { + if (sub->sc.buf_cleanup) + sub->sc.buf_cleanup(sub->sc.buf); + kvfree(sub->sc.buf); + } + if (sub->sc.xmbtp) + xmbuf_free(sub->sc.xmbtp); + if (sub->sc.xfile) + xfile_destroy(sub->sc.xfile); + + sc->ilock_flags = sub->sc.ilock_flags; + sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags; + sc->temp_ilock_flags = sub->sc.temp_ilock_flags; + + kfree(sub); +} + /* Free all the resources and finish the transactions. */ STATIC int xchk_teardown( @@ -186,6 +247,10 @@ xchk_teardown( sc->flags &= ~XCHK_HAVE_FREEZE_PROT; mnt_drop_write_file(sc->file); } + if (sc->xmbtp) { + xmbuf_free(sc->xmbtp); + sc->xmbtp = NULL; + } if (sc->xfile) { xfile_destroy(sc->xfile); sc->xfile = NULL; @@ -198,6 +263,8 @@ xchk_teardown( sc->buf = NULL; } + xrep_tempfile_rele(sc); + xrep_orphanage_rele(sc); xchk_fsgates_disable(sc); return error; } @@ -238,127 +305,154 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_bnobt, - .repair = xrep_notsupported, + .scrub = xchk_allocbt, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_cntbt, - .repair = xrep_notsupported, + .scrub = xchk_allocbt, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, - .scrub = xchk_inobt, - .repair = xrep_notsupported, + .scrub = xchk_iallocbt, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, - .scrub = xchk_finobt, + .scrub = xchk_iallocbt, .has = xfs_has_finobt, - .repair = xrep_notsupported, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ .type = ST_PERAG, .setup = xchk_setup_ag_rmapbt, .scrub = xchk_rmapbt, .has = xfs_has_rmapbt, - .repair = xrep_notsupported, + .repair = xrep_rmapbt, }, [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ .type = ST_PERAG, .setup = xchk_setup_ag_refcountbt, .scrub = xchk_refcountbt, .has = xfs_has_reflink, - .repair = xrep_notsupported, + .repair = xrep_refcountbt, }, [XFS_SCRUB_TYPE_INODE] = { /* inode record */ .type = ST_INODE, .setup = xchk_setup_inode, .scrub = xchk_inode, - .repair = xrep_notsupported, + .repair = xrep_inode, }, [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_data, - .repair = xrep_notsupported, + .repair = xrep_bmap_data, }, [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_attr, - .repair = xrep_notsupported, + .repair = xrep_bmap_attr, }, [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_cow, - .repair = xrep_notsupported, + .repair = xrep_bmap_cow, }, [XFS_SCRUB_TYPE_DIR] = { /* directory */ .type = ST_INODE, .setup = xchk_setup_directory, .scrub = xchk_directory, - .repair = xrep_notsupported, + .repair = xrep_directory, }, [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */ .type = ST_INODE, .setup = xchk_setup_xattr, .scrub = xchk_xattr, - .repair = xrep_notsupported, + .repair = xrep_xattr, }, [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */ .type = ST_INODE, .setup = xchk_setup_symlink, .scrub = xchk_symlink, - .repair = xrep_notsupported, + .repair = xrep_symlink, }, [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */ .type = ST_INODE, .setup = xchk_setup_parent, .scrub = xchk_parent, - .repair = xrep_notsupported, + .repair = xrep_parent, }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ .type = ST_FS, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, - .has = xfs_has_realtime, - .repair = xrep_notsupported, + .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, - .has = xfs_has_realtime, - .repair = xrep_notsupported, + .repair = xrep_rtsummary, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */ .type = ST_FS, .setup = xchk_setup_fscounters, .scrub = xchk_fscounters, - .repair = xrep_notsupported, + .repair = xrep_fscounters, + }, + [XFS_SCRUB_TYPE_QUOTACHECK] = { /* quota counters */ + .type = ST_FS, + .setup = xchk_setup_quotacheck, + .scrub = xchk_quotacheck, + .repair = xrep_quotacheck, + }, + [XFS_SCRUB_TYPE_NLINKS] = { /* inode link counts */ + .type = ST_FS, + .setup = xchk_setup_nlinks, + .scrub = xchk_nlinks, + .repair = xrep_nlinks, + }, + [XFS_SCRUB_TYPE_HEALTHY] = { /* fs healthy; clean all reminders */ + .type = ST_FS, + .setup = xchk_setup_fs, + .scrub = xchk_health_record, + .repair = xrep_notsupported, + }, + [XFS_SCRUB_TYPE_DIRTREE] = { /* directory tree structure */ + .type = ST_INODE, + .setup = xchk_setup_dirtree, + .scrub = xchk_dirtree, + .has = xfs_has_parent, + .repair = xrep_dirtree, }, }; @@ -464,8 +558,38 @@ static inline void xchk_postmortem(struct xfs_scrub *sc) } #endif /* CONFIG_XFS_ONLINE_REPAIR */ +/* + * Create a new scrub context from an existing one, but with a different scrub + * type. + */ +struct xfs_scrub_subord * +xchk_scrub_create_subord( + struct xfs_scrub *sc, + unsigned int subtype) +{ + struct xfs_scrub_subord *sub; + + sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS); + if (!sub) + return ERR_PTR(-ENOMEM); + + sub->old_smtype = sc->sm->sm_type; + sub->old_smflags = sc->sm->sm_flags; + sub->parent_sc = sc; + memcpy(&sub->sc, sc, sizeof(struct xfs_scrub)); + sub->sc.ops = &meta_scrub_ops[subtype]; + sub->sc.sm->sm_type = subtype; + sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + sub->sc.buf = NULL; + sub->sc.buf_cleanup = NULL; + sub->sc.xfile = NULL; + sub->sc.xmbtp = NULL; + + return sub; +} + /* Dispatch metadata scrubbing. */ -int +STATIC int xfs_scrub_metadata( struct file *file, struct xfs_scrub_metadata *sm) @@ -507,6 +631,7 @@ xfs_scrub_metadata( sc->sm = sm; sc->ops = &meta_scrub_ops[sm->sm_type]; sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); + sc->relax = INIT_XCHK_RELAX; retry_op: /* * When repairs are allowed, prevent freezing or readonly remount while @@ -531,7 +656,10 @@ retry_op: /* Scrub for errors. */ check_start = xchk_stats_now(); - error = sc->ops->scrub(sc); + if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL) + error = sc->ops->repair_eval(sc); + else + error = sc->ops->scrub(sc); run.scrub_ns += xchk_stats_elapsed_ns(check_start); if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) goto try_harder; @@ -542,23 +670,12 @@ retry_op: xchk_update_health(sc); - if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && - !(sc->flags & XREP_ALREADY_FIXED)) { - bool needs_fix = xchk_needs_repair(sc->sm); - - /* Userspace asked us to rebuild the structure regardless. */ - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) - needs_fix = true; - - /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) - needs_fix = true; - + if (xchk_could_repair(sc)) { /* * If userspace asked for a repair but it wasn't necessary, * report that back to userspace. */ - if (!needs_fix) { + if (!xrep_will_attempt(sc)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; goto out_nofix; } @@ -618,3 +735,221 @@ try_harder: run.retries++; goto retry_op; } + +/* Scrub one aspect of one piece of metadata. */ +int +xfs_ioc_scrub_metadata( + struct file *file, + void __user *arg) +{ + struct xfs_scrub_metadata scrub; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&scrub, arg, sizeof(scrub))) + return -EFAULT; + + error = xfs_scrub_metadata(file, &scrub); + if (error) + return error; + + if (copy_to_user(arg, &scrub, sizeof(scrub))) + return -EFAULT; + + return 0; +} + +/* Decide if there have been any scrub failures up to this point. */ +static inline int +xfs_scrubv_check_barrier( + struct xfs_mount *mp, + const struct xfs_scrub_vec *vectors, + const struct xfs_scrub_vec *stop_vec) +{ + const struct xfs_scrub_vec *v; + __u32 failmask; + + failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT; + + for (v = vectors; v < stop_vec; v++) { + if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) + continue; + + /* + * Runtime errors count as a previous failure, except the ones + * used to ask userspace to retry. + */ + switch (v->sv_ret) { + case -EBUSY: + case -ENOENT: + case -EUSERS: + case 0: + break; + default: + return -ECANCELED; + } + + /* + * If any of the out-flags on the scrub vector match the mask + * that was set on the barrier vector, that's a previous fail. + */ + if (v->sv_flags & failmask) + return -ECANCELED; + } + + return 0; +} + +/* + * If the caller provided us with a nonzero inode number that isn't the ioctl + * file, try to grab a reference to it to eliminate all further untrusted inode + * lookups. If we can't get the inode, let each scrub function try again. + */ +STATIC struct xfs_inode * +xchk_scrubv_open_by_handle( + struct xfs_mount *mp, + const struct xfs_scrub_vec_head *head) +{ + struct xfs_trans *tp; + struct xfs_inode *ip; + int error; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return NULL; + + error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip); + xfs_trans_cancel(tp); + if (error) + return NULL; + + if (VFS_I(ip)->i_generation != head->svh_gen) { + xfs_irele(ip); + return NULL; + } + + return ip; +} + +/* Vectored scrub implementation to reduce ioctl calls. */ +int +xfs_ioc_scrubv_metadata( + struct file *file, + void __user *arg) +{ + struct xfs_scrub_vec_head head; + struct xfs_scrub_vec_head __user *uhead = arg; + struct xfs_scrub_vec *vectors; + struct xfs_scrub_vec __user *uvectors; + struct xfs_inode *ip_in = XFS_I(file_inode(file)); + struct xfs_mount *mp = ip_in->i_mount; + struct xfs_inode *handle_ip = NULL; + struct xfs_scrub_vec *v; + size_t vec_bytes; + unsigned int i; + int error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&head, uhead, sizeof(head))) + return -EFAULT; + + if (head.svh_reserved) + return -EINVAL; + if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL) + return -EINVAL; + if (head.svh_nr == 0) + return 0; + + vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec)); + if (vec_bytes > PAGE_SIZE) + return -ENOMEM; + + uvectors = u64_to_user_ptr(head.svh_vectors); + vectors = memdup_user(uvectors, vec_bytes); + if (IS_ERR(vectors)) + return PTR_ERR(vectors); + + trace_xchk_scrubv_start(ip_in, &head); + + for (i = 0, v = vectors; i < head.svh_nr; i++, v++) { + if (v->sv_reserved) { + error = -EINVAL; + goto out_free; + } + + if (v->sv_type == XFS_SCRUB_TYPE_BARRIER && + (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) { + error = -EINVAL; + goto out_free; + } + + trace_xchk_scrubv_item(mp, &head, i, v); + } + + /* + * If the caller wants us to do a scrub-by-handle and the file used to + * call the ioctl is not the same file, load the incore inode and pin + * it across all the scrubv actions to avoid repeated UNTRUSTED + * lookups. The reference is not passed to deeper layers of scrub + * because each scrubber gets to decide its own strategy and return + * values for getting an inode. + */ + if (head.svh_ino && head.svh_ino != ip_in->i_ino) + handle_ip = xchk_scrubv_open_by_handle(mp, &head); + + /* Run all the scrubbers. */ + for (i = 0, v = vectors; i < head.svh_nr; i++, v++) { + struct xfs_scrub_metadata sm = { + .sm_type = v->sv_type, + .sm_flags = v->sv_flags, + .sm_ino = head.svh_ino, + .sm_gen = head.svh_gen, + .sm_agno = head.svh_agno, + }; + + if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) { + v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v); + if (v->sv_ret) { + trace_xchk_scrubv_barrier_fail(mp, &head, i, v); + break; + } + + continue; + } + + v->sv_ret = xfs_scrub_metadata(file, &sm); + v->sv_flags = sm.sm_flags; + + trace_xchk_scrubv_outcome(mp, &head, i, v); + + if (head.svh_rest_us) { + ktime_t expires; + + expires = ktime_add_ns(ktime_get(), + head.svh_rest_us * 1000); + set_current_state(TASK_KILLABLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } + + if (fatal_signal_pending(current)) { + error = -EINTR; + goto out_free; + } + } + + if (copy_to_user(uvectors, vectors, vec_bytes) || + copy_to_user(uhead, &head, sizeof(head))) { + error = -EFAULT; + goto out_free; + } + +out_free: + if (handle_ip) + xfs_irele(handle_ip); + kfree(vectors); + return error; +} diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 869a10fe9d7d..5993fcaffb2c 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -8,6 +8,49 @@ struct xfs_scrub; +struct xchk_relax { + unsigned long next_resched; + unsigned int resched_nr; + bool interruptible; +}; + +/* Yield to the scheduler at most 10x per second. */ +#define XCHK_RELAX_NEXT (jiffies + (HZ / 10)) + +#define INIT_XCHK_RELAX \ + (struct xchk_relax){ \ + .next_resched = XCHK_RELAX_NEXT, \ + .resched_nr = 0, \ + .interruptible = true, \ + } + +/* + * Relax during a scrub operation and exit if there's a fatal signal pending. + * + * If preemption is disabled, we need to yield to the scheduler every now and + * then so that we don't run afoul of the soft lockup watchdog or RCU stall + * detector. cond_resched calls are somewhat expensive (~5ns) so we want to + * ratelimit this to 10x per second. Amortize the cost of the other checks by + * only doing it once every 100 calls. + */ +static inline int xchk_maybe_relax(struct xchk_relax *widget) +{ + /* Amortize the cost of scheduling and checking signals. */ + if (likely(++widget->resched_nr < 100)) + return 0; + widget->resched_nr = 0; + + if (unlikely(widget->next_resched <= jiffies)) { + cond_resched(); + widget->next_resched = XCHK_RELAX_NEXT; + } + + if (widget->interruptible && fatal_signal_pending(current)) + return -EINTR; + + return 0; +} + /* * Standard flags for allocating memory within scrub. NOFS context is * configured by the process allocation scope. Scrub and repair must be able @@ -42,6 +85,14 @@ struct xchk_meta_ops { /* Repair or optimize the metadata. */ int (*repair)(struct xfs_scrub *); + /* + * Re-scrub the metadata we repaired, in case there's extra work that + * we need to do to check our repair work. If this is NULL, we'll use + * the ->scrub function pointer, assuming that the regular scrub is + * sufficient. + */ + int (*repair_eval)(struct xfs_scrub *sc); + /* Decide if we even have this piece of metadata. */ bool (*has)(struct xfs_mount *); @@ -98,9 +149,20 @@ struct xfs_scrub { /* xfile used by the scrubbers; freed at teardown. */ struct xfile *xfile; + /* buffer target for in-memory btrees; also freed at teardown. */ + struct xfs_buftarg *xmbtp; + /* Lock flags for @ip. */ uint ilock_flags; + /* The orphanage, for stashing files that have lost their parent. */ + uint orphanage_ilock_flags; + struct xfs_inode *orphanage; + + /* A temporary file on this filesystem, for staging new metadata. */ + struct xfs_inode *tempip; + uint temp_ilock_flags; + /* See the XCHK/XREP state flags below. */ unsigned int flags; @@ -111,6 +173,9 @@ struct xfs_scrub { */ unsigned int sick_mask; + /* next time we want to cond_resched() */ + struct xchk_relax relax; + /* State tracking for single-AG operations. */ struct xchk_ag sa; }; @@ -120,6 +185,10 @@ struct xfs_scrub { #define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ +#define XCHK_FSGATES_QUOTA (1U << 4) /* quota live update enabled */ +#define XCHK_FSGATES_DIRENTS (1U << 5) /* directory live update enabled */ +#define XCHK_FSGATES_RMAP (1U << 6) /* rmapbt live update enabled */ +#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ /* @@ -128,7 +197,44 @@ struct xfs_scrub { * features are gated off via dynamic code patching, which is why the state * must be enabled during scrub setup and can only be torn down afterwards. */ -#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN) +#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN | \ + XCHK_FSGATES_QUOTA | \ + XCHK_FSGATES_DIRENTS | \ + XCHK_FSGATES_RMAP) + +struct xfs_scrub_subord { + struct xfs_scrub sc; + struct xfs_scrub *parent_sc; + unsigned int old_smtype; + unsigned int old_smflags; +}; + +struct xfs_scrub_subord *xchk_scrub_create_subord(struct xfs_scrub *sc, + unsigned int subtype); +void xchk_scrub_free_subord(struct xfs_scrub_subord *sub); + +/* + * We /could/ terminate a scrub/repair operation early. If we're not + * in a good place to continue (fatal signal, etc.) then bail out. + * Note that we're careful not to make any judgements about *error. + */ +static inline bool +xchk_should_terminate( + struct xfs_scrub *sc, + int *error) +{ + if (xchk_maybe_relax(&sc->relax)) { + if (*error == 0) + *error = -EINTR; + return true; + } + return false; +} + +static inline int xchk_nothing(struct xfs_scrub *sc) +{ + return -ENOENT; +} /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); @@ -136,10 +242,8 @@ int xchk_superblock(struct xfs_scrub *sc); int xchk_agf(struct xfs_scrub *sc); int xchk_agfl(struct xfs_scrub *sc); int xchk_agi(struct xfs_scrub *sc); -int xchk_bnobt(struct xfs_scrub *sc); -int xchk_cntbt(struct xfs_scrub *sc); -int xchk_inobt(struct xfs_scrub *sc); -int xchk_finobt(struct xfs_scrub *sc); +int xchk_allocbt(struct xfs_scrub *sc); +int xchk_iallocbt(struct xfs_scrub *sc); int xchk_rmapbt(struct xfs_scrub *sc); int xchk_refcountbt(struct xfs_scrub *sc); int xchk_inode(struct xfs_scrub *sc); @@ -150,31 +254,23 @@ int xchk_directory(struct xfs_scrub *sc); int xchk_xattr(struct xfs_scrub *sc); int xchk_symlink(struct xfs_scrub *sc); int xchk_parent(struct xfs_scrub *sc); +int xchk_dirtree(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xchk_rtbitmap(struct xfs_scrub *sc); int xchk_rtsummary(struct xfs_scrub *sc); #else -static inline int -xchk_rtbitmap(struct xfs_scrub *sc) -{ - return -ENOENT; -} -static inline int -xchk_rtsummary(struct xfs_scrub *sc) -{ - return -ENOENT; -} +# define xchk_rtbitmap xchk_nothing +# define xchk_rtsummary xchk_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_quota(struct xfs_scrub *sc); +int xchk_quotacheck(struct xfs_scrub *sc); #else -static inline int -xchk_quota(struct xfs_scrub *sc) -{ - return -ENOENT; -} +# define xchk_quota xchk_nothing +# define xchk_quotacheck xchk_nothing #endif int xchk_fscounters(struct xfs_scrub *sc); +int xchk_nlinks(struct xfs_scrub *sc); /* cross-referencing helpers */ void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno, diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index 82499270e20b..7996c2335476 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -77,6 +77,9 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_GQUOTA] = "grpquota", [XFS_SCRUB_TYPE_PQUOTA] = "prjquota", [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters", + [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck", + [XFS_SCRUB_TYPE_NLINKS] = "nlinks", + [XFS_SCRUB_TYPE_DIRTREE] = "dirtree", }; /* Format the scrub stats into a text buffer, similar to pcp style. */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 38708fb9a5d7..c848bcc07cd5 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -10,22 +10,36 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_inode.h" #include "xfs_symlink.h" +#include "xfs_health.h" +#include "xfs_symlink_remote.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/health.h" +#include "scrub/repair.h" /* Set us up to scrub a symbolic link. */ int xchk_setup_symlink( struct xfs_scrub *sc) { + unsigned int resblks = 0; + int error; + /* Allocate the buffer without the inode lock held. */ sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; - return xchk_setup_inode_contents(sc, 0); + if (xchk_could_repair(sc)) { + error = xrep_setup_symlink(sc, &resblks); + if (error) + return error; + } + + return xchk_setup_inode_contents(sc, resblks); } /* Symbolic links. */ @@ -41,29 +55,37 @@ xchk_symlink( if (!S_ISLNK(VFS_I(ip)->i_mode)) return -ENOENT; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_SYMLINK_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); len = ip->i_disk_size; /* Plausible size? */ if (len > XFS_SYMLINK_MAXLEN || len <= 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Inline symlink? */ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { if (len > xfs_inode_data_fork_size(ip) || - len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip))) + len > strnlen(ifp->if_data, xfs_inode_data_fork_size(ip))) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Remote symlink; must read the contents. */ - error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf); + error = xfs_symlink_remote_read(sc->ip, sc->buf); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); -out: - return error; + + /* If a remote symlink is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); + return 0; } diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c new file mode 100644 index 000000000000..953ce7be78dc --- /dev/null +++ b/fs/xfs/scrub/symlink_repair.c @@ -0,0 +1,510 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_symlink.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_symlink_remote.h" +#include "xfs_exchmaps.h" +#include "xfs_exchrange.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/reap.h" +#include "scrub/health.h" + +/* + * Symbolic Link Repair + * ==================== + * + * We repair symbolic links by reading whatever target data we can find, up to + * the first NULL byte. If the recovered target strlen matches i_size, then + * we rewrite the target. In all other cases, we replace the target with an + * overly long string that cannot possibly resolve. The new target is written + * into a private hidden temporary file, and then a file contents exchange + * commits the new symlink target to the file being repaired. + */ + +/* Set us up to repair the symlink file. */ +int +xrep_setup_symlink( + struct xfs_scrub *sc, + unsigned int *resblks) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks; + int error; + + error = xrep_tempfile_create(sc, S_IFLNK); + if (error) + return error; + + /* + * If we're doing a repair, we reserve enough blocks to write out a + * completely new symlink file, plus twice as many blocks as we would + * need if we can only allocate one block per data fork mapping. This + * should cover the preallocation of the temporary file and exchanging + * the extent mappings. + * + * We cannot use xfs_exchmaps_estimate because we have not yet + * constructed the replacement symlink and therefore do not know how + * many extents it will use. By the time we do, we will have a dirty + * transaction (which we cannot drop because we cannot drop the + * symlink ILOCK) and cannot ask for more reservation. + */ + blocks = xfs_symlink_blocks(sc->mp, XFS_SYMLINK_MAXLEN); + blocks += xfs_bmbt_calc_size(mp, blocks) * 2; + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + *resblks += blocks; + return 0; +} + +/* + * Try to salvage the pathname from remote blocks. Returns the number of bytes + * salvaged or a negative errno. + */ +STATIC ssize_t +xrep_symlink_salvage_remote( + struct xfs_scrub *sc) +{ + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_inode *ip = sc->ip; + struct xfs_buf *bp; + char *target_buf = sc->buf; + xfs_failaddr_t fa; + xfs_filblks_t fsblocks; + xfs_daddr_t d; + loff_t len; + loff_t offset = 0; + unsigned int byte_cnt; + bool magic_ok; + bool hdr_ok; + int n; + int nmaps = XFS_SYMLINK_MAPS; + int error; + + /* We'll only read until the buffer is full. */ + len = min_t(loff_t, ip->i_disk_size, XFS_SYMLINK_MAXLEN); + fsblocks = xfs_symlink_blocks(sc->mp, len); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + return error; + + for (n = 0; n < nmaps; n++) { + struct xfs_dsymlink_hdr *dsl; + + d = XFS_FSB_TO_DADDR(sc->mp, mval[n].br_startblock); + + /* Read the rmt block. We'll run the verifiers manually. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, + d, XFS_FSB_TO_BB(sc->mp, mval[n].br_blockcount), + 0, &bp, NULL); + if (error) + return error; + bp->b_ops = &xfs_symlink_buf_ops; + + /* How many bytes do we expect to get out of this buffer? */ + byte_cnt = XFS_FSB_TO_B(sc->mp, mval[n].br_blockcount); + byte_cnt = XFS_SYMLINK_BUF_SPACE(sc->mp, byte_cnt); + byte_cnt = min_t(unsigned int, byte_cnt, len); + + /* + * See if the verifiers accept this block. We're willing to + * salvage if the if the offset/byte/ino are ok and either the + * verifier passed or the magic is ok. Anything else and we + * stop dead in our tracks. + */ + fa = bp->b_ops->verify_struct(bp); + dsl = bp->b_addr; + magic_ok = dsl->sl_magic == cpu_to_be32(XFS_SYMLINK_MAGIC); + hdr_ok = xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp); + if (!hdr_ok || (fa != NULL && !magic_ok)) + break; + + memcpy(target_buf + offset, dsl + 1, byte_cnt); + + len -= byte_cnt; + offset += byte_cnt; + } + return offset; +} + +/* + * Try to salvage an inline symlink's contents. Returns the number of bytes + * salvaged or a negative errno. + */ +STATIC ssize_t +xrep_symlink_salvage_inline( + struct xfs_scrub *sc) +{ + struct xfs_inode *ip = sc->ip; + char *target_buf = sc->buf; + char *old_target; + struct xfs_ifork *ifp; + unsigned int nr; + + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + if (!ifp->if_data) + return 0; + + /* + * If inode repair zapped the link target, pretend that we didn't find + * any bytes at all so that we can replace the (now totally lost) link + * target with a warning message. + */ + old_target = ifp->if_data; + if (xfs_inode_has_sickness(sc->ip, XFS_SICK_INO_SYMLINK_ZAPPED) && + sc->ip->i_disk_size == 1 && old_target[0] == '?') + return 0; + + nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); + strncpy(target_buf, ifp->if_data, nr); + return nr; +} + +#define DUMMY_TARGET \ + "The target of this symbolic link could not be recovered at all and " \ + "has been replaced with this explanatory message. To avoid " \ + "accidentally pointing to an existing file path, this message is " \ + "longer than the maximum supported file name length. That is an " \ + "acceptable length for a symlink target on XFS but will produce " \ + "File Name Too Long errors if resolved." + +/* Salvage whatever we can of the target. */ +STATIC int +xrep_symlink_salvage( + struct xfs_scrub *sc) +{ + char *target_buf = sc->buf; + ssize_t buflen = 0; + + BUILD_BUG_ON(sizeof(DUMMY_TARGET) - 1 <= NAME_MAX); + + /* + * Salvage the target if there weren't any corruption problems observed + * while scanning it. + */ + if (!(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + if (sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) + buflen = xrep_symlink_salvage_inline(sc); + else + buflen = xrep_symlink_salvage_remote(sc); + if (buflen < 0) + return buflen; + + /* + * NULL-terminate the buffer because the ondisk target does not + * do that for us. If salvage didn't find the exact amount of + * data that we expected to find, don't salvage anything. + */ + target_buf[buflen] = 0; + if (strlen(target_buf) != sc->ip->i_disk_size) + buflen = 0; + } + + /* + * Change an empty target into a dummy target and clear the symlink + * target zapped flag. + */ + if (buflen == 0) { + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); + sprintf(target_buf, DUMMY_TARGET); + } + + trace_xrep_symlink_salvage_target(sc->ip, target_buf, + strlen(target_buf)); + return 0; +} + +STATIC void +xrep_symlink_local_to_remote( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp, + void *priv) +{ + struct xfs_scrub *sc = priv; + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + xfs_symlink_local_to_remote(tp, bp, ip, ifp, NULL); + + if (!xfs_has_crc(sc->mp)) + return; + + dsl->sl_owner = cpu_to_be64(sc->ip->i_ino); + xfs_trans_log_buf(tp, bp, 0, + sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1); +} + +/* + * Prepare both links' data forks for an exchange. Promote the tempfile from + * local format to extents format, and if the file being repaired has a short + * format data fork, turn it into an empty extent list. + */ +STATIC int +xrep_symlink_swap_prep( + struct xfs_scrub *sc, + bool temp_local, + bool ip_local) +{ + int error; + + /* + * If the temp link is in shortform format, convert that to a remote + * target so that we can use the atomic mapping exchange. + */ + if (temp_local) { + int logflags = XFS_ILOG_CORE; + + error = xfs_bmap_local_to_extents(sc->tp, sc->tempip, 1, + &logflags, XFS_DATA_FORK, + xrep_symlink_local_to_remote, + sc); + if (error) + return error; + + xfs_trans_log_inode(sc->tp, sc->ip, 0); + + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + /* + * If the file being repaired had a shortform data fork, convert that + * to an empty extent list in preparation for the atomic mapping + * exchange. + */ + if (ip_local) { + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + ifp->if_bytes = 0; + ifp->if_data = NULL; + ifp->if_height = 0; + + xfs_trans_log_inode(sc->tp, sc->ip, + XFS_ILOG_CORE | XFS_ILOG_DDATA); + } + + return 0; +} + +/* Exchange the temporary symlink's data fork with the one being repaired. */ +STATIC int +xrep_symlink_swap( + struct xfs_scrub *sc) +{ + struct xrep_tempexch *tx = sc->buf; + bool ip_local, temp_local; + int error; + + ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + + /* + * If the both links have a local format data fork and the rebuilt + * remote data would fit in the repaired file's data fork, copy the + * contents from the tempfile and declare ourselves done. + */ + if (ip_local && temp_local && + sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) { + xrep_tempfile_copyout_local(sc, XFS_DATA_FORK); + return 0; + } + + /* Otherwise, make sure both data forks are in block-mapping mode. */ + error = xrep_symlink_swap_prep(sc, temp_local, ip_local); + if (error) + return error; + + return xrep_tempexch_contents(sc, tx); +} + +/* + * Free all the remote blocks and reset the data fork. The caller must join + * the inode to the transaction. This function returns with the inode joined + * to a clean scrub transaction. + */ +STATIC int +xrep_symlink_reset_fork( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK); + int error; + + /* Unmap all the remote target buffers. */ + if (xfs_ifork_has_extents(ifp)) { + error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); + if (error) + return error; + } + + trace_xrep_symlink_reset_fork(sc->tempip); + + /* Reset the temp symlink target to dummy content. */ + xfs_idestroy_fork(ifp); + return xfs_symlink_write_target(sc->tp, sc->tempip, sc->tempip->i_ino, + "?", 1, 0, 0); +} + +/* + * Reinitialize a link target. Caller must ensure the inode is joined to + * the transaction. + */ +STATIC int +xrep_symlink_rebuild( + struct xfs_scrub *sc) +{ + struct xrep_tempexch *tx; + char *target_buf = sc->buf; + xfs_fsblock_t fs_blocks; + unsigned int target_len; + unsigned int resblks; + int error; + + /* How many blocks do we need? */ + target_len = strlen(target_buf); + ASSERT(target_len != 0); + if (target_len == 0 || target_len > XFS_SYMLINK_MAXLEN) + return -EFSCORRUPTED; + + trace_xrep_symlink_rebuild(sc->ip); + + /* + * In preparation to write the new symlink target to the temporary + * file, drop the ILOCK of the file being repaired (it shouldn't be + * joined) and take the ILOCK of the temporary file. + * + * The VFS does not take the IOLOCK while reading a symlink (and new + * symlinks are hidden with INEW until they've been written) so it's + * possible that a readlink() could see the old corrupted contents + * while we're doing this. + */ + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xrep_tempfile_ilock(sc); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + /* + * Reserve resources to reinitialize the target. We're allowed to + * exceed file quota to repair inconsistent metadata, though this is + * unlikely. + */ + fs_blocks = xfs_symlink_blocks(sc->mp, target_len); + resblks = xfs_symlink_space_res(sc->mp, target_len, fs_blocks); + error = xfs_trans_reserve_quota_nblks(sc->tp, sc->tempip, resblks, 0, + true); + if (error) + return error; + + /* Erase the dummy target set up by the tempfile initialization. */ + xfs_idestroy_fork(&sc->tempip->i_df); + sc->tempip->i_df.if_bytes = 0; + sc->tempip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; + + /* Write the salvaged target to the temporary link. */ + error = xfs_symlink_write_target(sc->tp, sc->tempip, sc->ip->i_ino, + target_buf, target_len, fs_blocks, resblks); + if (error) + return error; + + /* + * Commit the repair transaction so that we can use the atomic mapping + * exchange functions to compute the correct block reservations and + * re-lock the inodes. + */ + target_buf = NULL; + error = xrep_trans_commit(sc); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + xrep_tempfile_iunlock(sc); + + /* + * We're done with the temporary buffer, so we can reuse it for the + * tempfile contents exchange information. + */ + tx = sc->buf; + error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, tx); + if (error) + return error; + + /* + * Exchange the temp link's data fork with the file being repaired. + * This recreates the transaction and takes the ILOCKs of the file + * being repaired and the temporary file. + */ + error = xrep_symlink_swap(sc); + if (error) + return error; + + /* + * Release the old symlink blocks and reset the data fork of the temp + * link to an empty shortform link. This is the last repair action we + * perform on the symlink, so we don't need to clean the transaction. + */ + return xrep_symlink_reset_fork(sc); +} + +/* Repair a symbolic link. */ +int +xrep_symlink( + struct xfs_scrub *sc) +{ + int error; + + /* The rmapbt is required to reap the old data fork. */ + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + error = xrep_symlink_salvage(sc); + if (error) + return error; + + /* Now reset the target. */ + error = xrep_symlink_rebuild(sc); + if (error) + return error; + + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/tempexch.h b/fs/xfs/scrub/tempexch.h new file mode 100644 index 000000000000..995ba187c5aa --- /dev/null +++ b/fs/xfs/scrub/tempexch.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_TEMPEXCH_H__ +#define __XFS_SCRUB_TEMPEXCH_H__ + +#ifdef CONFIG_XFS_ONLINE_REPAIR +struct xrep_tempexch { + struct xfs_exchmaps_req req; +}; + +int xrep_tempexch_trans_reserve(struct xfs_scrub *sc, int whichfork, + struct xrep_tempexch *ti); +int xrep_tempexch_trans_alloc(struct xfs_scrub *sc, int whichfork, + struct xrep_tempexch *ti); + +int xrep_tempexch_contents(struct xfs_scrub *sc, struct xrep_tempexch *ti); +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_TEMPEXCH_H__ */ diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c new file mode 100644 index 000000000000..177f922acfaf --- /dev/null +++ b/fs/xfs/scrub/tempfile.c @@ -0,0 +1,852 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_dir2.h" +#include "xfs_exchrange.h" +#include "xfs_exchmaps.h" +#include "xfs_defer.h" +#include "xfs_symlink_remote.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/trace.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/xfile.h" + +/* + * Create a temporary file for reconstructing metadata, with the intention of + * atomically exchanging the temporary file's contents with the file that's + * being repaired. + */ +int +xrep_tempfile_create( + struct xfs_scrub *sc, + uint16_t mode) +{ + struct xfs_icreate_args args = { + .pip = sc->mp->m_rootip, + .mode = mode, + .flags = XFS_ICREATE_TMPFILE | XFS_ICREATE_UNLINKABLE, + }; + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = NULL; + struct xfs_dquot *udqp; + struct xfs_dquot *gdqp; + struct xfs_dquot *pdqp; + struct xfs_trans_res *tres; + struct xfs_inode *dp = mp->m_rootip; + xfs_ino_t ino; + unsigned int resblks; + bool is_dir = S_ISDIR(mode); + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + if (xfs_is_readonly(mp)) + return -EROFS; + + ASSERT(sc->tp == NULL); + ASSERT(sc->tempip == NULL); + + /* + * Make sure that we have allocated dquot(s) on disk. The temporary + * inode should be completely root owned so that we don't fail due to + * quota limits. + */ + error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp); + if (error) + return error; + + if (is_dir) { + resblks = xfs_mkdir_space_res(mp, 0); + tres = &M_RES(mp)->tr_mkdir; + } else { + resblks = XFS_IALLOC_SPACE_RES(mp); + tres = &M_RES(mp)->tr_create_tmpfile; + } + + error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, + &tp); + if (error) + goto out_release_dquots; + + /* Allocate inode, set up directory. */ + error = xfs_dialloc(&tp, &args, &ino); + if (error) + goto out_trans_cancel; + error = xfs_icreate(tp, ino, &args, &sc->tempip); + if (error) + goto out_trans_cancel; + + /* We don't touch file data, so drop the realtime flags. */ + sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT); + xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE); + + /* + * Mark our temporary file as private so that LSMs and the ACL code + * don't try to add their own metadata or reason about these files. + * The file should never be exposed to userspace. + */ + VFS_I(sc->tempip)->i_flags |= S_PRIVATE; + VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR; + + if (is_dir) { + error = xfs_dir_init(tp, sc->tempip, dp); + if (error) + goto out_trans_cancel; + } else if (S_ISLNK(VFS_I(sc->tempip)->i_mode)) { + /* + * Initialize the temporary symlink with a meaningless target + * that won't trip the verifiers. Repair must rewrite the + * target with meaningful content before swapping with the file + * being repaired. A single-byte target will not write a + * remote target block, so the owner is irrelevant. + */ + error = xfs_symlink_write_target(tp, sc->tempip, + sc->tempip->i_ino, ".", 1, 0, 0); + if (error) + goto out_trans_cancel; + } + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp); + + /* + * Put our temp file on the unlinked list so it's purged automatically. + * All file-based metadata being reconstructed using this file must be + * atomically exchanged with the original file because the contents + * here will be purged when the inode is dropped or log recovery cleans + * out the unlinked list. + */ + error = xfs_iunlink(tp, sc->tempip); + if (error) + goto out_trans_cancel; + + error = xfs_trans_commit(tp); + if (error) + goto out_release_inode; + + trace_xrep_tempfile_create(sc); + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + /* Finish setting up the incore / vfs context. */ + xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); + xfs_setup_iops(sc->tempip); + xfs_finish_inode_setup(sc->tempip); + + sc->temp_ilock_flags = 0; + return error; + +out_trans_cancel: + xfs_trans_cancel(tp); +out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (sc->tempip) { + xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); + xfs_finish_inode_setup(sc->tempip); + xchk_irele(sc, sc->tempip); + } +out_release_dquots: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + return error; +} + +/* Take IOLOCK_EXCL on the temporary file, maybe. */ +bool +xrep_tempfile_iolock_nowait( + struct xfs_scrub *sc) +{ + if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) { + sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; + return true; + } + + return false; +} + +/* + * Take the temporary file's IOLOCK while holding a different inode's IOLOCK. + * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock + * to avoid deadlocks and lockdep complaints. + */ +int +xrep_tempfile_iolock_polled( + struct xfs_scrub *sc) +{ + int error = 0; + + while (!xrep_tempfile_iolock_nowait(sc)) { + if (xchk_should_terminate(sc, &error)) + return error; + delay(1); + } + + return 0; +} + +/* Release IOLOCK_EXCL on the temporary file. */ +void +xrep_tempfile_iounlock( + struct xfs_scrub *sc) +{ + xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL); + sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL; +} + +/* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */ +void +xrep_tempfile_ilock( + struct xfs_scrub *sc) +{ + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->tempip, XFS_ILOCK_EXCL); +} + +/* Try to grab ILOCK_EXCL on the temporary file. */ +bool +xrep_tempfile_ilock_nowait( + struct xfs_scrub *sc) +{ + if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) { + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; + return true; + } + + return false; +} + +/* Unlock ILOCK_EXCL on the temporary file after an update. */ +void +xrep_tempfile_iunlock( + struct xfs_scrub *sc) +{ + xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); + sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL; +} + +/* + * Begin the process of making changes to both the file being scrubbed and + * the temporary file by taking ILOCK_EXCL on both. + */ +void +xrep_tempfile_ilock_both( + struct xfs_scrub *sc) +{ + xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip, XFS_ILOCK_EXCL); + sc->ilock_flags |= XFS_ILOCK_EXCL; + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; +} + +/* Unlock ILOCK_EXCL on both files. */ +void +xrep_tempfile_iunlock_both( + struct xfs_scrub *sc) +{ + xrep_tempfile_iunlock(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); +} + +/* Release the temporary file. */ +void +xrep_tempfile_rele( + struct xfs_scrub *sc) +{ + if (!sc->tempip) + return; + + if (sc->temp_ilock_flags) { + xfs_iunlock(sc->tempip, sc->temp_ilock_flags); + sc->temp_ilock_flags = 0; + } + + xchk_irele(sc, sc->tempip); + sc->tempip = NULL; +} + +/* + * Make sure that the given range of the data fork of the temporary file is + * mapped to written blocks. The caller must ensure that both inodes are + * joined to the transaction. + */ +int +xrep_tempfile_prealloc( + struct xfs_scrub *sc, + xfs_fileoff_t off, + xfs_filblks_t len) +{ + struct xfs_bmbt_irec map; + xfs_fileoff_t end = off + len; + int error; + + ASSERT(sc->tempip != NULL); + ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip)); + + for (; off < end; off = map.br_startoff + map.br_blockcount) { + int nmaps = 1; + + /* + * If we have a real extent mapping this block then we're + * in ok shape. + */ + error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps, + XFS_DATA_FORK); + if (error) + return error; + if (nmaps == 0) { + ASSERT(nmaps != 0); + return -EFSCORRUPTED; + } + + if (xfs_bmap_is_written_extent(&map)) + continue; + + /* + * If we find a delalloc reservation then something is very + * very wrong. Bail out. + */ + if (map.br_startblock == DELAYSTARTBLOCK) + return -EFSCORRUPTED; + + /* + * Make sure this block has a real zeroed extent allocated to + * it. + */ + nmaps = 1; + error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map, + &nmaps); + if (error) + return error; + if (nmaps != 1) + return -EFSCORRUPTED; + + trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map); + + /* Commit new extent and all deferred work. */ + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + return 0; +} + +/* + * Write data to each block of a file. The given range of the tempfile's data + * fork must already be populated with written extents. + */ +int +xrep_tempfile_copyin( + struct xfs_scrub *sc, + xfs_fileoff_t off, + xfs_filblks_t len, + xrep_tempfile_copyin_fn prep_fn, + void *data) +{ + LIST_HEAD(buffers_list); + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + xfs_fileoff_t flush_mask; + xfs_fileoff_t end = off + len; + loff_t pos = XFS_FSB_TO_B(mp, off); + int error = 0; + + ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode)); + + /* Flush buffers to disk every 512K */ + flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1; + + for (; off < end; off++, pos += mp->m_sb.sb_blocksize) { + struct xfs_bmbt_irec map; + int nmaps = 1; + + /* Read block mapping for this file block. */ + error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0); + if (error) + goto out_err; + if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) { + error = -EFSCORRUPTED; + goto out_err; + } + + /* Get the metadata buffer for this offset in the file. */ + error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), + mp->m_bsize, 0, &bp); + if (error) + goto out_err; + + trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map); + + /* Read in a block's worth of data from the xfile. */ + error = prep_fn(sc, bp, data); + if (error) { + xfs_trans_brelse(sc->tp, bp); + goto out_err; + } + + /* Queue buffer, and flush if we have too much dirty data. */ + xfs_buf_delwri_queue_here(bp, &buffers_list); + xfs_trans_brelse(sc->tp, bp); + + if (!(off & flush_mask)) { + error = xfs_buf_delwri_submit(&buffers_list); + if (error) + goto out_err; + } + } + + /* + * Write the new blocks to disk. If the ordered list isn't empty after + * that, then something went wrong and we have to fail. This should + * never happen, but we'll check anyway. + */ + error = xfs_buf_delwri_submit(&buffers_list); + if (error) + goto out_err; + + if (!list_empty(&buffers_list)) { + ASSERT(list_empty(&buffers_list)); + error = -EIO; + goto out_err; + } + + return 0; + +out_err: + xfs_buf_delwri_cancel(&buffers_list); + return error; +} + +/* + * Set the temporary file's size. Caller must join the tempfile to the scrub + * transaction and is responsible for adjusting block mappings as needed. + */ +int +xrep_tempfile_set_isize( + struct xfs_scrub *sc, + unsigned long long isize) +{ + if (sc->tempip->i_disk_size == isize) + return 0; + + sc->tempip->i_disk_size = isize; + i_size_write(VFS_I(sc->tempip), isize); + return xrep_tempfile_roll_trans(sc); +} + +/* + * Roll a repair transaction involving the temporary file. Caller must join + * both the temporary file and the file being scrubbed to the transaction. + * This function return with both inodes joined to a new scrub transaction, + * or the usual negative errno. + */ +int +xrep_tempfile_roll_trans( + struct xfs_scrub *sc) +{ + int error; + + xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE); + error = xrep_roll_trans(sc); + if (error) + return error; + + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + return 0; +} + +/* + * Fill out the mapping exchange request in preparation for atomically + * committing the contents of a metadata file that we've rebuilt in the temp + * file. + */ +STATIC int +xrep_tempexch_prep_request( + struct xfs_scrub *sc, + int whichfork, + struct xrep_tempexch *tx) +{ + struct xfs_exchmaps_req *req = &tx->req; + + memset(tx, 0, sizeof(struct xrep_tempexch)); + + /* COW forks don't exist on disk. */ + if (whichfork == XFS_COW_FORK) { + ASSERT(0); + return -EINVAL; + } + + /* Both files should have the relevant forks. */ + if (!xfs_ifork_ptr(sc->ip, whichfork) || + !xfs_ifork_ptr(sc->tempip, whichfork)) { + ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL); + ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL); + return -EINVAL; + } + + /* Exchange all mappings in both forks. */ + req->ip1 = sc->tempip; + req->ip2 = sc->ip; + req->startoff1 = 0; + req->startoff2 = 0; + switch (whichfork) { + case XFS_ATTR_FORK: + req->flags |= XFS_EXCHMAPS_ATTR_FORK; + break; + case XFS_DATA_FORK: + /* Always exchange sizes when exchanging data fork mappings. */ + req->flags |= XFS_EXCHMAPS_SET_SIZES; + break; + } + req->blockcount = XFS_MAX_FILEOFF; + + return 0; +} + +/* + * Fill out the mapping exchange resource estimation structures in preparation + * for exchanging the contents of a metadata file that we've rebuilt in the + * temp file. Caller must hold IOLOCK_EXCL but not ILOCK_EXCL on both files. + */ +STATIC int +xrep_tempexch_estimate( + struct xfs_scrub *sc, + struct xrep_tempexch *tx) +{ + struct xfs_exchmaps_req *req = &tx->req; + struct xfs_ifork *ifp; + struct xfs_ifork *tifp; + int whichfork = xfs_exchmaps_reqfork(req); + int state = 0; + + /* + * The exchmaps code only knows how to exchange file fork space + * mappings. Any fork data in local format must be promoted to a + * single block before the exchange can take place. + */ + ifp = xfs_ifork_ptr(sc->ip, whichfork); + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) + state |= 1; + + tifp = xfs_ifork_ptr(sc->tempip, whichfork); + if (tifp->if_format == XFS_DINODE_FMT_LOCAL) + state |= 2; + + switch (state) { + case 0: + /* Both files have mapped extents; use the regular estimate. */ + return xfs_exchrange_estimate(req); + case 1: + /* + * The file being repaired is in local format, but the temp + * file has mapped extents. To perform the exchange, the file + * being repaired must have its shorform data converted to an + * ondisk block so that the forks will be in extents format. + * We need one resblk for the conversion; the number of + * exchanges is (worst case) the temporary file's extent count + * plus the block we converted. + */ + req->ip1_bcount = sc->tempip->i_nblocks; + req->ip2_bcount = 1; + req->nr_exchanges = 1 + tifp->if_nextents; + req->resblks = 1; + break; + case 2: + /* + * The temporary file is in local format, but the file being + * repaired has mapped extents. To perform the exchange, the + * temp file must have its shortform data converted to an + * ondisk block, and the fork changed to extents format. We + * need one resblk for the conversion; the number of exchanges + * is (worst case) the extent count of the file being repaired + * plus the block we converted. + */ + req->ip1_bcount = 1; + req->ip2_bcount = sc->ip->i_nblocks; + req->nr_exchanges = 1 + ifp->if_nextents; + req->resblks = 1; + break; + case 3: + /* + * Both forks are in local format. To perform the exchange, + * both files must have their shortform data converted to + * fsblocks, and both forks must be converted to extents + * format. We need two resblks for the two conversions, and + * the number of exchanges is 1 since there's only one block at + * fileoff 0. Presumably, the caller could not exchange the + * two inode fork areas directly. + */ + req->ip1_bcount = 1; + req->ip2_bcount = 1; + req->nr_exchanges = 1; + req->resblks = 2; + break; + } + + return xfs_exchmaps_estimate_overhead(req); +} + +/* + * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip + * this if quota enforcement is disabled or if both inodes' dquots are the + * same. The qretry structure must be initialized to zeroes before the first + * call to this function. + */ +STATIC int +xrep_tempexch_reserve_quota( + struct xfs_scrub *sc, + const struct xrep_tempexch *tx) +{ + struct xfs_trans *tp = sc->tp; + const struct xfs_exchmaps_req *req = &tx->req; + int64_t ddelta, rdelta; + int error; + + /* + * Don't bother with a quota reservation if we're not enforcing them + * or the two inodes have the same dquots. + */ + if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || + (req->ip1->i_udquot == req->ip2->i_udquot && + req->ip1->i_gdquot == req->ip2->i_gdquot && + req->ip1->i_pdquot == req->ip2->i_pdquot)) + return 0; + + /* + * Quota reservation for each file comes from two sources. First, we + * need to account for any net gain in mapped blocks during the + * exchange. Second, we need reservation for the gross gain in mapped + * blocks so that we don't trip over any quota block reservation + * assertions. We must reserve the gross gain because the quota code + * subtracts from bcount the number of blocks that we unmap; it does + * not add that quantity back to the quota block reservation. + */ + ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount); + rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount); + error = xfs_trans_reserve_quota_nblks(tp, req->ip1, + ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount, + true); + if (error) + return error; + + ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount); + rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount); + return xfs_trans_reserve_quota_nblks(tp, req->ip2, + ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount, + true); +} + +/* + * Prepare an existing transaction for an atomic file contents exchange. + * + * This function fills out the mapping exchange request and resource estimation + * structures in preparation for exchanging the contents of a metadata file + * that has been rebuilt in the temp file. Next, it reserves space and quota + * for the transaction. + * + * The caller must hold ILOCK_EXCL of the scrub target file and the temporary + * file. The caller must join both inodes to the transaction with no unlock + * flags, and is responsible for dropping both ILOCKs when appropriate. Only + * use this when those ILOCKs cannot be dropped. + */ +int +xrep_tempexch_trans_reserve( + struct xfs_scrub *sc, + int whichfork, + struct xrep_tempexch *tx) +{ + int error; + + ASSERT(sc->tp != NULL); + xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL); + + error = xrep_tempexch_prep_request(sc, whichfork, tx); + if (error) + return error; + + error = xfs_exchmaps_estimate(&tx->req); + if (error) + return error; + + error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0); + if (error) + return error; + + return xrep_tempexch_reserve_quota(sc, tx); +} + +/* + * Create a new transaction for a file contents exchange. + * + * This function fills out the mapping excahange request and resource + * estimation structures in preparation for exchanging the contents of a + * metadata file that has been rebuilt in the temp file. Next, it reserves + * space, takes ILOCK_EXCL of both inodes, joins them to the transaction and + * reserves quota for the transaction. + * + * The caller is responsible for dropping both ILOCKs when appropriate. + */ +int +xrep_tempexch_trans_alloc( + struct xfs_scrub *sc, + int whichfork, + struct xrep_tempexch *tx) +{ + unsigned int flags = 0; + int error; + + ASSERT(sc->tp == NULL); + ASSERT(xfs_has_exchange_range(sc->mp)); + + error = xrep_tempexch_prep_request(sc, whichfork, tx); + if (error) + return error; + + error = xrep_tempexch_estimate(sc, tx); + if (error) + return error; + + if (xfs_has_lazysbcount(sc->mp)) + flags |= XFS_TRANS_RES_FDBLKS; + + error = xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, + tx->req.resblks, 0, flags, &sc->tp); + if (error) + return error; + + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; + sc->ilock_flags |= XFS_ILOCK_EXCL; + xfs_exchrange_ilock(sc->tp, sc->ip, sc->tempip); + + return xrep_tempexch_reserve_quota(sc, tx); +} + +/* + * Exchange file mappings (and hence file contents) between the file being + * repaired and the temporary file. Returns with both inodes locked and joined + * to a clean scrub transaction. + */ +int +xrep_tempexch_contents( + struct xfs_scrub *sc, + struct xrep_tempexch *tx) +{ + int error; + + ASSERT(xfs_has_exchange_range(sc->mp)); + + xfs_exchange_mappings(sc->tp, &tx->req); + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* + * If we exchanged the ondisk sizes of two metadata files, we must + * exchanged the incore sizes as well. + */ + if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) { + loff_t temp; + + temp = i_size_read(VFS_I(sc->ip)); + i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip))); + i_size_write(VFS_I(sc->tempip), temp); + } + + return 0; +} + +/* + * Write local format data from one of the temporary file's forks into the same + * fork of file being repaired, and exchange the file sizes, if appropriate. + * Caller must ensure that the file being repaired has enough fork space to + * hold all the bytes. + */ +void +xrep_tempfile_copyout_local( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_ifork *temp_ifp; + struct xfs_ifork *ifp; + unsigned int ilog_flags = XFS_ILOG_CORE; + + temp_ifp = xfs_ifork_ptr(sc->tempip, whichfork); + ifp = xfs_ifork_ptr(sc->ip, whichfork); + + ASSERT(temp_ifp != NULL); + ASSERT(ifp != NULL); + ASSERT(temp_ifp->if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); + + switch (whichfork) { + case XFS_DATA_FORK: + ASSERT(sc->tempip->i_disk_size <= + xfs_inode_data_fork_size(sc->ip)); + break; + case XFS_ATTR_FORK: + ASSERT(sc->tempip->i_forkoff >= sc->ip->i_forkoff); + break; + default: + ASSERT(0); + return; + } + + /* Recreate @sc->ip's incore fork (ifp) with data from temp_ifp. */ + xfs_idestroy_fork(ifp); + xfs_init_local_fork(sc->ip, whichfork, temp_ifp->if_data, + temp_ifp->if_bytes); + + if (whichfork == XFS_DATA_FORK) { + i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip))); + sc->ip->i_disk_size = sc->tempip->i_disk_size; + } + + ilog_flags |= xfs_ilog_fdata(whichfork); + xfs_trans_log_inode(sc->tp, sc->ip, ilog_flags); +} + +/* Decide if a given XFS inode is a temporary file for a repair. */ +bool +xrep_is_tempfile( + const struct xfs_inode *ip) +{ + const struct inode *inode = &ip->i_vnode; + + if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR)) + return true; + + return false; +} diff --git a/fs/xfs/scrub/tempfile.h b/fs/xfs/scrub/tempfile.h new file mode 100644 index 000000000000..e51399f595fe --- /dev/null +++ b/fs/xfs/scrub/tempfile.h @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_TEMPFILE_H__ +#define __XFS_SCRUB_TEMPFILE_H__ + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_tempfile_create(struct xfs_scrub *sc, uint16_t mode); +void xrep_tempfile_rele(struct xfs_scrub *sc); + +bool xrep_tempfile_iolock_nowait(struct xfs_scrub *sc); +int xrep_tempfile_iolock_polled(struct xfs_scrub *sc); +void xrep_tempfile_iounlock(struct xfs_scrub *sc); + +void xrep_tempfile_ilock(struct xfs_scrub *sc); +bool xrep_tempfile_ilock_nowait(struct xfs_scrub *sc); +void xrep_tempfile_iunlock(struct xfs_scrub *sc); +void xrep_tempfile_iunlock_both(struct xfs_scrub *sc); +void xrep_tempfile_ilock_both(struct xfs_scrub *sc); + +int xrep_tempfile_prealloc(struct xfs_scrub *sc, xfs_fileoff_t off, + xfs_filblks_t len); + +enum xfs_blft; + +typedef int (*xrep_tempfile_copyin_fn)(struct xfs_scrub *sc, + struct xfs_buf *bp, void *data); + +int xrep_tempfile_copyin(struct xfs_scrub *sc, xfs_fileoff_t off, + xfs_filblks_t len, xrep_tempfile_copyin_fn fn, void *data); + +int xrep_tempfile_set_isize(struct xfs_scrub *sc, unsigned long long isize); + +int xrep_tempfile_roll_trans(struct xfs_scrub *sc); +void xrep_tempfile_copyout_local(struct xfs_scrub *sc, int whichfork); +bool xrep_is_tempfile(const struct xfs_inode *ip); +#else +static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc) +{ + xchk_ilock(sc, XFS_IOLOCK_EXCL); +} +# define xrep_is_tempfile(ip) (false) +# define xrep_tempfile_rele(sc) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_TEMPFILE_H__ */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 46249e7b17e0..4470ad0533b8 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -13,9 +13,25 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "xfs_quota.h" +#include "xfs_quota_defs.h" +#include "xfs_da_format.h" +#include "xfs_dir2.h" +#include "xfs_rmap.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" +#include "scrub/quota.h" +#include "scrub/iscan.h" +#include "scrub/orphanage.h" +#include "scrub/nlinks.h" +#include "scrub/fscounters.h" +#include "scrub/bitmap.h" +#include "scrub/ino_bitmap.h" +#include "scrub/xfblob.h" +#include "scrub/dirtree.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t @@ -28,7 +44,7 @@ xchk_btree_cur_fsbno( xfs_buf_daddr(cur->bc_levels[level].bp)); if (level == cur->bc_nlevels - 1 && - (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) + cur->bc_ops->type == XFS_BTREE_TYPE_INODE) return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); return NULLFSBLOCK; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index b1e6879a0731..2fbc8508ccdf 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -15,10 +15,21 @@ #include <linux/tracepoint.h> #include "xfs_bit.h" +#include "xfs_quota_defs.h" +struct xfs_scrub; struct xfile; struct xfarray; struct xfarray_sortinfo; +struct xchk_dqiter; +struct xchk_iscan; +struct xchk_nlink; +struct xchk_fscounters; +struct xfs_rmap_update_params; +struct xfs_parent_rec; +enum xchk_dirpath_outcome; +struct xchk_dirtree; +struct xchk_dirtree_outcomes; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -26,14 +37,6 @@ struct xfarray_sortinfo; * ring buffer. Somehow this was only worth mentioning in the ftrace sample * code. */ -TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi); -TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi); -TRACE_DEFINE_ENUM(XFS_BTNUM_INOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); -TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); - TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); @@ -62,6 +65,11 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -88,7 +96,12 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \ { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \ { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \ - { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" } + { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \ + { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \ + { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \ + { XFS_SCRUB_TYPE_HEALTHY, "healthy" }, \ + { XFS_SCRUB_TYPE_DIRTREE, "dirtree" }, \ + { XFS_SCRUB_TYPE_BARRIER, "barrier" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ @@ -106,8 +119,21 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ + { XCHK_FSGATES_QUOTA, "fsgates_quota" }, \ + { XCHK_FSGATES_DIRENTS, "fsgates_dirents" }, \ + { XCHK_FSGATES_RMAP, "fsgates_rmap" }, \ + { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \ { XREP_ALREADY_FIXED, "already_fixed" } +TRACE_DEFINE_ENUM(XFS_RMAP_MAP); +TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP); +TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT); +TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_ALLOC); +TRACE_DEFINE_ENUM(XFS_RMAP_FREE); + DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, int error), @@ -151,6 +177,8 @@ DEFINE_EVENT(xchk_class, name, \ DEFINE_SCRUB_EVENT(xchk_start); DEFINE_SCRUB_EVENT(xchk_done); DEFINE_SCRUB_EVENT(xchk_deadlock_retry); +DEFINE_SCRUB_EVENT(xchk_dirtree_start); +DEFINE_SCRUB_EVENT(xchk_dirtree_done); DEFINE_SCRUB_EVENT(xrep_attempt); DEFINE_SCRUB_EVENT(xrep_done); @@ -181,6 +209,81 @@ DEFINE_EVENT(xchk_fsgate_class, name, \ DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable); DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable); +DECLARE_EVENT_CLASS(xchk_vector_head_class, + TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead), + TP_ARGS(ip, vhead), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, inum) + __field(unsigned int, gen) + __field(unsigned int, flags) + __field(unsigned short, rest_us) + __field(unsigned short, nr_vecs) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = vhead->svh_agno; + __entry->inum = vhead->svh_ino; + __entry->gen = vhead->svh_gen; + __entry->flags = vhead->svh_flags; + __entry->rest_us = vhead->svh_rest_us; + __entry->nr_vecs = vhead->svh_nr; + ), + TP_printk("dev %d:%d ino 0x%llx agno 0x%x inum 0x%llx gen 0x%x flags 0x%x rest_us %u nr_vecs %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->inum, + __entry->gen, + __entry->flags, + __entry->rest_us, + __entry->nr_vecs) +) +#define DEFINE_SCRUBV_HEAD_EVENT(name) \ +DEFINE_EVENT(xchk_vector_head_class, name, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead), \ + TP_ARGS(ip, vhead)) + +DEFINE_SCRUBV_HEAD_EVENT(xchk_scrubv_start); + +DECLARE_EVENT_CLASS(xchk_vector_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead, + unsigned int vec_nr, struct xfs_scrub_vec *v), + TP_ARGS(mp, vhead, vec_nr, v), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, vec_nr) + __field(unsigned int, vec_type) + __field(unsigned int, vec_flags) + __field(int, vec_ret) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->vec_nr = vec_nr; + __entry->vec_type = v->sv_type; + __entry->vec_flags = v->sv_flags; + __entry->vec_ret = v->sv_ret; + ), + TP_printk("dev %d:%d vec[%u] type %s flags %s ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->vec_nr, + __print_symbolic(__entry->vec_type, XFS_SCRUB_TYPE_STRINGS), + __print_flags(__entry->vec_flags, "|", XFS_SCRUB_FLAG_STRINGS), + __entry->vec_ret) +) +#define DEFINE_SCRUBV_EVENT(name) \ +DEFINE_EVENT(xchk_vector_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead, \ + unsigned int vec_nr, struct xfs_scrub_vec *v), \ + TP_ARGS(mp, vhead, vec_nr, v)) + +DEFINE_SCRUBV_EVENT(xchk_scrubv_barrier_fail); +DEFINE_SCRUBV_EVENT(xchk_scrubv_item); +DEFINE_SCRUBV_EVENT(xchk_scrubv_outcome); + TRACE_EVENT(xchk_op_error, TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int error, void *ret_ip), @@ -346,6 +449,78 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \ DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error); DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning); +DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_preen); + +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xchk_dqiter_class, + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), + TP_ARGS(cursor, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, dqtype) + __field(xfs_ino_t, ino) + __field(unsigned long long, cur_id) + __field(unsigned long long, id) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + ), + TP_fast_assign( + __entry->dev = cursor->sc->mp->m_super->s_dev; + __entry->dqtype = cursor->dqtype; + __entry->ino = cursor->quota_ip->i_ino; + __entry->cur_id = cursor->id; + __entry->startoff = cursor->bmap.br_startoff; + __entry->startblock = cursor->bmap.br_startblock; + __entry->blockcount = cursor->bmap.br_blockcount; + __entry->state = cursor->bmap.br_state; + __entry->id = id; + ), + TP_printk("dev %d:%d dquot type %s ino 0x%llx cursor_id 0x%llx startoff 0x%llx startblock 0x%llx blockcount 0x%llx state %u id 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS), + __entry->ino, + __entry->cur_id, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->id) +); + +#define DEFINE_SCRUB_DQITER_EVENT(name) \ +DEFINE_EVENT(xchk_dqiter_class, name, \ + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), \ + TP_ARGS(cursor, id)) +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter); + +TRACE_EVENT(xchk_qcheck_error, + TP_PROTO(struct xfs_scrub *sc, xfs_dqtype_t dqtype, xfs_dqid_t id, + void *ret_ip), + TP_ARGS(sc, dqtype, id, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, dqtype) + __field(xfs_dqid_t, id) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->dqtype = dqtype; + __entry->id = id; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d dquot type %s id 0x%x ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS), + __entry->id, + __entry->ret_ip) +); +#endif /* CONFIG_XFS_QUOTA */ TRACE_EVENT(xchk_incomplete, TP_PROTO(struct xfs_scrub *sc, void *ret_ip), @@ -373,7 +548,7 @@ TRACE_EVENT(xchk_btree_op_error, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) @@ -386,7 +561,7 @@ TRACE_EVENT(xchk_btree_op_error, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -394,10 +569,10 @@ TRACE_EVENT(xchk_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", + TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -415,7 +590,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __field(xfs_ino_t, ino) __field(int, whichfork) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(int, ptr) __field(xfs_agnumber_t, agno) @@ -426,10 +601,10 @@ TRACE_EVENT(xchk_ifork_btree_op_error, TP_fast_assign( xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; - __entry->ino = sc->ip->i_ino; + __entry->ino = cur->bc_ino.ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name); __entry->level = level; __entry->ptr = cur->bc_levels[level].ptr; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); @@ -437,12 +612,12 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -458,7 +633,7 @@ TRACE_EVENT(xchk_btree_error, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) @@ -469,17 +644,17 @@ TRACE_EVENT(xchk_btree_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->ptr = cur->bc_levels[level].ptr; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", + TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -496,7 +671,7 @@ TRACE_EVENT(xchk_ifork_btree_error, __field(xfs_ino_t, ino) __field(int, whichfork) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) @@ -506,22 +681,22 @@ TRACE_EVENT(xchk_ifork_btree_error, TP_fast_assign( xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; - __entry->ino = cur->bc_ino.ip->i_ino; + __entry->ino = sc->ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->ptr = cur->bc_levels[level].ptr; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -536,7 +711,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, TP_STRUCT__entry( __field(dev_t, dev) __field(int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) __field(int, level) @@ -548,17 +723,17 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_levels[level].ptr; ), - TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d", + TP_printk("dev %d:%d type %s %sbt agno 0x%x agbno 0x%x level %d nlevels %d ptr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->agno, __entry->bno, __entry->level, @@ -809,18 +984,11 @@ TRACE_EVENT(xfile_destroy, __field(loff_t, size) ), TP_fast_assign( - struct xfile_stat statbuf; - int ret; + struct inode *inode = file_inode(xf->file); - ret = xfile_stat(xf, &statbuf); - if (!ret) { - __entry->bytes = statbuf.bytes; - __entry->size = statbuf.size; - } else { - __entry->bytes = -1; - __entry->size = -1; - } - __entry->ino = file_inode(xf->file)->i_ino; + __entry->ino = inode->i_ino; + __entry->bytes = inode->i_blocks << SECTOR_SHIFT; + __entry->size = i_size_read(inode); ), TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx", __entry->ino, @@ -839,19 +1007,12 @@ DECLARE_EVENT_CLASS(xfile_class, __field(unsigned long long, bytecount) ), TP_fast_assign( - struct xfile_stat statbuf; - int ret; + struct inode *inode = file_inode(xf->file); - ret = xfile_stat(xf, &statbuf); - if (!ret) { - __entry->bytes_used = statbuf.bytes; - __entry->size = statbuf.size; - } else { - __entry->bytes_used = -1; - __entry->size = -1; - } - __entry->ino = file_inode(xf->file)->i_ino; + __entry->ino = inode->i_ino; + __entry->bytes_used = inode->i_blocks << SECTOR_SHIFT; __entry->pos = pos; + __entry->size = i_size_read(inode); __entry->bytecount = bytecount; ), TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx", @@ -865,11 +1026,12 @@ DECLARE_EVENT_CLASS(xfile_class, DEFINE_EVENT(xfile_class, name, \ TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \ TP_ARGS(xf, pos, bytecount)) -DEFINE_XFILE_EVENT(xfile_pread); -DEFINE_XFILE_EVENT(xfile_pwrite); +DEFINE_XFILE_EVENT(xfile_load); +DEFINE_XFILE_EVENT(xfile_store); DEFINE_XFILE_EVENT(xfile_seek_data); -DEFINE_XFILE_EVENT(xfile_get_page); -DEFINE_XFILE_EVENT(xfile_put_page); +DEFINE_XFILE_EVENT(xfile_get_folio); +DEFINE_XFILE_EVENT(xfile_put_folio); +DEFINE_XFILE_EVENT(xfile_discard); TRACE_EVENT(xfarray_create, TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity), @@ -916,7 +1078,7 @@ TRACE_EVENT(xfarray_isort, __entry->hi - __entry->lo) ); -TRACE_EVENT(xfarray_pagesort, +TRACE_EVENT(xfarray_foliosort, TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), TP_ARGS(si, lo, hi), TP_STRUCT__entry( @@ -987,6 +1149,47 @@ TRACE_EVENT(xfarray_sort, __entry->bytes) ); +TRACE_EVENT(xfarray_sort_scan, + TP_PROTO(struct xfarray_sortinfo *si, unsigned long long idx), + TP_ARGS(si, idx), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, nr) + __field(size_t, obj_size) + __field(unsigned long long, idx) + __field(unsigned long long, folio_pos) + __field(unsigned long, folio_bytes) + __field(unsigned long long, first_idx) + __field(unsigned long long, last_idx) + ), + TP_fast_assign( + __entry->nr = si->array->nr; + __entry->obj_size = si->array->obj_size; + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->idx = idx; + if (si->folio) { + __entry->folio_pos = folio_pos(si->folio); + __entry->folio_bytes = folio_size(si->folio); + __entry->first_idx = si->first_folio_idx; + __entry->last_idx = si->last_folio_idx; + } else { + __entry->folio_pos = 0; + __entry->folio_bytes = 0; + __entry->first_idx = 0; + __entry->last_idx = 0; + } + ), + TP_printk("xfino 0x%lx nr %llu objsz %zu idx %llu folio_pos 0x%llx folio_bytes 0x%lx first_idx %llu last_idx %llu", + __entry->ino, + __entry->nr, + __entry->obj_size, + __entry->idx, + __entry->folio_pos, + __entry->folio_bytes, + __entry->first_idx, + __entry->last_idx) +); + TRACE_EVENT(xfarray_sort_stats, TP_PROTO(struct xfarray_sortinfo *si, int error), TP_ARGS(si, error), @@ -1034,18 +1237,18 @@ TRACE_EVENT(xfarray_sort_stats, #ifdef CONFIG_XFS_RT TRACE_EVENT(xchk_rtsum_record_free, - TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t start, + TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, xfs_rtbxlen_t len, unsigned int log, loff_t pos, - xfs_suminfo_t v), - TP_ARGS(mp, start, len, log, pos, v), + xfs_suminfo_t value), + TP_ARGS(mp, start, len, log, pos, value), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, rtdev) - __field(xfs_rtblock_t, start) + __field(xfs_rtxnum_t, start) __field(unsigned long long, len) __field(unsigned int, log) __field(loff_t, pos) - __field(xfs_suminfo_t, v) + __field(xfs_suminfo_t, value) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; @@ -1054,7 +1257,7 @@ TRACE_EVENT(xchk_rtsum_record_free, __entry->len = len; __entry->log = log; __entry->pos = pos; - __entry->v = v; + __entry->value = value; ), TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -1063,10 +1266,654 @@ TRACE_EVENT(xchk_rtsum_record_free, __entry->len, __entry->log, __entry->pos, - __entry->v) + __entry->value) ); #endif /* CONFIG_XFS_RT */ +DECLARE_EVENT_CLASS(xchk_iscan_class, + TP_PROTO(struct xchk_iscan *iscan), + TP_ARGS(iscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited) +) +#define DEFINE_ISCAN_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_class, name, \ + TP_PROTO(struct xchk_iscan *iscan), \ + TP_ARGS(iscan)) +DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor); +DEFINE_ISCAN_EVENT(xchk_iscan_visit); +DEFINE_ISCAN_EVENT(xchk_iscan_skip); +DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag); + +DECLARE_EVENT_CLASS(xchk_iscan_ino_class, + TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), + TP_ARGS(iscan, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, startino) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->startino = iscan->scan_start_ino; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->ino = ino; + ), + TP_printk("dev %d:%d iscan start 0x%llx cursor 0x%llx visited 0x%llx ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->startino, + __entry->cursor, + __entry->visited, + __entry->ino) +) +#define DEFINE_ISCAN_INO_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_ino_class, name, \ + TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), \ + TP_ARGS(iscan, ino)) +DEFINE_ISCAN_INO_EVENT(xchk_iscan_want_live_update); +DEFINE_ISCAN_INO_EVENT(xchk_iscan_start); + +TRACE_EVENT(xchk_iscan_iget, + TP_PROTO(struct xchk_iscan *iscan, int error), + TP_ARGS(iscan, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->error = error; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->error) +); + +TRACE_EVENT(xchk_iscan_iget_batch, + TP_PROTO(struct xfs_mount *mp, struct xchk_iscan *iscan, + unsigned int nr, unsigned int avail), + TP_ARGS(mp, iscan, nr, avail), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(unsigned int, nr) + __field(unsigned int, avail) + __field(unsigned int, unavail) + __field(xfs_ino_t, batch_ino) + __field(unsigned long long, skipmask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->nr = nr; + __entry->avail = avail; + __entry->unavail = hweight64(iscan->__skipped_inomask); + __entry->batch_ino = iscan->__batch_ino; + __entry->skipmask = iscan->__skipped_inomask; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx batchino 0x%llx skipmask 0x%llx nr %u avail %u unavail %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->batch_ino, + __entry->skipmask, + __entry->nr, + __entry->avail, + __entry->unavail) +); + +DECLARE_EVENT_CLASS(xchk_iscan_retry_wait_class, + TP_PROTO(struct xchk_iscan *iscan), + TP_ARGS(iscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(unsigned int, retry_delay) + __field(unsigned long, remaining) + __field(unsigned int, iget_timeout) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->retry_delay = iscan->iget_retry_delay; + __entry->remaining = jiffies_to_msecs(iscan->__iget_deadline - jiffies); + __entry->iget_timeout = iscan->iget_timeout; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx remaining %lu timeout %u delay %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->remaining, + __entry->iget_timeout, + __entry->retry_delay) +) +#define DEFINE_ISCAN_RETRY_WAIT_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_retry_wait_class, name, \ + TP_PROTO(struct xchk_iscan *iscan), \ + TP_ARGS(iscan)) +DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_iget_retry_wait); +DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_agi_retry_wait); + +TRACE_EVENT(xchk_nlinks_collect_dirent, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, + xfs_ino_t ino, const struct xfs_name *name), + TP_ARGS(mp, dp, ino, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp->i_ino; + __entry->ino = ino; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_nlinks_collect_pptr, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, + const struct xfs_name *name, + const struct xfs_parent_rec *pptr), + TP_ARGS(mp, dp, name, pptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp->i_ino; + __entry->ino = be64_to_cpu(pptr->p_ino); + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_nlinks_collect_metafile, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino), + TP_ARGS(mp, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +); + +TRACE_EVENT(xchk_nlinks_live_update, + TP_PROTO(struct xfs_mount *mp, const struct xfs_inode *dp, + int action, xfs_ino_t ino, int delta, + const char *name, unsigned int namelen), + TP_ARGS(mp, dp, action, ino, delta, name, namelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(int, action) + __field(xfs_ino_t, ino) + __field(int, delta) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp ? dp->i_ino : NULLFSINO; + __entry->action = action; + __entry->ino = ino; + __entry->delta = delta; + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + ), + TP_printk("dev %d:%d dir 0x%llx ino 0x%llx nlink_delta %d name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->delta, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_nlinks_check_zero, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *live), + TP_ARGS(mp, ino, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + ), + TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parents, + __entry->backrefs, + __entry->children) +); + +TRACE_EVENT(xchk_nlinks_update_incore, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *live, int parents_delta, + int backrefs_delta, int children_delta), + TP_ARGS(mp, ino, live, parents_delta, backrefs_delta, children_delta), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + __field(int, parents_delta) + __field(int, backrefs_delta) + __field(int, children_delta) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + __entry->parents_delta = parents_delta; + __entry->backrefs_delta = backrefs_delta; + __entry->children_delta = children_delta; + ), + TP_printk("dev %d:%d ino 0x%llx parents %d:%u backrefs %d:%u children %d:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parents_delta, + __entry->parents, + __entry->backrefs_delta, + __entry->backrefs, + __entry->children_delta, + __entry->children) +); + +DECLARE_EVENT_CLASS(xchk_nlinks_diff_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, + const struct xchk_nlink *live), + TP_ARGS(mp, ip, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(uint8_t, ftype) + __field(xfs_nlink_t, nlink) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->ftype = xfs_mode_to_ftype(VFS_I(ip)->i_mode); + __entry->nlink = VFS_I(ip)->i_nlink; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + ), + TP_printk("dev %d:%d ino 0x%llx ftype %s nlink %u parents %u backrefs %u children %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __entry->nlink, + __entry->parents, + __entry->backrefs, + __entry->children) +); +#define DEFINE_SCRUB_NLINKS_DIFF_EVENT(name) \ +DEFINE_EVENT(xchk_nlinks_diff_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, \ + const struct xchk_nlink *live), \ + TP_ARGS(mp, ip, live)) +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode); + +DECLARE_EVENT_CLASS(xchk_pptr_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, + xfs_ino_t far_ino), + TP_ARGS(ip, name, far_ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + __field(xfs_ino_t, far_ino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->namelen = name->len; + memcpy(__get_str(name), name, name->len); + __entry->far_ino = far_ino; + ), + TP_printk("dev %d:%d ino 0x%llx name '%.*s' far_ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __get_str(name), + __entry->far_ino) +) +#define DEFINE_XCHK_PPTR_EVENT(name) \ +DEFINE_EVENT(xchk_pptr_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \ + xfs_ino_t far_ino), \ + TP_ARGS(ip, name, far_ino)) +DEFINE_XCHK_PPTR_EVENT(xchk_dir_defer); +DEFINE_XCHK_PPTR_EVENT(xchk_dir_slowpath); +DEFINE_XCHK_PPTR_EVENT(xchk_dir_ultraslowpath); +DEFINE_XCHK_PPTR_EVENT(xchk_parent_defer); +DEFINE_XCHK_PPTR_EVENT(xchk_parent_slowpath); +DEFINE_XCHK_PPTR_EVENT(xchk_parent_ultraslowpath); + +DECLARE_EVENT_CLASS(xchk_dirtree_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int path_nr, const struct xfs_name *name, + const struct xfs_parent_rec *pptr), + TP_ARGS(sc, ip, path_nr, name, pptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, path_nr) + __field(xfs_ino_t, child_ino) + __field(unsigned int, child_gen) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->path_nr = path_nr; + __entry->child_ino = ip->i_ino; + __entry->child_gen = VFS_I(ip)->i_generation; + __entry->parent_ino = be64_to_cpu(pptr->p_ino); + __entry->parent_gen = be32_to_cpu(pptr->p_gen); + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d path %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->path_nr, + __entry->child_ino, + __entry->child_gen, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +); +#define DEFINE_XCHK_DIRTREE_EVENT(name) \ +DEFINE_EVENT(xchk_dirtree_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \ + unsigned int path_nr, const struct xfs_name *name, \ + const struct xfs_parent_rec *pptr), \ + TP_ARGS(sc, ip, path_nr, name, pptr)) +DEFINE_XCHK_DIRTREE_EVENT(xchk_dirtree_create_path); +DEFINE_XCHK_DIRTREE_EVENT(xchk_dirpath_walk_upwards); + +DECLARE_EVENT_CLASS(xchk_dirpath_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int path_nr, unsigned int step_nr, + const struct xfs_name *name, + const struct xfs_parent_rec *pptr), + TP_ARGS(sc, ip, path_nr, step_nr, name, pptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, path_nr) + __field(unsigned int, step_nr) + __field(xfs_ino_t, child_ino) + __field(unsigned int, child_gen) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->path_nr = path_nr; + __entry->step_nr = step_nr; + __entry->child_ino = ip->i_ino; + __entry->child_gen = VFS_I(ip)->i_generation; + __entry->parent_ino = be64_to_cpu(pptr->p_ino); + __entry->parent_gen = be32_to_cpu(pptr->p_gen); + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d path %u step %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->path_nr, + __entry->step_nr, + __entry->child_ino, + __entry->child_gen, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +); +#define DEFINE_XCHK_DIRPATH_EVENT(name) \ +DEFINE_EVENT(xchk_dirpath_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \ + unsigned int path_nr, unsigned int step_nr, \ + const struct xfs_name *name, \ + const struct xfs_parent_rec *pptr), \ + TP_ARGS(sc, ip, path_nr, step_nr, name, pptr)) +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_disappeared); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_badgen); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_nondir_parent); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_unlinked_parent); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_found_next_step); + +TRACE_DEFINE_ENUM(XCHK_DIRPATH_SCANNING); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_DELETE); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_CORRUPT); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_LOOP); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_STALE); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_OK); +TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETING); +TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETED); +TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTING); +TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTED); + +#define XCHK_DIRPATH_OUTCOME_STRINGS \ + { XCHK_DIRPATH_SCANNING, "scanning" }, \ + { XCHK_DIRPATH_DELETE, "delete" }, \ + { XCHK_DIRPATH_CORRUPT, "corrupt" }, \ + { XCHK_DIRPATH_LOOP, "loop" }, \ + { XCHK_DIRPATH_STALE, "stale" }, \ + { XCHK_DIRPATH_OK, "ok" }, \ + { XREP_DIRPATH_DELETING, "deleting" }, \ + { XREP_DIRPATH_DELETED, "deleted" }, \ + { XREP_DIRPATH_ADOPTING, "adopting" }, \ + { XREP_DIRPATH_ADOPTED, "adopted" } + +DECLARE_EVENT_CLASS(xchk_dirpath_outcome_class, + TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr, + unsigned int nr_steps, \ + unsigned int outcome), + TP_ARGS(sc, path_nr, nr_steps, outcome), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long long, path_nr) + __field(unsigned int, nr_steps) + __field(unsigned int, outcome) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->path_nr = path_nr; + __entry->nr_steps = nr_steps; + __entry->outcome = outcome; + ), + TP_printk("dev %d:%d path %llu steps %u outcome %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->path_nr, + __entry->nr_steps, + __print_symbolic(__entry->outcome, XCHK_DIRPATH_OUTCOME_STRINGS)) +); +#define DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(name) \ +DEFINE_EVENT(xchk_dirpath_outcome_class, name, \ + TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr, \ + unsigned int nr_steps, \ + unsigned int outcome), \ + TP_ARGS(sc, path_nr, nr_steps, outcome)) +DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_set_outcome); +DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_evaluate_path); + +DECLARE_EVENT_CLASS(xchk_dirtree_evaluate_class, + TP_PROTO(const struct xchk_dirtree *dl, + const struct xchk_dirtree_outcomes *oc), + TP_ARGS(dl, oc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, rootino) + __field(unsigned int, nr_paths) + __field(unsigned int, bad) + __field(unsigned int, suspect) + __field(unsigned int, good) + __field(bool, needs_adoption) + ), + TP_fast_assign( + __entry->dev = dl->sc->mp->m_super->s_dev; + __entry->ino = dl->sc->ip->i_ino; + __entry->rootino = dl->root_ino; + __entry->nr_paths = dl->nr_paths; + __entry->bad = oc->bad; + __entry->suspect = oc->suspect; + __entry->good = oc->good; + __entry->needs_adoption = oc->needs_adoption ? 1 : 0; + ), + TP_printk("dev %d:%d ino 0x%llx rootino 0x%llx nr_paths %u bad %u suspect %u good %u adopt? %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->rootino, + __entry->nr_paths, + __entry->bad, + __entry->suspect, + __entry->good, + __entry->needs_adoption) +); +#define DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(name) \ +DEFINE_EVENT(xchk_dirtree_evaluate_class, name, \ + TP_PROTO(const struct xchk_dirtree *dl, \ + const struct xchk_dirtree_outcomes *oc), \ + TP_ARGS(dl, oc)) +DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xchk_dirtree_evaluate); + +TRACE_EVENT(xchk_dirpath_changed, + TP_PROTO(struct xfs_scrub *sc, unsigned int path_nr, + unsigned int step_nr, const struct xfs_inode *dp, + const struct xfs_inode *ip, const struct xfs_name *xname), + TP_ARGS(sc, path_nr, step_nr, dp, ip, xname), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, path_nr) + __field(unsigned int, step_nr) + __field(xfs_ino_t, child_ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, xname->len) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->path_nr = path_nr; + __entry->step_nr = step_nr; + __entry->child_ino = ip->i_ino; + __entry->parent_ino = dp->i_ino; + __entry->namelen = xname->len; + memcpy(__get_str(name), xname->name, xname->len); + ), + TP_printk("dev %d:%d path %u step %u child_ino 0x%llx parent_ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->path_nr, + __entry->step_nr, + __entry->child_ino, + __entry->parent_ino, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_dirtree_live_update, + TP_PROTO(struct xfs_scrub *sc, const struct xfs_inode *dp, + int action, const struct xfs_inode *ip, int delta, + const struct xfs_name *xname), + TP_ARGS(sc, dp, action, ip, delta, xname), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, parent_ino) + __field(int, action) + __field(xfs_ino_t, child_ino) + __field(int, delta) + __field(unsigned int, namelen) + __dynamic_array(char, name, xname->len) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->parent_ino = dp->i_ino; + __entry->action = action; + __entry->child_ino = ip->i_ino; + __entry->delta = delta; + __entry->namelen = xname->len; + memcpy(__get_str(name), xname->name, xname->len); + ), + TP_printk("dev %d:%d parent_ino 0x%llx child_ino 0x%llx nlink_delta %d name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->parent_ino, + __entry->child_ino, + __entry->delta, + __entry->namelen, + __get_str(name)) +); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) @@ -1098,6 +1945,7 @@ DEFINE_EVENT(xrep_extent_class, name, \ DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); +DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); DECLARE_EVENT_CLASS(xrep_reap_find_class, @@ -1131,6 +1979,7 @@ DEFINE_EVENT(xrep_reap_find_class, name, \ bool crosslinked), \ TP_ARGS(pag, agbno, len, crosslinked)) DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select); +DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select); DECLARE_EVENT_CLASS(xrep_rmap_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -1170,37 +2019,156 @@ DEFINE_EVENT(xrep_rmap_class, name, \ xfs_agblock_t agbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn); -DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn); -DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); -DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); +DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap); -TRACE_EVENT(xrep_refcount_extent_fn, +TRACE_EVENT(xrep_abt_found, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_refcount_irec *irec), - TP_ARGS(mp, agno, irec), + const struct xfs_alloc_rec_incore *rec), + TP_ARGS(mp, agno, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) - __field(xfs_nlink_t, refcount) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->startblock = irec->rc_startblock; - __entry->blockcount = irec->rc_blockcount; - __entry->refcount = irec->rc_refcount; + __entry->startblock = rec->ar_startblock; + __entry->blockcount = rec->ar_blockcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, + __entry->blockcount) +) + +TRACE_EVENT(xrep_ibt_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + const struct xfs_inobt_rec_incore *rec), + TP_ARGS(mp, agno, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(uint16_t, holemask) + __field(uint8_t, count) + __field(uint8_t, freecount) + __field(uint64_t, freemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = rec->ir_startino; + __entry->holemask = rec->ir_holemask; + __entry->count = rec->ir_count; + __entry->freecount = rec->ir_freecount; + __entry->freemask = rec->ir_free; + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x count 0x%x freecount 0x%x freemask 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->holemask, + __entry->count, + __entry->freecount, + __entry->freemask) +) + +TRACE_EVENT(xrep_refc_found, + TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec), + TP_ARGS(pag, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->domain = rec->rc_domain; + __entry->startblock = rec->rc_startblock; + __entry->blockcount = rec->rc_blockcount; + __entry->refcount = rec->rc_refcount; + ), + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), + __entry->startblock, __entry->blockcount, __entry->refcount) ) +TRACE_EVENT(xrep_bmap_found, + TP_PROTO(struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *irec), + TP_ARGS(ip, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, lblk) + __field(xfs_filblks_t, len) + __field(xfs_fsblock_t, pblk) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->whichfork = whichfork; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->lblk, + __entry->len, + __entry->pblk, + __entry->state) +); + +TRACE_EVENT(xrep_rmap_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + const struct xfs_rmap_irec *rec), + TP_ARGS(mp, agno, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = rec->rm_startblock; + __entry->len = rec->rm_blockcount; + __entry->owner = rec->rm_owner; + __entry->offset = rec->rm_offset; + __entry->flags = rec->rm_flags; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); + TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, uint32_t magic, uint16_t level), @@ -1285,51 +2253,1325 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize, __entry->refcbt_sz) ) TRACE_EVENT(xrep_reset_counters, - TP_PROTO(struct xfs_mount *mp), - TP_ARGS(mp), + TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc), + TP_ARGS(mp, fsc), TP_STRUCT__entry( __field(dev_t, dev) + __field(uint64_t, icount) + __field(uint64_t, ifree) + __field(uint64_t, fdblocks) + __field(uint64_t, frextents) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; + __entry->icount = fsc->icount; + __entry->ifree = fsc->ifree; + __entry->fdblocks = fsc->fdblocks; + __entry->frextents = fsc->frextents; ), - TP_printk("dev %d:%d", - MAJOR(__entry->dev), MINOR(__entry->dev)) + TP_printk("dev %d:%d icount %llu ifree %llu fdblocks %llu frextents %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->icount, + __entry->ifree, + __entry->fdblocks, + __entry->frextents) ) -TRACE_EVENT(xrep_ialloc_insert, +DECLARE_EVENT_CLASS(xrep_newbt_extent_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t startino, uint16_t holemask, uint8_t count, - uint8_t freecount, uint64_t freemask), - TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask), + xfs_agblock_t agbno, xfs_extlen_t len, + int64_t owner), + TP_ARGS(mp, agno, agbno, len, owner), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) - __field(xfs_agino_t, startino) - __field(uint16_t, holemask) - __field(uint8_t, count) - __field(uint8_t, freecount) - __field(uint64_t, freemask) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int64_t, owner) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->startino = startino; - __entry->holemask = holemask; - __entry->count = count; - __entry->freecount = freecount; - __entry->freemask = freemask; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; ), - TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->startino, - __entry->holemask, - __entry->count, - __entry->freecount, - __entry->freemask) + __entry->agbno, + __entry->len, + __entry->owner) +); +#define DEFINE_NEWBT_EXTENT_EVENT(name) \ +DEFINE_EVENT(xrep_newbt_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + int64_t owner), \ + TP_ARGS(mp, agno, agbno, len, owner)) +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block); + +DECLARE_EVENT_CLASS(xrep_dinode_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), + TP_ARGS(sc, dip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(uint16_t, mode) + __field(uint8_t, version) + __field(uint8_t, format) + __field(uint32_t, uid) + __field(uint32_t, gid) + __field(uint64_t, size) + __field(uint64_t, nblocks) + __field(uint32_t, extsize) + __field(uint32_t, nextents) + __field(uint16_t, anextents) + __field(uint8_t, forkoff) + __field(uint8_t, aformat) + __field(uint16_t, flags) + __field(uint32_t, gen) + __field(uint64_t, flags2) + __field(uint32_t, cowextsize) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->mode = be16_to_cpu(dip->di_mode); + __entry->version = dip->di_version; + __entry->format = dip->di_format; + __entry->uid = be32_to_cpu(dip->di_uid); + __entry->gid = be32_to_cpu(dip->di_gid); + __entry->size = be64_to_cpu(dip->di_size); + __entry->nblocks = be64_to_cpu(dip->di_nblocks); + __entry->extsize = be32_to_cpu(dip->di_extsize); + __entry->nextents = be32_to_cpu(dip->di_nextents); + __entry->anextents = be16_to_cpu(dip->di_anextents); + __entry->forkoff = dip->di_forkoff; + __entry->aformat = dip->di_aformat; + __entry->flags = be16_to_cpu(dip->di_flags); + __entry->gen = be32_to_cpu(dip->di_gen); + __entry->flags2 = be64_to_cpu(dip->di_flags2); + __entry->cowextsize = be32_to_cpu(dip->di_cowextsize); + ), + TP_printk("dev %d:%d ino 0x%llx mode 0x%x version %u format %u uid %u gid %u disize 0x%llx nblocks 0x%llx extsize %u nextents %u anextents %u forkoff 0x%x aformat %u flags 0x%x gen 0x%x flags2 0x%llx cowextsize %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->mode, + __entry->version, + __entry->format, + __entry->uid, + __entry->gid, + __entry->size, + __entry->nblocks, + __entry->extsize, + __entry->nextents, + __entry->anextents, + __entry->forkoff, + __entry->aformat, + __entry->flags, + __entry->gen, + __entry->flags2, + __entry->cowextsize) +) + +#define DEFINE_REPAIR_DINODE_EVENT(name) \ +DEFINE_EVENT(xrep_dinode_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), \ + TP_ARGS(sc, dip)) +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_header); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_mode); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_flags); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_size); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_extsize_hints); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_symlink); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dir); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_fixed); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_forks); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dfork); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_afork); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_ensure_forkoff); + +DECLARE_EVENT_CLASS(xrep_inode_class, + TP_PROTO(struct xfs_scrub *sc), + TP_ARGS(sc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_rfsblock_t, nblocks) + __field(uint16_t, flags) + __field(uint64_t, flags2) + __field(uint32_t, nextents) + __field(uint8_t, format) + __field(uint32_t, anextents) + __field(uint8_t, aformat) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->size = sc->ip->i_disk_size; + __entry->nblocks = sc->ip->i_nblocks; + __entry->flags = sc->ip->i_diflags; + __entry->flags2 = sc->ip->i_diflags2; + __entry->nextents = sc->ip->i_df.if_nextents; + __entry->format = sc->ip->i_df.if_format; + __entry->anextents = sc->ip->i_af.if_nextents; + __entry->aformat = sc->ip->i_af.if_format; + ), + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx nblocks 0x%llx flags 0x%x flags2 0x%llx nextents %u format %u anextents %u aformat %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->nblocks, + __entry->flags, + __entry->flags2, + __entry->nextents, + __entry->format, + __entry->anextents, + __entry->aformat) ) +#define DEFINE_REPAIR_INODE_EVENT(name) \ +DEFINE_EVENT(xrep_inode_class, name, \ + TP_PROTO(struct xfs_scrub *sc), \ + TP_ARGS(sc)) +DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockcounts); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_ids); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_flags); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockdir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_sfdir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_dir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_fixed); + +TRACE_EVENT(xrep_dinode_count_rmaps, + TP_PROTO(struct xfs_scrub *sc, xfs_rfsblock_t data_blocks, + xfs_rfsblock_t rt_blocks, xfs_rfsblock_t attr_blocks, + xfs_extnum_t data_extents, xfs_extnum_t rt_extents, + xfs_aextnum_t attr_extents), + TP_ARGS(sc, data_blocks, rt_blocks, attr_blocks, data_extents, + rt_extents, attr_extents), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_rfsblock_t, data_blocks) + __field(xfs_rfsblock_t, rt_blocks) + __field(xfs_rfsblock_t, attr_blocks) + __field(xfs_extnum_t, data_extents) + __field(xfs_extnum_t, rt_extents) + __field(xfs_aextnum_t, attr_extents) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->data_blocks = data_blocks; + __entry->rt_blocks = rt_blocks; + __entry->attr_blocks = attr_blocks; + __entry->data_extents = data_extents; + __entry->rt_extents = rt_extents; + __entry->attr_extents = attr_extents; + ), + TP_printk("dev %d:%d ino 0x%llx dblocks 0x%llx rtblocks 0x%llx ablocks 0x%llx dextents %llu rtextents %llu aextents %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->data_blocks, + __entry->rt_blocks, + __entry->attr_blocks, + __entry->data_extents, + __entry->rt_extents, + __entry->attr_extents) +); + +TRACE_EVENT(xrep_dinode_findmode_dirent, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp, + unsigned int ftype), + TP_ARGS(sc, dp, ftype), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, ftype) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->parent_ino = dp->i_ino; + __entry->ftype = ftype; + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR)) +); + +TRACE_EVENT(xrep_dinode_findmode_dirent_inval, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp, + unsigned int ftype, unsigned int found_ftype), + TP_ARGS(sc, dp, ftype, found_ftype), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, ftype) + __field(unsigned int, found_ftype) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->parent_ino = dp->i_ino; + __entry->ftype = ftype; + __entry->found_ftype = found_ftype; + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s' found_ftype '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __print_symbolic(__entry->found_ftype, XFS_DIR3_FTYPE_STR)) +); + +TRACE_EVENT(xrep_cow_mark_file_range, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock, + xfs_fileoff_t startoff, xfs_filblks_t blockcount), + TP_ARGS(ip, startblock, startoff, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = startoff; + __entry->startblock = startblock; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount) +); + +TRACE_EVENT(xrep_cow_replace_mapping, + TP_PROTO(struct xfs_inode *ip, const struct xfs_bmbt_irec *irec, + xfs_fsblock_t new_startblock, xfs_extlen_t new_blockcount), + TP_ARGS(ip, irec, new_startblock, new_blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(xfs_fsblock_t, new_startblock) + __field(xfs_extlen_t, new_blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = irec->br_startoff; + __entry->startblock = irec->br_startblock; + __entry->blockcount = irec->br_blockcount; + __entry->state = irec->br_state; + __entry->new_startblock = new_startblock; + __entry->new_blockcount = new_blockcount; + ), + TP_printk("dev %d:%d ino 0x%llx startoff 0x%llx startblock 0x%llx fsbcount 0x%llx state 0x%x new_startblock 0x%llx new_fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->new_startblock, + __entry->new_blockcount) +); + +TRACE_EVENT(xrep_cow_free_staging, + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, + xfs_extlen_t blockcount), + TP_ARGS(pag, agbno, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, blockcount) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agbno = agbno; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->blockcount) +); + +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xrep_dquot_class, + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), + TP_ARGS(mp, type, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint8_t, type) + __field(uint32_t, id) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->id = id; + __entry->type = type; + ), + TP_printk("dev %d:%d type %s id 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), + __entry->id) +); + +#define DEFINE_XREP_DQUOT_EVENT(name) \ +DEFINE_EVENT(xrep_dquot_class, name, \ + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), \ + TP_ARGS(mp, type, id)) +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item); +DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot); +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole); +DEFINE_XREP_DQUOT_EVENT(xrep_quotacheck_dquot); +#endif /* CONFIG_XFS_QUOTA */ + +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode); +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode); + +TRACE_EVENT(xrep_rmap_live_update, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int op, + const struct xfs_rmap_update_params *p), + TP_ARGS(mp, agno, op, p), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, op) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->op = op; + __entry->agbno = p->startblock; + __entry->len = p->blockcount; + xfs_owner_info_unpack(&p->oinfo, &__entry->owner, + &__entry->offset, &__entry->flags); + if (p->unwritten) + __entry->flags |= XFS_RMAP_UNWRITTEN; + ), + TP_printk("dev %d:%d agno 0x%x op %d agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->op, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); + +TRACE_EVENT(xrep_tempfile_create, + TP_PROTO(struct xfs_scrub *sc), + TP_ARGS(sc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, inum) + __field(unsigned int, gen) + __field(unsigned int, flags) + __field(xfs_ino_t, temp_inum) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->file ? XFS_I(file_inode(sc->file))->i_ino : 0; + __entry->type = sc->sm->sm_type; + __entry->agno = sc->sm->sm_agno; + __entry->inum = sc->sm->sm_ino; + __entry->gen = sc->sm->sm_gen; + __entry->flags = sc->sm->sm_flags; + __entry->temp_inum = sc->tempip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx type %s inum 0x%llx gen 0x%x flags 0x%x temp_inum 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->inum, + __entry->gen, + __entry->flags, + __entry->temp_inum) +); + +DECLARE_EVENT_CLASS(xrep_tempfile_class, + TP_PROTO(struct xfs_scrub *sc, int whichfork, + struct xfs_bmbt_irec *irec), + TP_ARGS(sc, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, lblk) + __field(xfs_filblks_t, len) + __field(xfs_fsblock_t, pblk) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->tempip->i_ino; + __entry->whichfork = whichfork; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->lblk, + __entry->len, + __entry->pblk, + __entry->state) +); +#define DEFINE_XREP_TEMPFILE_EVENT(name) \ +DEFINE_EVENT(xrep_tempfile_class, name, \ + TP_PROTO(struct xfs_scrub *sc, int whichfork, \ + struct xfs_bmbt_irec *irec), \ + TP_ARGS(sc, whichfork, irec)) +DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_prealloc); +DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_copyin); + +TRACE_EVENT(xreap_ifork_extent, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork, + const struct xfs_bmbt_irec *irec), + TP_ARGS(sc, ip, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, fileoff) + __field(xfs_filblks_t, len) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->whichfork = whichfork; + __entry->fileoff = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock); + __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock); + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ip 0x%llx whichfork %s agno 0x%x agbno 0x%x fileoff 0x%llx fsbcount 0x%llx state 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->agno, + __entry->agbno, + __entry->fileoff, + __entry->len, + __entry->state) +); + +TRACE_EVENT(xreap_bmapi_binval_scan, + TP_PROTO(struct xfs_scrub *sc, const struct xfs_bmbt_irec *irec, + xfs_extlen_t scan_blocks), + TP_ARGS(sc, irec, scan_blocks), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_filblks_t, len) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, scan_blocks) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->len = irec->br_blockcount; + __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock); + __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock); + __entry->scan_blocks = scan_blocks; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%llx scan_blocks 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->scan_blocks) +); + +TRACE_EVENT(xrep_xattr_recover_leafblock, + TP_PROTO(struct xfs_inode *ip, xfs_dablk_t dabno, uint16_t magic), + TP_ARGS(ip, dabno, magic), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_dablk_t, dabno) + __field(uint16_t, magic) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->dabno = dabno; + __entry->magic = magic; + ), + TP_printk("dev %d:%d ino 0x%llx dablk 0x%x magic 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->dabno, + __entry->magic) +); + +DECLARE_EVENT_CLASS(xrep_xattr_salvage_class, + TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name, + unsigned int namelen, unsigned int valuelen), + TP_ARGS(ip, flags, name, namelen, valuelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, flags) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + __field(unsigned int, valuelen) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->flags = flags; + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + __entry->valuelen = valuelen; + ), + TP_printk("dev %d:%d ino 0x%llx flags %s name '%.*s' valuelen 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->flags, "|", XFS_ATTR_NAMESPACE_STR), + __entry->namelen, + __get_str(name), + __entry->valuelen) +); +#define DEFINE_XREP_XATTR_SALVAGE_EVENT(name) \ +DEFINE_EVENT(xrep_xattr_salvage_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name, \ + unsigned int namelen, unsigned int valuelen), \ + TP_ARGS(ip, flags, name, namelen, valuelen)) +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_salvage_rec); +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_insert_rec); +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_stash_xattr); +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_insert_xattr); + +DECLARE_EVENT_CLASS(xrep_pptr_salvage_class, + TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name, + unsigned int namelen, const void *value, unsigned int valuelen), + TP_ARGS(ip, flags, name, namelen, value, valuelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + ), + TP_fast_assign( + const struct xfs_parent_rec *rec = value; + + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->parent_ino = be64_to_cpu(rec->p_ino); + __entry->parent_gen = be32_to_cpu(rec->p_gen); + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +) +#define DEFINE_XREP_PPTR_SALVAGE_EVENT(name) \ +DEFINE_EVENT(xrep_pptr_salvage_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name, \ + unsigned int namelen, const void *value, unsigned int valuelen), \ + TP_ARGS(ip, flags, name, namelen, value, valuelen)) +DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_salvage_pptr); +DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_insert_pptr); + +TRACE_EVENT(xrep_xattr_class, + TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip), + TP_ARGS(ip, arg_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, src_ino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->src_ino = arg_ip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx src 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->src_ino) +) +#define DEFINE_XREP_XATTR_EVENT(name) \ +DEFINE_EVENT(xrep_xattr_class, name, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip), \ + TP_ARGS(ip, arg_ip)) +DEFINE_XREP_XATTR_EVENT(xrep_xattr_rebuild_tree); +DEFINE_XREP_XATTR_EVENT(xrep_xattr_reset_fork); +DEFINE_XREP_XATTR_EVENT(xrep_xattr_full_reset); + +DECLARE_EVENT_CLASS(xrep_xattr_pptr_scan_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, + const struct xfs_name *name), + TP_ARGS(ip, dp, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->parent_ino = dp->i_ino; + __entry->parent_gen = VFS_IC(dp)->i_generation; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +) +#define DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(name) \ +DEFINE_EVENT(xrep_xattr_pptr_scan_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \ + const struct xfs_name *name), \ + TP_ARGS(ip, dp, name)) +DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentadd); +DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentremove); + +TRACE_EVENT(xrep_dir_recover_dirblock, + TP_PROTO(struct xfs_inode *dp, xfs_dablk_t dabno, uint32_t magic, + uint32_t magic_guess), + TP_ARGS(dp, dabno, magic, magic_guess), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(xfs_dablk_t, dabno) + __field(uint32_t, magic) + __field(uint32_t, magic_guess) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->dabno = dabno; + __entry->magic = magic; + __entry->magic_guess = magic_guess; + ), + TP_printk("dev %d:%d dir 0x%llx dablk 0x%x magic 0x%x magic_guess 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __entry->dabno, + __entry->magic, + __entry->magic_guess) +); + +DECLARE_EVENT_CLASS(xrep_dir_class, + TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino), + TP_ARGS(dp, parent_ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(xfs_ino_t, parent_ino) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->parent_ino = parent_ino; + ), + TP_printk("dev %d:%d dir 0x%llx parent 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __entry->parent_ino) +) +#define DEFINE_XREP_DIR_EVENT(name) \ +DEFINE_EVENT(xrep_dir_class, name, \ + TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino), \ + TP_ARGS(dp, parent_ino)) +DEFINE_XREP_DIR_EVENT(xrep_dir_rebuild_tree); +DEFINE_XREP_DIR_EVENT(xrep_dir_reset_fork); +DEFINE_XREP_DIR_EVENT(xrep_parent_reset_dotdot); + +DECLARE_EVENT_CLASS(xrep_dirent_class, + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name, + xfs_ino_t ino), + TP_ARGS(dp, name, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + __field(xfs_ino_t, ino) + __field(uint8_t, ftype) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + __entry->ino = ino; + __entry->ftype = name->type; + ), + TP_printk("dev %d:%d dir 0x%llx ftype %s name '%.*s' ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __entry->namelen, + __get_str(name), + __entry->ino) +) +#define DEFINE_XREP_DIRENT_EVENT(name) \ +DEFINE_EVENT(xrep_dirent_class, name, \ + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name, \ + xfs_ino_t ino), \ + TP_ARGS(dp, name, ino)) +DEFINE_XREP_DIRENT_EVENT(xrep_dir_salvage_entry); +DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_createname); +DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_createname); +DEFINE_XREP_DIRENT_EVENT(xrep_adoption_reparent); +DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_removename); +DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_removename); + +DECLARE_EVENT_CLASS(xrep_adoption_class, + TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved), + TP_ARGS(dp, ip, moved), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(xfs_ino_t, child_ino) + __field(bool, moved) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->child_ino = ip->i_ino; + __entry->moved = moved; + ), + TP_printk("dev %d:%d dir 0x%llx child 0x%llx moved? %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __entry->child_ino, + __entry->moved) +); +#define DEFINE_XREP_ADOPTION_EVENT(name) \ +DEFINE_EVENT(xrep_adoption_class, name, \ + TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved), \ + TP_ARGS(dp, ip, moved)) +DEFINE_XREP_ADOPTION_EVENT(xrep_adoption_trans_roll); + +DECLARE_EVENT_CLASS(xrep_parent_salvage_class, + TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino), + TP_ARGS(dp, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->ino = ino; + ), + TP_printk("dev %d:%d dir 0x%llx parent 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __entry->ino) +) +#define DEFINE_XREP_PARENT_SALVAGE_EVENT(name) \ +DEFINE_EVENT(xrep_parent_salvage_class, name, \ + TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino), \ + TP_ARGS(dp, ino)) +DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_dir_salvaged_parent); +DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_dirent); +DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_from_dcache); + +DECLARE_EVENT_CLASS(xrep_pptr_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, + const struct xfs_parent_rec *pptr), + TP_ARGS(ip, name, pptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->parent_ino = be64_to_cpu(pptr->p_ino); + __entry->parent_gen = be32_to_cpu(pptr->p_gen); + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +) +#define DEFINE_XREP_PPTR_EVENT(name) \ +DEFINE_EVENT(xrep_pptr_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \ + const struct xfs_parent_rec *pptr), \ + TP_ARGS(ip, name, pptr)) +DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentadd); +DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentremove); +DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentadd); +DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentremove); + +DECLARE_EVENT_CLASS(xrep_pptr_scan_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, + const struct xfs_name *name), + TP_ARGS(ip, dp, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->parent_ino = dp->i_ino; + __entry->parent_gen = VFS_IC(dp)->i_generation; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +) +#define DEFINE_XREP_PPTR_SCAN_EVENT(name) \ +DEFINE_EVENT(xrep_pptr_scan_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \ + const struct xfs_name *name), \ + TP_ARGS(ip, dp, name)) +DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentadd); +DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentremove); + +TRACE_EVENT(xrep_nlinks_set_record, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *obs), + TP_ARGS(mp, ino, obs), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parents = obs->parents; + __entry->backrefs = obs->backrefs; + __entry->children = obs->children; + ), + TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parents, + __entry->backrefs, + __entry->children) +); + +DECLARE_EVENT_CLASS(xrep_dentry_class, + TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry), + TP_ARGS(mp, dentry), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, flags) + __field(unsigned long, ino) + __field(bool, positive) + __field(unsigned long, parent_ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, dentry->d_name.len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->flags = dentry->d_flags; + __entry->positive = d_is_positive(dentry); + if (dentry->d_parent && d_inode(dentry->d_parent)) + __entry->parent_ino = d_inode(dentry->d_parent)->i_ino; + else + __entry->parent_ino = -1UL; + __entry->ino = d_inode(dentry) ? d_inode(dentry)->i_ino : 0; + __entry->namelen = dentry->d_name.len; + memcpy(__get_str(name), dentry->d_name.name, dentry->d_name.len); + ), + TP_printk("dev %d:%d flags 0x%x positive? %d parent_ino 0x%lx ino 0x%lx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->flags, + __entry->positive, + __entry->parent_ino, + __entry->ino, + __entry->namelen, + __get_str(name)) +); +#define DEFINE_REPAIR_DENTRY_EVENT(name) \ +DEFINE_EVENT(xrep_dentry_class, name, \ + TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry), \ + TP_ARGS(mp, dentry)) +DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_check_child); +DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_invalidate_child); +DEFINE_REPAIR_DENTRY_EVENT(xrep_dirtree_delete_child); + +TRACE_EVENT(xrep_symlink_salvage_target, + TP_PROTO(struct xfs_inode *ip, char *target, unsigned int targetlen), + TP_ARGS(ip, target, targetlen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, targetlen) + __dynamic_array(char, target, targetlen + 1) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->targetlen = targetlen; + memcpy(__get_str(target), target, targetlen); + __get_str(target)[targetlen] = 0; + ), + TP_printk("dev %d:%d ip 0x%llx target '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->targetlen, + __get_str(target)) +); + +DECLARE_EVENT_CLASS(xrep_symlink_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + ), + TP_printk("dev %d:%d ip 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +); + +#define DEFINE_XREP_SYMLINK_EVENT(name) \ +DEFINE_EVENT(xrep_symlink_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_rebuild); +DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_reset_fork); + +TRACE_EVENT(xrep_iunlink_visit, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t bucket_agino, struct xfs_inode *ip), + TP_ARGS(pag, bucket, bucket_agino, ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(unsigned int, bucket) + __field(xfs_agino_t, bucket_agino) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agino = XFS_INO_TO_AGINO(pag->pag_mount, ip->i_ino); + __entry->bucket = bucket; + __entry->bucket_agino = bucket_agino; + __entry->prev_agino = ip->i_prev_unlinked; + __entry->next_agino = ip->i_next_unlinked; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x bucket_agino 0x%x prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->agino, + __entry->bucket_agino, + __entry->prev_agino, + __entry->next_agino) +); + +TRACE_EVENT(xrep_iunlink_reload_next, + TP_PROTO(struct xfs_inode *ip, xfs_agino_t prev_agino), + TP_ARGS(ip, prev_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, old_prev_agino) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + __field(unsigned int, nlink) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->old_prev_agino = ip->i_prev_unlinked; + __entry->prev_agino = prev_agino; + __entry->next_agino = ip->i_next_unlinked; + __entry->nlink = VFS_I(ip)->i_nlink; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x nlink %u old_prev_agino %u prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->nlink, + __entry->old_prev_agino, + __entry->prev_agino, + __entry->next_agino) +); + +TRACE_EVENT(xrep_iunlink_reload_ondisk, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(unsigned int, nlink) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->nlink = VFS_I(ip)->i_nlink; + __entry->next_agino = ip->i_next_unlinked; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x nlink %u next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->nlink, + __entry->next_agino) +); + +TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t prev_agino, xfs_agino_t next_agino), + TP_ARGS(pag, bucket, prev_agino, next_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->bucket = bucket; + __entry->prev_agino = prev_agino; + __entry->next_agino = next_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->prev_agino, + __entry->next_agino) +); + +DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t prev_agino, xfs_agino_t next_agino), + TP_ARGS(pag, bucket, prev_agino, next_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->bucket = bucket; + __entry->prev_agino = prev_agino; + __entry->next_agino = next_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->prev_agino, + __entry->next_agino) +); +#define DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(name) \ +DEFINE_EVENT(xrep_iunlink_resolve_class, name, \ + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, \ + xfs_agino_t prev_agino, xfs_agino_t next_agino), \ + TP_ARGS(pag, bucket, prev_agino, next_agino)) +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_uncached); +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_wronglist); +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_nolist); +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_ok); + +TRACE_EVENT(xrep_iunlink_relink_next, + TP_PROTO(struct xfs_inode *ip, xfs_agino_t next_agino), + TP_ARGS(ip, next_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, next_agino) + __field(xfs_agino_t, new_next_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->next_agino = ip->i_next_unlinked; + __entry->new_next_agino = next_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x -> 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->next_agino, + __entry->new_next_agino) +); + +TRACE_EVENT(xrep_iunlink_relink_prev, + TP_PROTO(struct xfs_inode *ip, xfs_agino_t prev_agino), + TP_ARGS(ip, prev_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, new_prev_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->prev_agino = ip->i_prev_unlinked; + __entry->new_prev_agino = prev_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x prev_agino 0x%x -> 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->prev_agino, + __entry->new_prev_agino) +); + +TRACE_EVENT(xrep_iunlink_add_to_bucket, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t agino, xfs_agino_t curr_head), + TP_ARGS(pag, bucket, agino, curr_head), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->bucket = bucket; + __entry->agino = agino; + __entry->next_agino = curr_head; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->agino, + __entry->next_agino) +); + +TRACE_EVENT(xrep_iunlink_commit_bucket, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t old_agino, xfs_agino_t agino), + TP_ARGS(pag, bucket, old_agino, agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, old_agino) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->bucket = bucket; + __entry->old_agino = old_agino; + __entry->agino = agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x -> 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->old_agino, + __entry->agino) +); + +DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xrep_dirpath_set_outcome); +DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_delete_path); +DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_create_adoption); +DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xrep_dirtree_decided_fate); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index f0f532c10a5a..cdd13ed9c569 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -7,16 +7,16 @@ #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" -#include "scrub/scrub.h" #include "scrub/trace.h" /* * Large Arrays of Fixed-Size Records * ================================== * - * This memory array uses an xfile (which itself is a memfd "file") to store + * This memory array uses an xfile (which itself is a shmem file) to store * large numbers of fixed-size records in memory that can be paged out. This * puts less stress on the memory reclaim algorithms during an online repair * because we don't have to pin so much memory. However, array access is less @@ -136,7 +136,7 @@ xfarray_load( if (idx >= array->nr) return -ENODATA; - return xfile_obj_load(array->xfile, ptr, array->obj_size, + return xfile_load(array->xfile, ptr, array->obj_size, xfarray_pos(array, idx)); } @@ -152,7 +152,7 @@ xfarray_is_unset( if (array->unset_slots == 0) return false; - error = xfile_obj_load(array->xfile, temp, array->obj_size, pos); + error = xfile_load(array->xfile, temp, array->obj_size, pos); if (!error && xfarray_element_is_null(array, temp)) return true; @@ -184,7 +184,7 @@ xfarray_unset( return 0; memset(temp, 0, array->obj_size); - error = xfile_obj_store(array->xfile, temp, array->obj_size, pos); + error = xfile_store(array->xfile, temp, array->obj_size, pos); if (error) return error; @@ -209,7 +209,7 @@ xfarray_store( ASSERT(!xfarray_element_is_null(array, ptr)); - ret = xfile_obj_store(array->xfile, ptr, array->obj_size, + ret = xfile_store(array->xfile, ptr, array->obj_size, xfarray_pos(array, idx)); if (ret) return ret; @@ -245,12 +245,12 @@ xfarray_store_anywhere( for (pos = 0; pos < endpos && array->unset_slots > 0; pos += array->obj_size) { - error = xfile_obj_load(array->xfile, temp, array->obj_size, + error = xfile_load(array->xfile, temp, array->obj_size, pos); if (error || !xfarray_element_is_null(array, temp)) continue; - error = xfile_obj_store(array->xfile, ptr, array->obj_size, + error = xfile_store(array->xfile, ptr, array->obj_size, pos); if (error) return error; @@ -486,6 +486,9 @@ xfarray_sortinfo_alloc( xfarray_sortinfo_lo(si)[0] = 0; xfarray_sortinfo_hi(si)[0] = array->nr - 1; + si->relax = INIT_XCHK_RELAX; + if (flags & XFARRAY_SORT_KILLABLE) + si->relax.interruptible = false; trace_xfarray_sort(si, nr_bytes); *infop = si; @@ -503,10 +506,7 @@ xfarray_sort_terminated( * few seconds so that we don't run afoul of the soft lockup watchdog * or RCU stall detector. */ - cond_resched(); - - if ((si->flags & XFARRAY_SORT_KILLABLE) && - fatal_signal_pending(current)) { + if (xchk_maybe_relax(&si->relax)) { if (*error == 0) *error = -EINTR; return true; @@ -552,7 +552,7 @@ xfarray_isort( trace_xfarray_isort(si, lo, hi); xfarray_sort_bump_loads(si); - error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos); + error = xfile_load(si->array->xfile, scratch, len, lo_pos); if (error) return error; @@ -560,88 +560,45 @@ xfarray_isort( sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); xfarray_sort_bump_stores(si); - return xfile_obj_store(si->array->xfile, scratch, len, lo_pos); -} - -/* Grab a page for sorting records. */ -static inline int -xfarray_sort_get_page( - struct xfarray_sortinfo *si, - loff_t pos, - uint64_t len) -{ - int error; - - error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage); - if (error) - return error; - - /* - * xfile pages must never be mapped into userspace, so we skip the - * dcache flush when mapping the page. - */ - si->page_kaddr = kmap_local_page(si->xfpage.page); - return 0; -} - -/* Release a page we grabbed for sorting records. */ -static inline int -xfarray_sort_put_page( - struct xfarray_sortinfo *si) -{ - if (!si->page_kaddr) - return 0; - - kunmap_local(si->page_kaddr); - si->page_kaddr = NULL; - - return xfile_put_page(si->array->xfile, &si->xfpage); + return xfile_store(si->array->xfile, scratch, len, lo_pos); } -/* Decide if these records are eligible for in-page sorting. */ -static inline bool -xfarray_want_pagesort( - struct xfarray_sortinfo *si, - xfarray_idx_t lo, - xfarray_idx_t hi) -{ - pgoff_t lo_page; - pgoff_t hi_page; - loff_t end_pos; - - /* We can only map one page at a time. */ - lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT; - end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1; - hi_page = end_pos >> PAGE_SHIFT; - - return lo_page == hi_page; -} - -/* Sort a bunch of records that all live in the same memory page. */ +/* + * Sort the records from lo to hi (inclusive) if they are all backed by the + * same memory folio. Returns 1 if it sorted, 0 if it did not, or a negative + * errno. + */ STATIC int -xfarray_pagesort( +xfarray_foliosort( struct xfarray_sortinfo *si, xfarray_idx_t lo, xfarray_idx_t hi) { + struct folio *folio; void *startp; loff_t lo_pos = xfarray_pos(si->array, lo); - uint64_t len = xfarray_pos(si->array, hi - lo); - int error = 0; + uint64_t len = xfarray_pos(si->array, hi - lo + 1); - trace_xfarray_pagesort(si, lo, hi); + /* No single folio could back this many records. */ + if (len > XFILE_MAX_FOLIO_SIZE) + return 0; xfarray_sort_bump_loads(si); - error = xfarray_sort_get_page(si, lo_pos, len); - if (error) - return error; + folio = xfile_get_folio(si->array->xfile, lo_pos, len, XFILE_ALLOC); + if (IS_ERR(folio)) + return PTR_ERR(folio); + if (!folio) + return 0; + + trace_xfarray_foliosort(si, lo, hi); xfarray_sort_bump_heapsorts(si); - startp = si->page_kaddr + offset_in_page(lo_pos); + startp = folio_address(folio) + offset_in_folio(folio, lo_pos); sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); xfarray_sort_bump_stores(si); - return xfarray_sort_put_page(si); + xfile_put_folio(si->array->xfile, folio); + return 1; } /* Return a pointer to the xfarray pivot record within the sortinfo struct. */ @@ -829,63 +786,80 @@ xfarray_qsort_push( return 0; } +static inline void +xfarray_sort_scan_done( + struct xfarray_sortinfo *si) +{ + if (si->folio) + xfile_put_folio(si->array->xfile, si->folio); + si->folio = NULL; +} + /* - * Load an element from the array into the first scratchpad and cache the page, - * if possible. + * Cache the folio backing the start of the given array element. If the array + * element is contained entirely within the folio, return a pointer to the + * cached folio. Otherwise, load the element into the scratchpad and return a + * pointer to the scratchpad. */ static inline int -xfarray_sort_load_cached( +xfarray_sort_scan( struct xfarray_sortinfo *si, xfarray_idx_t idx, - void *ptr) + void **ptrp) { loff_t idx_pos = xfarray_pos(si->array, idx); - pgoff_t startpage; - pgoff_t endpage; int error = 0; - /* - * If this load would split a page, release the cached page, if any, - * and perform a traditional read. - */ - startpage = idx_pos >> PAGE_SHIFT; - endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT; - if (startpage != endpage) { - error = xfarray_sort_put_page(si); - if (error) - return error; + if (xfarray_sort_terminated(si, &error)) + return error; - if (xfarray_sort_terminated(si, &error)) - return error; + trace_xfarray_sort_scan(si, idx); - return xfile_obj_load(si->array->xfile, ptr, - si->array->obj_size, idx_pos); - } + /* If the cached folio doesn't cover this index, release it. */ + if (si->folio && + (idx < si->first_folio_idx || idx > si->last_folio_idx)) + xfarray_sort_scan_done(si); - /* If the cached page is not the one we want, release it. */ - if (xfile_page_cached(&si->xfpage) && - xfile_page_index(&si->xfpage) != startpage) { - error = xfarray_sort_put_page(si); - if (error) - return error; + /* Grab the first folio that backs this array element. */ + if (!si->folio) { + struct folio *folio; + loff_t next_pos; + + folio = xfile_get_folio(si->array->xfile, idx_pos, + si->array->obj_size, XFILE_ALLOC); + if (IS_ERR(folio)) + return PTR_ERR(folio); + si->folio = folio; + + si->first_folio_idx = xfarray_idx(si->array, + folio_pos(si->folio) + si->array->obj_size - 1); + + next_pos = folio_pos(si->folio) + folio_size(si->folio); + si->last_folio_idx = xfarray_idx(si->array, next_pos - 1); + if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos) + si->last_folio_idx--; + + trace_xfarray_sort_scan(si, idx); } /* - * If we don't have a cached page (and we know the load is contained - * in a single page) then grab it. + * If this folio still doesn't cover the desired element, it must cross + * a folio boundary. Read into the scratchpad and we're done. */ - if (!xfile_page_cached(&si->xfpage)) { - if (xfarray_sort_terminated(si, &error)) - return error; + if (idx < si->first_folio_idx || idx > si->last_folio_idx) { + void *temp = xfarray_scratch(si->array); - error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT, - PAGE_SIZE); + error = xfile_load(si->array->xfile, temp, si->array->obj_size, + idx_pos); if (error) return error; + + *ptrp = temp; + return 0; } - memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos), - si->array->obj_size); + /* Otherwise return a pointer to the array element in the folio. */ + *ptrp = folio_address(si->folio) + offset_in_folio(si->folio, idx_pos); return 0; } @@ -952,6 +926,8 @@ xfarray_sort( pivot = xfarray_sortinfo_pivot(si); while (si->stack_depth >= 0) { + int ret; + lo = si_lo[si->stack_depth]; hi = si_hi[si->stack_depth]; @@ -964,13 +940,13 @@ xfarray_sort( } /* - * If directly mapping the page and sorting can solve our + * If directly mapping the folio and sorting can solve our * problems, we're done. */ - if (xfarray_want_pagesort(si, lo, hi)) { - error = xfarray_pagesort(si, lo, hi); - if (error) - goto out_free; + ret = xfarray_foliosort(si, lo, hi); + if (ret < 0) + goto out_free; + if (ret == 1) { si->stack_depth--; continue; } @@ -995,25 +971,24 @@ xfarray_sort( * than the pivot is on the right side of the range. */ while (lo < hi) { + void *p; + /* * Decrement hi until it finds an a[hi] less than the * pivot value. */ - error = xfarray_sort_load_cached(si, hi, scratch); + error = xfarray_sort_scan(si, hi, &p); if (error) goto out_free; - while (xfarray_sort_cmp(si, scratch, pivot) >= 0 && - lo < hi) { + while (xfarray_sort_cmp(si, p, pivot) >= 0 && lo < hi) { hi--; - error = xfarray_sort_load_cached(si, hi, - scratch); + error = xfarray_sort_scan(si, hi, &p); if (error) goto out_free; } - error = xfarray_sort_put_page(si); - if (error) - goto out_free; - + if (p != scratch) + memcpy(scratch, p, si->array->obj_size); + xfarray_sort_scan_done(si); if (xfarray_sort_terminated(si, &error)) goto out_free; @@ -1028,21 +1003,18 @@ xfarray_sort( * Increment lo until it finds an a[lo] greater than * the pivot value. */ - error = xfarray_sort_load_cached(si, lo, scratch); + error = xfarray_sort_scan(si, lo, &p); if (error) goto out_free; - while (xfarray_sort_cmp(si, scratch, pivot) <= 0 && - lo < hi) { + while (xfarray_sort_cmp(si, p, pivot) <= 0 && lo < hi) { lo++; - error = xfarray_sort_load_cached(si, lo, - scratch); + error = xfarray_sort_scan(si, lo, &p); if (error) goto out_free; } - error = xfarray_sort_put_page(si); - if (error) - goto out_free; - + if (p != scratch) + memcpy(scratch, p, si->array->obj_size); + xfarray_sort_scan_done(si); if (xfarray_sort_terminated(si, &error)) goto out_free; @@ -1078,6 +1050,24 @@ xfarray_sort( out_free: trace_xfarray_sort_stats(si, error); + xfarray_sort_scan_done(si); kvfree(si); return error; } + +/* How many bytes is this array consuming? */ +unsigned long long +xfarray_bytes( + struct xfarray *array) +{ + return xfile_bytes(array->xfile); +} + +/* Empty the entire array. */ +void +xfarray_truncate( + struct xfarray *array) +{ + xfile_discard(array->xfile, 0, MAX_LFS_FILESIZE); + array->nr = 0; +} diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index 4ecac01363d9..5eeeeed13ae2 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -8,6 +8,7 @@ /* xfile array index type, along with cursor initialization */ typedef uint64_t xfarray_idx_t; +#define XFARRAY_NULLIDX ((__force xfarray_idx_t)-1ULL) #define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0) /* Iterate each index of an xfile array. */ @@ -44,6 +45,27 @@ int xfarray_unset(struct xfarray *array, xfarray_idx_t idx); int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr); int xfarray_store_anywhere(struct xfarray *array, const void *ptr); bool xfarray_element_is_null(struct xfarray *array, const void *ptr); +void xfarray_truncate(struct xfarray *array); +unsigned long long xfarray_bytes(struct xfarray *array); + +/* + * Load an array element, but zero the buffer if there's no data because we + * haven't stored to that array element yet. + */ +static inline int +xfarray_load_sparse( + struct xfarray *array, + uint64_t idx, + void *rec) +{ + int error = xfarray_load(array, idx, rec); + + if (error == -ENODATA) { + memset(rec, 0, array->obj_size); + return 0; + } + return error; +} /* Append an element to the array. */ static inline int xfarray_append(struct xfarray *array, const void *ptr) @@ -54,6 +76,28 @@ static inline int xfarray_append(struct xfarray *array, const void *ptr) uint64_t xfarray_length(struct xfarray *array); int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec); +/* + * Iterate the non-null elements in a sparse xfarray. Callers should + * initialize *idx to XFARRAY_CURSOR_INIT before the first call; on return, it + * will be set to one more than the index of the record that was retrieved. + * Returns 1 if a record was retrieved, 0 if there weren't any more records, or + * a negative errno. + */ +static inline int +xfarray_iter( + struct xfarray *array, + xfarray_idx_t *idx, + void *rec) +{ + int ret = xfarray_load_next(array, idx, rec); + + if (ret == -ENODATA) + return 0; + if (ret == 0) + return 1; + return ret; +} + /* Declarations for xfile array sort functionality. */ typedef cmp_func_t xfarray_cmp_fn; @@ -83,9 +127,17 @@ struct xfarray_sortinfo { /* XFARRAY_SORT_* flags; see below. */ unsigned int flags; - /* Cache a page here for faster access. */ - struct xfile_page xfpage; - void *page_kaddr; + /* next time we want to cond_resched() */ + struct xchk_relax relax; + + /* Cache a folio here for faster scanning for pivots */ + struct folio *folio; + + /* First array index in folio that is completely readable */ + xfarray_idx_t first_folio_idx; + + /* Last array index in folio that is completely readable */ + xfarray_idx_t last_folio_idx; #ifdef DEBUG /* Performance statistics. */ diff --git a/fs/xfs/scrub/xfblob.c b/fs/xfs/scrub/xfblob.c new file mode 100644 index 000000000000..6ef2a9637f16 --- /dev/null +++ b/fs/xfs/scrub/xfblob.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "scrub/scrub.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" + +/* + * XFS Blob Storage + * ================ + * Stores and retrieves blobs using an xfile. Objects are appended to the file + * and the offset is returned as a magic cookie for retrieval. + */ + +#define XB_KEY_MAGIC 0xABAADDAD +struct xb_key { + uint32_t xb_magic; /* XB_KEY_MAGIC */ + uint32_t xb_size; /* size of the blob, in bytes */ + loff_t xb_offset; /* byte offset of this key */ + /* blob comes after here */ +} __packed; + +/* Initialize a blob storage object. */ +int +xfblob_create( + const char *description, + struct xfblob **blobp) +{ + struct xfblob *blob; + struct xfile *xfile; + int error; + + error = xfile_create(description, 0, &xfile); + if (error) + return error; + + blob = kmalloc(sizeof(struct xfblob), XCHK_GFP_FLAGS); + if (!blob) { + error = -ENOMEM; + goto out_xfile; + } + + blob->xfile = xfile; + blob->last_offset = PAGE_SIZE; + + *blobp = blob; + return 0; + +out_xfile: + xfile_destroy(xfile); + return error; +} + +/* Destroy a blob storage object. */ +void +xfblob_destroy( + struct xfblob *blob) +{ + xfile_destroy(blob->xfile); + kfree(blob); +} + +/* Retrieve a blob. */ +int +xfblob_load( + struct xfblob *blob, + xfblob_cookie cookie, + void *ptr, + uint32_t size) +{ + struct xb_key key; + int error; + + error = xfile_load(blob->xfile, &key, sizeof(key), cookie); + if (error) + return error; + + if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) { + ASSERT(0); + return -ENODATA; + } + if (size < key.xb_size) { + ASSERT(0); + return -EFBIG; + } + + return xfile_load(blob->xfile, ptr, key.xb_size, + cookie + sizeof(key)); +} + +/* Store a blob. */ +int +xfblob_store( + struct xfblob *blob, + xfblob_cookie *cookie, + const void *ptr, + uint32_t size) +{ + struct xb_key key = { + .xb_offset = blob->last_offset, + .xb_magic = XB_KEY_MAGIC, + .xb_size = size, + }; + loff_t pos = blob->last_offset; + int error; + + error = xfile_store(blob->xfile, &key, sizeof(key), pos); + if (error) + return error; + + pos += sizeof(key); + error = xfile_store(blob->xfile, ptr, size, pos); + if (error) + goto out_err; + + *cookie = blob->last_offset; + blob->last_offset += sizeof(key) + size; + return 0; +out_err: + xfile_discard(blob->xfile, blob->last_offset, sizeof(key)); + return error; +} + +/* Free a blob. */ +int +xfblob_free( + struct xfblob *blob, + xfblob_cookie cookie) +{ + struct xb_key key; + int error; + + error = xfile_load(blob->xfile, &key, sizeof(key), cookie); + if (error) + return error; + + if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) { + ASSERT(0); + return -ENODATA; + } + + xfile_discard(blob->xfile, cookie, sizeof(key) + key.xb_size); + return 0; +} + +/* How many bytes is this blob storage object consuming? */ +unsigned long long +xfblob_bytes( + struct xfblob *blob) +{ + return xfile_bytes(blob->xfile); +} + +/* Drop all the blobs. */ +void +xfblob_truncate( + struct xfblob *blob) +{ + xfile_discard(blob->xfile, PAGE_SIZE, MAX_LFS_FILESIZE - PAGE_SIZE); + blob->last_offset = PAGE_SIZE; +} diff --git a/fs/xfs/scrub/xfblob.h b/fs/xfs/scrub/xfblob.h new file mode 100644 index 000000000000..ae78322613ca --- /dev/null +++ b/fs/xfs/scrub/xfblob.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_XFBLOB_H__ +#define __XFS_SCRUB_XFBLOB_H__ + +struct xfblob { + struct xfile *xfile; + loff_t last_offset; +}; + +typedef loff_t xfblob_cookie; + +int xfblob_create(const char *descr, struct xfblob **blobp); +void xfblob_destroy(struct xfblob *blob); +int xfblob_load(struct xfblob *blob, xfblob_cookie cookie, void *ptr, + uint32_t size); +int xfblob_store(struct xfblob *blob, xfblob_cookie *cookie, const void *ptr, + uint32_t size); +int xfblob_free(struct xfblob *blob, xfblob_cookie cookie); +unsigned long long xfblob_bytes(struct xfblob *blob); +void xfblob_truncate(struct xfblob *blob); + +static inline int +xfblob_storename( + struct xfblob *blob, + xfblob_cookie *cookie, + const struct xfs_name *xname) +{ + return xfblob_store(blob, cookie, xname->name, xname->len); +} + +static inline int +xfblob_loadname( + struct xfblob *blob, + xfblob_cookie cookie, + struct xfs_name *xname, + uint32_t size) +{ + int ret = xfblob_load(blob, cookie, (void *)xname->name, size); + if (ret) + return ret; + + xname->len = size; + return 0; +} + +#endif /* __XFS_SCRUB_XFBLOB_H__ */ diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 090c3ead43fd..c753c79df203 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -10,9 +10,9 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" -#include "scrub/scrub.h" #include "scrub/trace.h" #include <linux/shmem_fs.h> @@ -34,13 +34,6 @@ * xfiles assume that the caller will handle all required concurrency * management; standard vfs locks (freezer and inode) are not taken. Reads * and writes are satisfied directly from the page cache. - * - * NOTE: The current shmemfs implementation has a quirk that in-kernel reads - * of a hole cause a page to be mapped into the file. If you are going to - * create a sparse xfile, please be careful about reading from uninitialized - * parts of the file. These pages are !Uptodate and will eventually be - * reclaimed if not written, but in the short term this boosts memory - * consumption. */ /* @@ -62,38 +55,27 @@ xfile_create( { struct inode *inode; struct xfile *xf; - int error = -ENOMEM; + int error; xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS); if (!xf) return -ENOMEM; - xf->file = shmem_file_setup(description, isize, 0); - if (!xf->file) - goto out_xfile; + xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE); if (IS_ERR(xf->file)) { error = PTR_ERR(xf->file); goto out_xfile; } - /* - * We want a large sparse file that we can pread, pwrite, and seek. - * xfile users are responsible for keeping the xfile hidden away from - * all other callers, so we skip timestamp updates and security checks. - * Make the inode only accessible by root, just in case the xfile ever - * escapes. - */ - xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME | - FMODE_LSEEK; - xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME; inode = file_inode(xf->file); - inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME; - inode->i_mode &= ~0177; - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key); + /* + * We don't want to bother with kmapping data during repair, so don't + * allow highmem pages to back this mapping. + */ + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); + trace_xfile_create(xf); *xfilep = xf; @@ -118,164 +100,128 @@ xfile_destroy( } /* - * Read a memory object directly from the xfile's page cache. Unlike regular - * pread, we return -E2BIG and -EFBIG for reads that are too large or at too - * high an offset, instead of truncating the read. Otherwise, we return - * bytes read or an error code, like regular pread. + * Load an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. */ -ssize_t -xfile_pread( +int +xfile_load( struct xfile *xf, void *buf, size_t count, loff_t pos) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - struct page *page = NULL; - ssize_t read = 0; unsigned int pflags; - int error = 0; if (count > MAX_RW_COUNT) - return -E2BIG; + return -ENOMEM; if (inode->i_sb->s_maxbytes - pos < count) - return -EFBIG; + return -ENOMEM; - trace_xfile_pread(xf, pos, count); + trace_xfile_load(xf, pos, count); pflags = memalloc_nofs_save(); while (count > 0) { - void *p, *kaddr; + struct folio *folio; unsigned int len; + unsigned int offset; - len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); - - /* - * In-kernel reads of a shmem file cause it to allocate a page - * if the mapping shows a hole. Therefore, if we hit ENOMEM - * we can continue by zeroing the caller's buffer. - */ - page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT, - __GFP_NOWARN); - if (IS_ERR(page)) { - error = PTR_ERR(page); - if (error != -ENOMEM) - break; - - memset(buf, 0, len); - goto advance; - } - - if (PageUptodate(page)) { + if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, + SGP_READ) < 0) + break; + if (!folio) { /* - * xfile pages must never be mapped into userspace, so - * we skip the dcache flush. + * No data stored at this offset, just zero the output + * buffer until the next page boundary. */ - kaddr = kmap_local_page(page); - p = kaddr + offset_in_page(pos); - memcpy(buf, p, len); - kunmap_local(kaddr); - } else { + len = min_t(ssize_t, count, + PAGE_SIZE - offset_in_page(pos)); memset(buf, 0, len); - } - put_page(page); + } else { + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); + break; + } -advance: + offset = offset_in_folio(folio, pos); + len = min_t(ssize_t, count, folio_size(folio) - offset); + memcpy(buf, folio_address(folio) + offset, len); + + folio_unlock(folio); + folio_put(folio); + } count -= len; pos += len; buf += len; - read += len; } memalloc_nofs_restore(pflags); - if (read > 0) - return read; - return error; + if (count) + return -ENOMEM; + return 0; } /* - * Write a memory object directly to the xfile's page cache. Unlike regular - * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too - * high an offset, instead of truncating the write. Otherwise, we return - * bytes written or an error code, like regular pwrite. + * Store an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. */ -ssize_t -xfile_pwrite( +int +xfile_store( struct xfile *xf, const void *buf, size_t count, loff_t pos) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - struct page *page = NULL; - ssize_t written = 0; unsigned int pflags; - int error = 0; if (count > MAX_RW_COUNT) - return -E2BIG; + return -ENOMEM; if (inode->i_sb->s_maxbytes - pos < count) - return -EFBIG; + return -ENOMEM; - trace_xfile_pwrite(xf, pos, count); + trace_xfile_store(xf, pos, count); + + /* + * Increase the file size first so that shmem_get_folio(..., SGP_CACHE), + * actually allocates a folio instead of erroring out. + */ + if (pos + count > i_size_read(inode)) + i_size_write(inode, pos + count); pflags = memalloc_nofs_save(); while (count > 0) { - void *fsdata = NULL; - void *p, *kaddr; + struct folio *folio; unsigned int len; - int ret; - - len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); - - /* - * We call write_begin directly here to avoid all the freezer - * protection lock-taking that happens in the normal path. - * shmem doesn't support fs freeze, but lockdep doesn't know - * that and will trip over that. - */ - error = aops->write_begin(NULL, mapping, pos, len, &page, - &fsdata); - if (error) - break; + unsigned int offset; - /* - * xfile pages must never be mapped into userspace, so we skip - * the dcache flush. If the page is not uptodate, zero it - * before writing data. - */ - kaddr = kmap_local_page(page); - if (!PageUptodate(page)) { - memset(kaddr, 0, PAGE_SIZE); - SetPageUptodate(page); - } - p = kaddr + offset_in_page(pos); - memcpy(p, buf, len); - kunmap_local(kaddr); - - ret = aops->write_end(NULL, mapping, pos, len, len, page, - fsdata); - if (ret < 0) { - error = ret; + if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, + SGP_CACHE) < 0) + break; + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); break; } - written += ret; - if (ret != len) - break; + offset = offset_in_folio(folio, pos); + len = min_t(ssize_t, count, folio_size(folio) - offset); + memcpy(folio_address(folio) + offset, buf, len); + + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); - count -= ret; - pos += ret; - buf += ret; + count -= len; + pos += len; + buf += len; } memalloc_nofs_restore(pflags); - if (written > 0) - return written; - return error; + if (count) + return -ENOMEM; + return 0; } /* Find the next written area in the xfile data for a given offset. */ @@ -291,129 +237,88 @@ xfile_seek_data( return ret; } -/* Query stat information for an xfile. */ -int -xfile_stat( - struct xfile *xf, - struct xfile_stat *statbuf) -{ - struct kstat ks; - int error; - - error = vfs_getattr_nosec(&xf->file->f_path, &ks, - STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC); - if (error) - return error; - - statbuf->size = ks.size; - statbuf->bytes = ks.blocks << SECTOR_SHIFT; - return 0; -} - /* - * Grab the (locked) page for a memory object. The object cannot span a page - * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we - * cannot grab the page, or the usual negative errno. + * Grab the (locked) folio for a memory object. The object cannot span a folio + * boundary. Returns the locked folio if successful, NULL if there was no + * folio or it didn't cover the range requested, or an ERR_PTR on failure. */ -int -xfile_get_page( +struct folio * +xfile_get_folio( struct xfile *xf, loff_t pos, - unsigned int len, - struct xfile_page *xfpage) + size_t len, + unsigned int flags) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - struct page *page = NULL; - void *fsdata = NULL; - loff_t key = round_down(pos, PAGE_SIZE); + struct folio *folio = NULL; unsigned int pflags; int error; if (inode->i_sb->s_maxbytes - pos < len) - return -ENOMEM; - if (len > PAGE_SIZE - offset_in_page(pos)) - return -ENOTBLK; + return ERR_PTR(-ENOMEM); - trace_xfile_get_page(xf, pos, len); - - pflags = memalloc_nofs_save(); + trace_xfile_get_folio(xf, pos, len); /* - * We call write_begin directly here to avoid all the freezer - * protection lock-taking that happens in the normal path. shmem - * doesn't support fs freeze, but lockdep doesn't know that and will - * trip over that. + * Increase the file size first so that shmem_get_folio(..., SGP_CACHE), + * actually allocates a folio instead of erroring out. */ - error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page, - &fsdata); + if ((flags & XFILE_ALLOC) && pos + len > i_size_read(inode)) + i_size_write(inode, pos + len); + + pflags = memalloc_nofs_save(); + error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, + (flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ); + memalloc_nofs_restore(pflags); if (error) - goto out_pflags; + return ERR_PTR(error); - /* We got the page, so make sure we push out EOF. */ - if (i_size_read(inode) < pos + len) - i_size_write(inode, pos + len); + if (!folio) + return NULL; - /* - * If the page isn't up to date, fill it with zeroes before we hand it - * to the caller and make sure the backing store will hold on to them. - */ - if (!PageUptodate(page)) { - void *kaddr; + if (len > folio_size(folio) - offset_in_folio(folio, pos)) { + folio_unlock(folio); + folio_put(folio); + return NULL; + } - kaddr = kmap_local_page(page); - memset(kaddr, 0, PAGE_SIZE); - kunmap_local(kaddr); - SetPageUptodate(page); + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(-EIO); } /* - * Mark each page dirty so that the contents are written to some - * backing store when we drop this buffer, and take an extra reference - * to prevent the xfile page from being swapped or removed from the - * page cache by reclaim if the caller unlocks the page. + * Mark the folio dirty so that it won't be reclaimed once we drop the + * (potentially last) reference in xfile_put_folio. */ - set_page_dirty(page); - get_page(page); - - xfpage->page = page; - xfpage->fsdata = fsdata; - xfpage->pos = key; -out_pflags: - memalloc_nofs_restore(pflags); - return error; + if (flags & XFILE_ALLOC) + folio_mark_dirty(folio); + return folio; } /* - * Release the (locked) page for a memory object. Returns 0 or a negative - * errno. + * Release the (locked) folio for a memory object. */ -int -xfile_put_page( +void +xfile_put_folio( struct xfile *xf, - struct xfile_page *xfpage) + struct folio *folio) { - struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - unsigned int pflags; - int ret; - - trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE); + trace_xfile_put_folio(xf, folio_pos(folio), folio_size(folio)); - /* Give back the reference that we took in xfile_get_page. */ - put_page(xfpage->page); + folio_unlock(folio); + folio_put(folio); +} - pflags = memalloc_nofs_save(); - ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE, - xfpage->page, xfpage->fsdata); - memalloc_nofs_restore(pflags); - memset(xfpage, 0, sizeof(struct xfile_page)); +/* Discard the page cache that's backing a range of the xfile. */ +void +xfile_discard( + struct xfile *xf, + loff_t pos, + u64 count) +{ + trace_xfile_discard(xf, pos, count); - if (ret < 0) - return ret; - if (ret != PAGE_SIZE) - return -EIO; - return 0; + shmem_truncate_range(file_inode(xf->file), pos, pos + count - 1); } diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h index d56643b0f429..cc2cc1714cd4 100644 --- a/fs/xfs/scrub/xfile.h +++ b/fs/xfs/scrub/xfile.h @@ -6,22 +6,6 @@ #ifndef __XFS_SCRUB_XFILE_H__ #define __XFS_SCRUB_XFILE_H__ -struct xfile_page { - struct page *page; - void *fsdata; - loff_t pos; -}; - -static inline bool xfile_page_cached(const struct xfile_page *xfpage) -{ - return xfpage->page != NULL; -} - -static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage) -{ - return xfpage->page->index; -} - struct xfile { struct file *file; }; @@ -29,49 +13,23 @@ struct xfile { int xfile_create(const char *description, loff_t isize, struct xfile **xfilep); void xfile_destroy(struct xfile *xf); -ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos); -ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count, +int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos); +int xfile_store(struct xfile *xf, const void *buf, size_t count, loff_t pos); -/* - * Load an object. Since we're treating this file as "memory", any error or - * short IO is treated as a failure to allocate memory. - */ -static inline int -xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos) -{ - ssize_t ret = xfile_pread(xf, buf, count, pos); - - if (ret < 0 || ret != count) - return -ENOMEM; - return 0; -} - -/* - * Store an object. Since we're treating this file as "memory", any error or - * short IO is treated as a failure to allocate memory. - */ -static inline int -xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos) -{ - ssize_t ret = xfile_pwrite(xf, buf, count, pos); - - if (ret < 0 || ret != count) - return -ENOMEM; - return 0; -} - +void xfile_discard(struct xfile *xf, loff_t pos, u64 count); loff_t xfile_seek_data(struct xfile *xf, loff_t pos); -struct xfile_stat { - loff_t size; - unsigned long long bytes; -}; +#define XFILE_MAX_FOLIO_SIZE (PAGE_SIZE << MAX_PAGECACHE_ORDER) -int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf); +#define XFILE_ALLOC (1 << 0) /* allocate folio if not present */ +struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len, + unsigned int flags); +void xfile_put_folio(struct xfile *xf, struct folio *folio); -int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len, - struct xfile_page *xbuf); -int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf); +static inline unsigned long long xfile_bytes(struct xfile *xf) +{ + return file_inode(xf->file)->i_blocks << SECTOR_SHIFT; +} #endif /* __XFS_SCRUB_XFILE_H__ */ diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h index a39befa743ce..f17173b83e6f 100644 --- a/fs/xfs/scrub/xfs_scrub.h +++ b/fs/xfs/scrub/xfs_scrub.h @@ -7,9 +7,11 @@ #define __XFS_SCRUB_H__ #ifndef CONFIG_XFS_ONLINE_SCRUB -# define xfs_scrub_metadata(file, sm) (-ENOTTY) +# define xfs_ioc_scrub_metadata(f, a) (-ENOTTY) +# define xfs_ioc_scrubv_metadata(f, a) (-ENOTTY) #else -int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm); +int xfs_ioc_scrub_metadata(struct file *file, void __user *arg); +int xfs_ioc_scrubv_metadata(struct file *file, void __user *arg); #endif /* CONFIG_XFS_ONLINE_SCRUB */ #endif /* __XFS_SCRUB_H__ */ |