Diffstat (limited to 'fs/xfs')
234 files changed, 28387 insertions, 6208 deletions
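Note on the per-AG refactor in the diff below: this series embeds a generic struct xfs_group in struct xfs_perag and turns the passive/active reference helpers (xfs_perag_get/hold/put, xfs_perag_grab/rele) into inline wrappers around the xfs_group equivalents keyed by XG_TYPE_AG, while the old for_each_perag() macros give way to xfs_perag_next(), xfs_perag_next_from() and xfs_perag_next_range(). A minimal sketch of the new iteration style, assuming a mounted struct xfs_mount *mp (the visit_ag() callback is hypothetical):

	struct xfs_perag	*pag = NULL;

	/* Walk every AG; each step holds an active reference on pag. */
	while ((pag = xfs_perag_next(mp, pag)) != NULL) {
		/*
		 * pag_agno()/pag_mount() read through the embedded group
		 * (pag->pag_group.xg_gno and pag->pag_group.xg_mount).
		 */
		visit_ag(pag_mount(pag), pag_agno(pag));
	}

As with the old macros, the iterator releases the previous AG's active reference as it advances, so only an early break out of the loop needs an explicit xfs_perag_rele(pag).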
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index fffd6fffdce0..ae0ca6858496 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -3,7 +3,7 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK select EXPORTFS - select LIBCRC32C + select CRC32 select FS_IOMAP help XFS is a high performance journaling filesystem which originated diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index dd692619bed5..5bf501cf8271 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -14,7 +14,9 @@ xfs-y += xfs_trace.o # build the libxfs code first xfs-y += $(addprefix libxfs/, \ + xfs_group.o \ xfs_ag.o \ + xfs_ag_resv.o \ xfs_alloc.o \ xfs_alloc_btree.o \ xfs_attr.o \ @@ -42,12 +44,15 @@ xfs-y += $(addprefix libxfs/, \ xfs_inode_buf.o \ xfs_inode_util.o \ xfs_log_rlimit.o \ - xfs_ag_resv.o \ + xfs_metadir.o \ + xfs_metafile.o \ xfs_parent.o \ xfs_rmap.o \ xfs_rmap_btree.o \ xfs_refcount.o \ xfs_refcount_btree.o \ + xfs_rtrefcount_btree.o \ + xfs_rtrmap_btree.o \ xfs_sb.o \ xfs_symlink_remote.o \ xfs_trans_inode.o \ @@ -58,6 +63,8 @@ xfs-y += $(addprefix libxfs/, \ # xfs_rtbitmap is shared with libxfs xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ xfs_rtbitmap.o \ + xfs_rtgroup.o \ + xfs_zones.o \ ) # highlevel code @@ -130,7 +137,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ xfs_quotaops.o # xfs_rtbitmap is shared with libxfs -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ + xfs_zone_alloc.o \ + xfs_zone_gc.o \ + xfs_zone_info.o \ + xfs_zone_space_resv.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o @@ -171,6 +182,7 @@ xfs-y += $(addprefix scrub/, \ inode.o \ iscan.o \ listxattr.o \ + metapath.o \ nlinks.o \ parent.o \ readdir.o \ @@ -186,7 +198,10 @@ xfs-y += $(addprefix scrub/, \ xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ + rgsuper.o \ rtbitmap.o \ + rtrefcount.o \ + rtrmap.o \ rtsummary.o \ ) @@ -226,6 +241,8 @@ xfs-y += $(addprefix scrub/, \ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtbitmap_repair.o \ + rtrefcount_repair.o \ + rtrmap_repair.o \ rtsummary_repair.o \ ) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 5ca8d0106827..e6ba914f6d06 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -30,86 +30,7 @@ #include "xfs_trace.h" #include "xfs_inode.h" #include "xfs_icache.h" - - -/* - * Passive reference counting access wrappers to the perag structures. If the - * per-ag structure is to be freed, the freeing code is responsible for cleaning - * up objects with passive references before freeing the structure. This is - * things like cached buffers. - */ -struct xfs_perag * -xfs_perag_get( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - rcu_read_lock(); - pag = xa_load(&mp->m_perags, agno); - if (pag) { - trace_xfs_perag_get(pag, _RET_IP_); - ASSERT(atomic_read(&pag->pag_ref) >= 0); - atomic_inc(&pag->pag_ref); - } - rcu_read_unlock(); - return pag; -} - -/* Get a passive reference to the given perag. */ -struct xfs_perag * -xfs_perag_hold( - struct xfs_perag *pag) -{ - ASSERT(atomic_read(&pag->pag_ref) > 0 || - atomic_read(&pag->pag_active_ref) > 0); - - trace_xfs_perag_hold(pag, _RET_IP_); - atomic_inc(&pag->pag_ref); - return pag; -} - -void -xfs_perag_put( - struct xfs_perag *pag) -{ - trace_xfs_perag_put(pag, _RET_IP_); - ASSERT(atomic_read(&pag->pag_ref) > 0); - atomic_dec(&pag->pag_ref); -} - -/* - * Active references for perag structures. 
This is for short term access to the - * per ag structures for walking trees or accessing state. If an AG is being - * shrunk or is offline, then this will fail to find that AG and return NULL - * instead. - */ -struct xfs_perag * -xfs_perag_grab( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - rcu_read_lock(); - pag = xa_load(&mp->m_perags, agno); - if (pag) { - trace_xfs_perag_grab(pag, _RET_IP_); - if (!atomic_inc_not_zero(&pag->pag_active_ref)) - pag = NULL; - } - rcu_read_unlock(); - return pag; -} - -void -xfs_perag_rele( - struct xfs_perag *pag) -{ - trace_xfs_perag_rele(pag, _RET_IP_); - if (atomic_dec_and_test(&pag->pag_active_ref)) - wake_up(&pag->pag_active_wq); -} +#include "xfs_group.h" /* * xfs_initialize_perag_data @@ -184,6 +105,18 @@ out: return error; } +static void +xfs_perag_uninit( + struct xfs_group *xg) +{ +#ifdef __KERNEL__ + struct xfs_perag *pag = to_perag(xg); + + cancel_delayed_work_sync(&pag->pag_blockgc_work); + xfs_buf_cache_destroy(&pag->pag_bcache); +#endif +} + /* * Free up the per-ag resources within the specified AG range. */ @@ -196,22 +129,8 @@ xfs_free_perag_range( { xfs_agnumber_t agno; - for (agno = first_agno; agno < end_agno; agno++) { - struct xfs_perag *pag = xa_erase(&mp->m_perags, agno); - - ASSERT(pag); - XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); - xfs_defer_drain_free(&pag->pag_intents_drain); - - cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_buf_cache_destroy(&pag->pag_bcache); - - /* drop the mount's active reference */ - xfs_perag_rele(pag); - XFS_IS_CORRUPT(pag->pag_mount, - atomic_read(&pag->pag_active_ref) != 0); - kfree_rcu_mightsleep(pag); - } + for (agno = first_agno; agno < end_agno; agno++) + xfs_group_free(mp, agno, XG_TYPE_AG, xfs_perag_uninit); } /* Find the size of the AG, in blocks. */ @@ -273,6 +192,10 @@ xfs_agino_range( return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last); } +/* + * Update the perag of the previous tail AG if it has been changed during + * recovery (i.e. recovery of a growfs). + */ int xfs_update_last_ag_size( struct xfs_mount *mp, @@ -282,88 +205,88 @@ xfs_update_last_ag_size( if (!pag) return -EFSCORRUPTED; - pag->block_count = __xfs_ag_block_count(mp, prev_agcount - 1, - mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks); - __xfs_agino_range(mp, pag->block_count, &pag->agino_min, + pag_group(pag)->xg_block_count = __xfs_ag_block_count(mp, + prev_agcount - 1, mp->m_sb.sb_agcount, + mp->m_sb.sb_dblocks); + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, &pag->agino_max); xfs_perag_rele(pag); return 0; } -int -xfs_initialize_perag( +static int +xfs_perag_alloc( struct xfs_mount *mp, - xfs_agnumber_t old_agcount, - xfs_agnumber_t new_agcount, - xfs_rfsblock_t dblocks, - xfs_agnumber_t *maxagi) + xfs_agnumber_t index, + xfs_agnumber_t agcount, + xfs_rfsblock_t dblocks) { struct xfs_perag *pag; - xfs_agnumber_t index; int error; - for (index = old_agcount; index < new_agcount; index++) { - pag = kzalloc(sizeof(*pag), GFP_KERNEL); - if (!pag) { - error = -ENOMEM; - goto out_unwind_new_pags; - } - pag->pag_agno = index; - pag->pag_mount = mp; - - error = xa_insert(&mp->m_perags, index, pag, GFP_KERNEL); - if (error) { - WARN_ON_ONCE(error == -EBUSY); - goto out_free_pag; - } + pag = kzalloc(sizeof(*pag), GFP_KERNEL); + if (!pag) + return -ENOMEM; #ifdef __KERNEL__ - /* Place kernel structure only init below this point. 
*/ - spin_lock_init(&pag->pag_ici_lock); - spin_lock_init(&pag->pagb_lock); - spin_lock_init(&pag->pag_state_lock); - INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); - INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - xfs_defer_drain_init(&pag->pag_intents_drain); - init_waitqueue_head(&pag->pagb_wait); - init_waitqueue_head(&pag->pag_active_wq); - pag->pagb_count = 0; - pag->pagb_tree = RB_ROOT; - xfs_hooks_init(&pag->pag_rmap_update_hooks); + /* Place kernel structure only init below this point. */ + spin_lock_init(&pag->pag_ici_lock); + INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); #endif /* __KERNEL__ */ - error = xfs_buf_cache_init(&pag->pag_bcache); - if (error) - goto out_remove_pag; - - /* Active ref owned by mount indicates AG is online. */ - atomic_set(&pag->pag_active_ref, 1); + error = xfs_buf_cache_init(&pag->pag_bcache); + if (error) + goto out_free_perag; - /* - * Pre-calculated geometry - */ - pag->block_count = __xfs_ag_block_count(mp, index, new_agcount, + /* + * Pre-calculated geometry + */ + pag_group(pag)->xg_block_count = __xfs_ag_block_count(mp, index, agcount, dblocks); - pag->min_block = XFS_AGFL_BLOCK(mp); - __xfs_agino_range(mp, pag->block_count, &pag->agino_min, - &pag->agino_max); - } + pag_group(pag)->xg_min_gbno = XFS_AGFL_BLOCK(mp) + 1; + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); - index = xfs_set_inode_alloc(mp, new_agcount); + error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG); + if (error) + goto out_buf_cache_destroy; - if (maxagi) - *maxagi = index; + return 0; + +out_buf_cache_destroy: + xfs_buf_cache_destroy(&pag->pag_bcache); +out_free_perag: + kfree(pag); + return error; +} +int +xfs_initialize_perag( + struct xfs_mount *mp, + xfs_agnumber_t orig_agcount, + xfs_agnumber_t new_agcount, + xfs_rfsblock_t dblocks, + xfs_agnumber_t *maxagi) +{ + xfs_agnumber_t index; + int error; + + if (orig_agcount >= new_agcount) + return 0; + + for (index = orig_agcount; index < new_agcount; index++) { + error = xfs_perag_alloc(mp, index, new_agcount, dblocks); + if (error) + goto out_unwind_new_pags; + } + + *maxagi = xfs_set_inode_alloc(mp, new_agcount); mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; -out_remove_pag: - xfs_defer_drain_free(&pag->pag_intents_drain); - pag = xa_erase(&mp->m_perags, index); -out_free_pag: - kfree(pag); out_unwind_new_pags: - xfs_free_perag_range(mp, old_agcount, index); + xfs_free_perag_range(mp, orig_agcount, index); return error; } @@ -378,7 +301,7 @@ xfs_get_aghdr_buf( struct xfs_buf *bp; int error; - error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp); + error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, &bp); if (error) return error; @@ -818,7 +741,7 @@ xfs_ag_shrink_space( struct xfs_trans **tpp, xfs_extlen_t delta) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_alloc_arg args = { .tp = *tpp, .mp = mp, @@ -835,7 +758,7 @@ xfs_ag_shrink_space( xfs_agblock_t aglen; int error, err2; - ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1); + ASSERT(pag_agno(pag) == mp->m_sb.sb_agcount - 1); error = xfs_ialloc_read_agi(pag, *tpp, 0, &agibp); if (error) return error; @@ -872,7 +795,7 @@ xfs_ag_shrink_space( /* internal log shouldn't also show up in the free space btrees */ error = xfs_alloc_vextent_exact_bno(&args, - XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta)); + xfs_agbno_to_fsb(pag, aglen - delta)); if (!error 
&& args.agbno == NULLAGBLOCK) error = -ENOSPC; @@ -931,9 +854,9 @@ xfs_ag_shrink_space( } /* Update perag geometry */ - pag->block_count -= delta; - __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, - &pag->agino_max); + pag_group(pag)->xg_block_count -= delta; + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); @@ -958,12 +881,13 @@ xfs_ag_extend_space( struct xfs_trans *tp, xfs_extlen_t len) { + struct xfs_mount *mp = pag_mount(pag); struct xfs_buf *bp; struct xfs_agi *agi; struct xfs_agf *agf; int error; - ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1); + ASSERT(pag_agno(pag) == mp->m_sb.sb_agcount - 1); error = xfs_ialloc_read_agi(pag, tp, 0, &bp); if (error) @@ -1002,9 +926,9 @@ xfs_ag_extend_space( return error; /* Update perag geometry */ - pag->block_count = be32_to_cpu(agf->agf_length); - __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, - &pag->agino_max); + pag_group(pag)->xg_block_count = be32_to_cpu(agf->agf_length); + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); return 0; } @@ -1031,7 +955,7 @@ xfs_ag_get_geometry( /* Fill out form. */ memset(ageo, 0, sizeof(*ageo)); - ageo->ag_number = pag->pag_agno; + ageo->ag_number = pag_agno(pag); agi = agi_bp->b_addr; ageo->ag_icount = be32_to_cpu(agi->agi_count); diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 9edfe0e96439..1f24cfa27321 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -7,6 +7,8 @@ #ifndef __LIBXFS_AG_H #define __LIBXFS_AG_H 1 +#include "xfs_group.h" + struct xfs_mount; struct xfs_trans; struct xfs_perag; @@ -30,11 +32,7 @@ struct xfs_ag_resv { * performance of allocation group selection. */ struct xfs_perag { - struct xfs_mount *pag_mount; /* owner filesystem */ - xfs_agnumber_t pag_agno; /* AG this structure belongs to */ - atomic_t pag_ref; /* passive reference count */ - atomic_t pag_active_ref; /* active reference count */ - wait_queue_head_t pag_active_wq;/* woken active_ref falls to zero */ + struct xfs_group pag_group; unsigned long pag_opstate; uint8_t pagf_bno_level; /* # of levels in bno btree */ uint8_t pagf_cnt_level; /* # of levels in cnt btree */ @@ -55,7 +53,6 @@ struct xfs_perag { xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; - int pagb_count; /* pagb slots in use */ uint8_t pagf_refcount_level; /* recount btree height */ /* Blocks reserved for all kinds of metadata. */ @@ -64,21 +61,12 @@ struct xfs_perag { struct xfs_ag_resv pag_rmapbt_resv; /* Precalculated geometry info */ - xfs_agblock_t block_count; - xfs_agblock_t min_block; xfs_agino_t agino_min; xfs_agino_t agino_max; #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ - /* - * Bitsets of per-ag metadata that have been checked and/or are sick. - * Callers should hold pag_state_lock before accessing this field. 
- */ - uint16_t pag_checked; - uint16_t pag_sick; - #ifdef CONFIG_XFS_ONLINE_REPAIR /* * Alternate btree heights so that online repair won't trip the write @@ -90,13 +78,6 @@ struct xfs_perag { uint8_t pagf_repair_rmap_level; #endif - spinlock_t pag_state_lock; - - spinlock_t pagb_lock; /* lock for pagb_tree */ - struct rb_root pagb_tree; /* ordered tree of busy extents */ - unsigned int pagb_gen; /* generation count for pagb_tree */ - wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */ - atomic_t pagf_fstrms; /* # of filestreams active in this AG */ spinlock_t pag_ici_lock; /* incore inode cache lock */ @@ -108,21 +89,29 @@ struct xfs_perag { /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; - - /* - * We use xfs_drain to track the number of deferred log intent items - * that have been queued (but not yet processed) so that waiters (e.g. - * scrub) will not lock resources when other threads are in the middle - * of processing a chain of intent items only to find momentary - * inconsistencies. - */ - struct xfs_defer_drain pag_intents_drain; - - /* Hook to feed rmapbt updates to an active online repair. */ - struct xfs_hooks pag_rmap_update_hooks; #endif /* __KERNEL__ */ }; +static inline struct xfs_perag *to_perag(struct xfs_group *xg) +{ + return container_of(xg, struct xfs_perag, pag_group); +} + +static inline struct xfs_group *pag_group(struct xfs_perag *pag) +{ + return &pag->pag_group; +} + +static inline struct xfs_mount *pag_mount(const struct xfs_perag *pag) +{ + return pag->pag_group.xg_mount; +} + +static inline xfs_agnumber_t pag_agno(const struct xfs_perag *pag) +{ + return pag->pag_group.xg_gno; +} + /* * Per-AG operational state. These are atomic flag bits. */ @@ -144,8 +133,8 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) __XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) __XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) -int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t old_agcount, - xfs_agnumber_t agcount, xfs_rfsblock_t dcount, +int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t orig_agcount, + xfs_agnumber_t new_agcount, xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); void xfs_free_perag_range(struct xfs_mount *mp, xfs_agnumber_t first_agno, xfs_agnumber_t end_agno); @@ -153,13 +142,71 @@ int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); int xfs_update_last_ag_size(struct xfs_mount *mp, xfs_agnumber_t prev_agcount); /* Passive AG references */ -struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); -struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag); -void xfs_perag_put(struct xfs_perag *pag); +static inline struct xfs_perag * +xfs_perag_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + return to_perag(xfs_group_get(mp, agno, XG_TYPE_AG)); +} + +static inline struct xfs_perag * +xfs_perag_hold( + struct xfs_perag *pag) +{ + return to_perag(xfs_group_hold(pag_group(pag))); +} + +static inline void +xfs_perag_put( + struct xfs_perag *pag) +{ + xfs_group_put(pag_group(pag)); +} /* Active AG references */ -struct xfs_perag *xfs_perag_grab(struct xfs_mount *, xfs_agnumber_t); -void xfs_perag_rele(struct xfs_perag *pag); +static inline struct xfs_perag * +xfs_perag_grab( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + return to_perag(xfs_group_grab(mp, agno, XG_TYPE_AG)); +} + +static inline void +xfs_perag_rele( + struct xfs_perag *pag) +{ + xfs_group_rele(pag_group(pag)); +} + +static inline struct xfs_perag * +xfs_perag_next_range( + struct 
xfs_mount *mp, + struct xfs_perag *pag, + xfs_agnumber_t start_agno, + xfs_agnumber_t end_agno) +{ + return to_perag(xfs_group_next_range(mp, pag ? pag_group(pag) : NULL, + start_agno, end_agno, XG_TYPE_AG)); +} + +static inline struct xfs_perag * +xfs_perag_next_from( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_agnumber_t start_agno) +{ + return xfs_perag_next_range(mp, pag, start_agno, mp->m_sb.sb_agcount - 1); +} + +static inline struct xfs_perag * +xfs_perag_next( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + return xfs_perag_next_from(mp, pag, 0); +} /* * Per-ag geometry infomation and validation @@ -171,11 +218,7 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, static inline bool xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) { - if (agbno >= pag->block_count) - return false; - if (agbno <= pag->min_block) - return false; - return true; + return xfs_verify_gbno(pag_group(pag), agbno); } static inline bool @@ -184,13 +227,7 @@ xfs_verify_agbext( xfs_agblock_t agbno, xfs_agblock_t len) { - if (agbno + len <= agbno) - return false; - - if (!xfs_verify_agbno(pag, agbno)) - return false; - - return xfs_verify_agbno(pag, agbno + len - 1); + return xfs_verify_gbext(pag_group(pag), agbno, len); } /* @@ -226,40 +263,6 @@ xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno) agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); } -/* - * Perag iteration APIs - */ -static inline struct xfs_perag * -xfs_perag_next( - struct xfs_perag *pag, - xfs_agnumber_t *agno, - xfs_agnumber_t end_agno) -{ - struct xfs_mount *mp = pag->pag_mount; - - *agno = pag->pag_agno + 1; - xfs_perag_rele(pag); - while (*agno <= end_agno) { - pag = xfs_perag_grab(mp, *agno); - if (pag) - return pag; - (*agno)++; - } - return NULL; -} - -#define for_each_perag_range(mp, agno, end_agno, pag) \ - for ((pag) = xfs_perag_grab((mp), (agno)); \ - (pag) != NULL; \ - (pag) = xfs_perag_next((pag), &(agno), (end_agno))) - -#define for_each_perag_from(mp, agno, pag) \ - for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag)) - -#define for_each_perag(mp, agno, pag) \ - (agno) = 0; \ - for_each_perag_from((mp), (agno), (pag)) - static inline struct xfs_perag * xfs_perag_next_wrap( struct xfs_perag *pag, @@ -268,9 +271,9 @@ xfs_perag_next_wrap( xfs_agnumber_t restart_agno, xfs_agnumber_t wrap_agno) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); - *agno = pag->pag_agno + 1; + *agno = pag_agno(pag) + 1; xfs_perag_rele(pag); while (*agno != stop_agno) { if (*agno >= wrap_agno) { @@ -332,4 +335,28 @@ int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t len); int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); +static inline xfs_fsblock_t +xfs_agbno_to_fsb( + struct xfs_perag *pag, + xfs_agblock_t agbno) +{ + return XFS_AGB_TO_FSB(pag_mount(pag), pag_agno(pag), agbno); +} + +static inline xfs_daddr_t +xfs_agbno_to_daddr( + struct xfs_perag *pag, + xfs_agblock_t agbno) +{ + return XFS_AGB_TO_DADDR(pag_mount(pag), pag_agno(pag), agbno); +} + +static inline xfs_ino_t +xfs_agino_to_ino( + struct xfs_perag *pag, + xfs_agino_t agino) +{ + return XFS_AGINO_TO_INO(pag_mount(pag), pag_agno(pag), agino); +} + #endif /* __LIBXFS_AG_H */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 216423df939e..fb79215a509d 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -70,6 +70,7 @@ xfs_ag_resv_critical( struct xfs_perag *pag, enum 
xfs_ag_resv_type type) { + struct xfs_mount *mp = pag_mount(pag); xfs_extlen_t avail; xfs_extlen_t orig; @@ -92,8 +93,8 @@ xfs_ag_resv_critical( /* Critically low if less than 10% or max btree height remains. */ return XFS_TEST_ERROR(avail < orig / 10 || - avail < pag->pag_mount->m_agbtree_maxlevels, - pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL); + avail < mp->m_agbtree_maxlevels, + mp, XFS_ERRTAG_AG_RESV_CRITICAL); } /* @@ -113,6 +114,7 @@ xfs_ag_resv_needed( case XFS_AG_RESV_RMAPBT: len -= xfs_perag_resv(pag, type)->ar_reserved; break; + case XFS_AG_RESV_METAFILE: case XFS_AG_RESV_NONE: /* empty */ break; @@ -137,8 +139,8 @@ __xfs_ag_resv_free( trace_xfs_ag_resv_free(pag, type, 0); resv = xfs_perag_resv(pag, type); - if (pag->pag_agno == 0) - pag->pag_mount->m_ag_max_usable += resv->ar_asked; + if (pag_agno(pag) == 0) + pag_mount(pag)->m_ag_max_usable += resv->ar_asked; /* * RMAPBT blocks come from the AGFL and AGFL blocks are always * considered "free", so whatever was reserved at mount time must be @@ -148,7 +150,7 @@ __xfs_ag_resv_free( oldresv = resv->ar_orig_reserved; else oldresv = resv->ar_reserved; - xfs_add_fdblocks(pag->pag_mount, oldresv); + xfs_add_fdblocks(pag_mount(pag), oldresv); resv->ar_reserved = 0; resv->ar_asked = 0; resv->ar_orig_reserved = 0; @@ -170,7 +172,7 @@ __xfs_ag_resv_init( xfs_extlen_t ask, xfs_extlen_t used) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_ag_resv *resv; int error; xfs_extlen_t hidden_space; @@ -206,11 +208,10 @@ __xfs_ag_resv_init( else error = xfs_dec_fdblocks(mp, hidden_space, true); if (error) { - trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, - error, _RET_IP_); + trace_xfs_ag_resv_init_error(pag, error, _RET_IP_); xfs_warn(mp, "Per-AG reservation for AG %u failed. Filesystem may run out of space.", - pag->pag_agno); + pag_agno(pag)); return error; } @@ -220,7 +221,7 @@ __xfs_ag_resv_init( * counter, we only make the adjustment for AG 0. This assumes that * there aren't any AGs hungrier for per-AG reservation than AG 0. */ - if (pag->pag_agno == 0) + if (pag_agno(pag) == 0) mp->m_ag_max_usable -= ask; resv = xfs_perag_resv(pag, type); @@ -238,7 +239,7 @@ xfs_ag_resv_init( struct xfs_perag *pag, struct xfs_trans *tp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); xfs_extlen_t ask; xfs_extlen_t used; int error = 0, error2; @@ -347,6 +348,7 @@ xfs_ag_resv_alloc_extent( switch (type) { case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_METAFILE: return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: @@ -389,6 +391,7 @@ xfs_ag_resv_free_extent( switch (type) { case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_METAFILE: return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 04f64cf9777e..7839efe050bf 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -33,8 +33,6 @@ struct kmem_cache *xfs_extfree_item_cache; struct workqueue_struct *xfs_alloc_wq; -#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? 
((b) - (a)) : ((a) - (b))) - #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 @@ -275,7 +273,7 @@ xfs_alloc_complain_bad_rec( xfs_warn(mp, "%sbt record corruption in AG %d detected at %pS!", - cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa); + cur->bc_ops->name, cur->bc_group->xg_gno, fa); xfs_warn(mp, "start block 0x%x block count 0x%x", irec->ar_startblock, irec->ar_blockcount); @@ -303,7 +301,7 @@ xfs_alloc_get_rec( return error; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); @@ -331,7 +329,8 @@ xfs_alloc_compute_aligned( bool busy; /* Trim busy sections out of found extent */ - busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen); + busy = xfs_extent_busy_trim(pag_group(args->pag), args->minlen, + args->maxlen, &bno, &len, busy_gen); /* * If we have a largish extent that happens to start before min_agbno, @@ -409,8 +408,8 @@ xfs_alloc_compute_diff( if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { if (newlen1 < newlen2 || (newlen1 == newlen2 && - XFS_ABSDIFF(newbno1, wantbno) > - XFS_ABSDIFF(newbno2, wantbno))) + abs_diff(newbno1, wantbno) > + abs_diff(newbno2, wantbno))) newbno1 = newbno2; } else if (newbno2 != NULLAGBLOCK) newbno1 = newbno2; @@ -426,7 +425,7 @@ xfs_alloc_compute_diff( } else newbno1 = freeend - wantlen; *newbnop = newbno1; - return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); + return newbno1 == NULLAGBLOCK ? 0 : abs_diff(newbno1, wantbno); } /* @@ -539,7 +538,7 @@ static int xfs_alloc_fixup_longest( struct xfs_btree_cur *cnt_cur) { - struct xfs_perag *pag = cnt_cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cnt_cur->bc_group); struct xfs_buf *bp = cnt_cur->bc_ag.agbp; struct xfs_agf *agf = bp->b_addr; xfs_extlen_t longest = 0; @@ -799,7 +798,7 @@ xfs_agfl_verify( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. 
*/ - if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) + if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != pag_agno((bp->b_pag))) return __this_address; for (i = 0; i < xfs_agfl_size(mp); i++) { @@ -879,13 +878,12 @@ xfs_alloc_read_agfl( struct xfs_trans *tp, struct xfs_buf **bpp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_buf *bp; int error; - error = xfs_trans_read_buf( - mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)), + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (xfs_metadata_is_sick(error)) xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); @@ -1252,14 +1250,14 @@ xfs_alloc_ag_vextent_small( if (fbno == NULLAGBLOCK) goto out; - xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1, + xfs_extent_busy_reuse(pag_group(args->pag), fbno, 1, (args->datatype & XFS_ALLOC_NOBUSY)); if (args->datatype & XFS_ALLOC_USERDATA) { struct xfs_buf *bp; error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp, - XFS_AGB_TO_DADDR(args->mp, args->agno, fbno), + xfs_agbno_to_daddr(args->pag, fbno), args->mp->m_bsize, 0, &bp); if (error) goto error; @@ -1365,7 +1363,8 @@ xfs_alloc_ag_vextent_exact( */ tbno = fbno; tlen = flen; - xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen); + xfs_extent_busy_trim(pag_group(args->pag), args->minlen, args->maxlen, + &tbno, &tlen, &busy_gen); /* * Give up if the start of the extent is busy, or the freespace isn't @@ -1758,8 +1757,9 @@ restart: * the allocation can be retried. */ trace_xfs_alloc_near_busy(args); - error = xfs_extent_busy_flush(args->tp, args->pag, - acur.busy_gen, alloc_flags); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), acur.busy_gen, + alloc_flags); if (error) goto out; @@ -1874,8 +1874,9 @@ restart: * the allocation can be retried. */ trace_xfs_alloc_size_busy(args); - error = xfs_extent_busy_flush(args->tp, args->pag, - busy_gen, alloc_flags); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), busy_gen, + alloc_flags); if (error) goto error0; @@ -1923,7 +1924,7 @@ restart: error = -EFSCORRUPTED; goto error0; } - if (flen < bestrlen) + if (flen <= bestrlen) break; busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen, &busy_gen); @@ -1973,8 +1974,9 @@ restart: * the allocation can be retried. */ trace_xfs_alloc_size_busy(args); - error = xfs_extent_busy_flush(args->tp, args->pag, - busy_gen, alloc_flags); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), busy_gen, + alloc_flags); if (error) goto error0; @@ -2037,7 +2039,6 @@ int xfs_free_ag_extent( struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, @@ -2358,19 +2359,19 @@ xfs_free_ag_extent( * Update the freespace totals in the ag and superblock. 
*/ error = xfs_alloc_update_counters(tp, agbp, len); - xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len); + xfs_ag_resv_free_extent(pag, type, tp, len); if (error) goto error0; XFS_STATS_INC(mp, xs_freex); XFS_STATS_ADD(mp, xs_freeb, len); - trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); + trace_xfs_free_extent(pag, bno, len, type, haveleft, haveright); return 0; error0: - trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); + trace_xfs_free_extent(pag, bno, len, type, -1, -1); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); if (cnt_cur) @@ -2429,7 +2430,7 @@ xfs_alloc_longest_free_extent( * reservations and AGFL rules in place, we can return this extent. */ if (pag->pagf_longest > delta) - return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable, + return min_t(xfs_extlen_t, pag_mount(pag)->m_ag_max_usable, pag->pagf_longest - delta); /* Otherwise, let the caller try for 1 block if there's space. */ @@ -2612,7 +2613,7 @@ xfs_agfl_reset( xfs_warn(mp, "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " "Please unmount and run xfs_repair.", - pag->pag_agno, pag->pagf_flcount); + pag_agno(pag), pag->pagf_flcount); agf->agf_flfirst = 0; agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); @@ -2645,8 +2646,17 @@ xfs_defer_extent_free( ASSERT(!isnullstartblock(bno)); ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS)); - if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) - return -EFSCORRUPTED; + if (free_flags & XFS_FREE_EXTENT_REALTIME) { + if (type != XFS_AG_RESV_NONE) { + ASSERT(type == XFS_AG_RESV_NONE); + return -EFSCORRUPTED; + } + if (XFS_IS_CORRUPT(mp, !xfs_verify_rtbext(mp, bno, len))) + return -EFSCORRUPTED; + } else { + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) + return -EFSCORRUPTED; + } xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); @@ -2655,6 +2665,8 @@ xfs_defer_extent_free( xefi->xefi_agresv = type; if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD) xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; + if (free_flags & XFS_FREE_EXTENT_REALTIME) + xefi->xefi_flags |= XFS_EFI_REALTIME; if (oinfo) { ASSERT(oinfo->oi_offset == 0); @@ -2934,9 +2946,8 @@ xfs_alloc_fix_freelist( * Deferring the free disconnects freeing up the AGFL slot from * freeing the block. */ - error = xfs_free_extent_later(tp, - XFS_AGB_TO_FSB(mp, args->agno, bno), 1, - &targs.oinfo, XFS_AG_RESV_AGFL, 0); + error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, bno), + 1, &targs.oinfo, XFS_AG_RESV_AGFL, 0); if (error) goto out_agbp_relse; } @@ -3156,8 +3167,6 @@ xfs_alloc_put_freelist( logflags |= XFS_AGF_BTREEBLKS; } - xfs_alloc_log_agf(tp, agbp, logflags); - ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); agfl_bno = xfs_buf_to_agfl_bno(agflbp); @@ -3190,7 +3199,7 @@ xfs_validate_ag_length( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. 
*/ - if (bp->b_pag && seqno != bp->b_pag->pag_agno) + if (bp->b_pag && seqno != pag_agno(bp->b_pag)) return __this_address; /* @@ -3359,13 +3368,13 @@ xfs_read_agf( int flags, struct xfs_buf **agfbpp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); int error; - trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); + trace_xfs_read_agf(pag); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); if (xfs_metadata_is_sick(error)) xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); @@ -3388,12 +3397,13 @@ xfs_alloc_read_agf( int flags, struct xfs_buf **agfbpp) { + struct xfs_mount *mp = pag_mount(pag); struct xfs_buf *agfbp; struct xfs_agf *agf; int error; int allocbt_blks; - trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); + trace_xfs_alloc_read_agf(pag); /* We don't support trylock when freeing. */ ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != @@ -3414,7 +3424,7 @@ xfs_alloc_read_agf( pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); - if (xfs_agfl_needs_reset(pag->pag_mount, agf)) + if (xfs_agfl_needs_reset(mp, agf)) set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); else clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); @@ -3427,16 +3437,15 @@ xfs_alloc_read_agf( * counter only tracks non-root blocks. */ allocbt_blks = pag->pagf_btreeblks; - if (xfs_has_rmapbt(pag->pag_mount)) + if (xfs_has_rmapbt(mp)) allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; if (allocbt_blks > 0) - atomic64_add(allocbt_blks, - &pag->pag_mount->m_allocbt_blks); + atomic64_add(allocbt_blks, &mp->m_allocbt_blks); set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); } #ifdef DEBUG - else if (!xfs_is_shutdown(pag->pag_mount)) { + else if (!xfs_is_shutdown(mp)) { ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); @@ -3597,7 +3606,7 @@ xfs_alloc_vextent_finish( goto out_drop_perag; } - args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); + args->fsbno = xfs_agbno_to_fsb(args->pag, args->agbno); ASSERT(args->len >= args->minlen); ASSERT(args->len <= args->maxlen); @@ -3618,8 +3627,8 @@ xfs_alloc_vextent_finish( if (error) goto out_drop_perag; - ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno, - args->len)); + ASSERT(!xfs_extent_busy_search(pag_group(args->pag), + args->agbno, args->len)); } xfs_ag_resv_alloc_extent(args->pag, args->resv, args); @@ -3649,21 +3658,20 @@ xfs_alloc_vextent_this_ag( struct xfs_alloc_arg *args, xfs_agnumber_t agno) { - struct xfs_mount *mp = args->mp; xfs_agnumber_t minimum_agno; uint32_t alloc_flags = 0; int error; ASSERT(args->pag != NULL); - ASSERT(args->pag->pag_agno == agno); + ASSERT(pag_agno(args->pag) == agno); args->agno = agno; args->agbno = 0; trace_xfs_alloc_vextent_this_ag(args); - error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0), - &minimum_agno); + error = xfs_alloc_vextent_check_args(args, + xfs_agbno_to_fsb(args->pag, 0), &minimum_agno); if (error) { if (error == -ENOSPC) return 0; @@ -3868,7 +3876,7 @@ xfs_alloc_vextent_exact_bno( int error; ASSERT(args->pag != NULL); - ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); + 
ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target)); args->agno = XFS_FSB_TO_AGNO(mp, target); args->agbno = XFS_FSB_TO_AGBNO(mp, target); @@ -3907,7 +3915,7 @@ xfs_alloc_vextent_near_bno( int error; if (!needs_perag) - ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); + ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target)); args->agno = XFS_FSB_TO_AGNO(mp, target); args->agbno = XFS_FSB_TO_AGBNO(mp, target); @@ -3944,7 +3952,7 @@ xfs_free_extent_fix_freelist( memset(&args, 0, sizeof(struct xfs_alloc_arg)); args.tp = tp; args.mp = tp->t_mountp; - args.agno = pag->pag_agno; + args.agno = pag_agno(pag); args.pag = pag; /* @@ -4012,14 +4020,13 @@ __xfs_free_extent( goto err_release; } - error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo, - type); + error = xfs_free_ag_extent(tp, agbp, agbno, len, oinfo, type); if (error) goto err_release; if (skip_discard) busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; - xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); + xfs_extent_busy_insert(tp, pag_group(pag), agbno, len, busy_flags); return 0; err_release: @@ -4044,7 +4051,7 @@ xfs_alloc_query_range_helper( xfs_failaddr_t fa; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 0165452e7cd0..50ef79a1ed41 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -79,9 +79,8 @@ int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agfbp, struct xfs_buf *agflbp, xfs_agblock_t bno, int btreeblk); int xfs_free_ag_extent(struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, xfs_agblock_t bno, - xfs_extlen_t len, const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type); + xfs_agblock_t bno, xfs_extlen_t len, + const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); /* * Compute and fill in value of m_alloc_maxlevels. @@ -238,7 +237,11 @@ int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, /* Don't issue a discard for the blocks freed. */ #define XFS_FREE_EXTENT_SKIP_DISCARD (1U << 0) -#define XFS_FREE_EXTENT_ALL_FLAGS (XFS_FREE_EXTENT_SKIP_DISCARD) +/* Free blocks on the realtime device. */ +#define XFS_FREE_EXTENT_REALTIME (1U << 1) + +#define XFS_FREE_EXTENT_ALL_FLAGS (XFS_FREE_EXTENT_SKIP_DISCARD | \ + XFS_FREE_EXTENT_REALTIME) /* * List of extents to be free "later". 
@@ -249,7 +252,7 @@ struct xfs_extent_free_item { uint64_t xefi_owner; xfs_fsblock_t xefi_startblock;/* starting fs block number */ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ - struct xfs_perag *xefi_pag; + struct xfs_group *xefi_group; unsigned int xefi_flags; enum xfs_ag_resv_type xefi_agresv; }; @@ -258,6 +261,12 @@ struct xfs_extent_free_item { #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ #define XFS_EFI_CANCELLED (1U << 3) /* dont actually free the space */ +#define XFS_EFI_REALTIME (1U << 4) /* freeing realtime extent */ + +static inline bool xfs_efi_is_realtime(const struct xfs_extent_free_item *xefi) +{ + return xefi->xefi_flags & XFS_EFI_REALTIME; +} struct xfs_alloc_autoreap { struct xfs_defer_pending *dfp; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index aada676eee51..a4ac37ba5d51 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -28,7 +28,7 @@ xfs_bnobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_bnobt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, - cur->bc_ag.pag); + to_perag(cur->bc_group)); } STATIC struct xfs_btree_cur * @@ -36,29 +36,29 @@ xfs_cntbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_cntbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, - cur->bc_ag.pag); + to_perag(cur->bc_group)); } - STATIC void xfs_allocbt_set_root( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_ag.agbp; - struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag = to_perag(cur->bc_group); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; ASSERT(ptr->s != 0); if (xfs_btree_is_bno(cur->bc_ops)) { agf->agf_bno_root = ptr->s; be32_add_cpu(&agf->agf_bno_level, inc); - cur->bc_ag.pag->pagf_bno_level += inc; + pag->pagf_bno_level += inc; } else { agf->agf_cnt_root = ptr->s; be32_add_cpu(&agf->agf_cnt_level, inc); - cur->bc_ag.pag->pagf_cnt_level += inc; + pag->pagf_cnt_level += inc; } xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); @@ -75,7 +75,7 @@ xfs_allocbt_alloc_block( xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. 
*/ - error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp, + error = xfs_alloc_get_freelist(to_perag(cur->bc_group), cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; @@ -86,7 +86,7 @@ xfs_allocbt_alloc_block( } atomic64_inc(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_group, bno, 1, false); new->s = cpu_to_be32(bno); @@ -104,13 +104,13 @@ xfs_allocbt_free_block( int error; bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); - error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL, - bno, 1); + error = xfs_alloc_put_freelist(to_perag(cur->bc_group), cur->bc_tp, + agbp, NULL, bno, 1); if (error) return error; atomic64_dec(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_insert(cur->bc_tp, agbp->b_pag, bno, 1, + xfs_extent_busy_insert(cur->bc_tp, pag_group(agbp->b_pag), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); return 0; } @@ -178,7 +178,7 @@ xfs_allocbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno)); if (xfs_btree_is_bno(cur->bc_ops)) ptr->s = agf->agf_bno_root; @@ -492,7 +492,7 @@ xfs_bnobt_init_cursor( cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bnobt_ops, mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agf *agf = agbp->b_addr; @@ -518,7 +518,7 @@ xfs_cntbt_init_cursor( cur = xfs_btree_alloc_cursor(mp, tp, &xfs_cntbt_ops, mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agf *agf = agbp->b_addr; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index c63da14eee04..8c04acd30d48 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1004,7 +1004,8 @@ xfs_attr_add_fork( unsigned int blks; /* space reservation */ int error; /* error return value */ - ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + if (!xfs_is_metadir_inode(ip)) + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); blks = XFS_ADDAFORK_SPACE_RES(mp); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 36dd08d13293..d954f9b8071f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -34,12 +34,13 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" -#include "xfs_icache.h" #include "xfs_iomap.h" #include "xfs_health.h" #include "xfs_bmap_item.h" #include "xfs_symlink_remote.h" #include "xfs_inode_util.h" +#include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_bmap_intent_cache; @@ -170,18 +171,16 @@ xfs_bmbt_update( * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". 
*/ -STATIC xfs_filblks_t +xfs_filblks_t xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len) /* delayed extent length */ + struct xfs_inode *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ { - int level; /* btree level number */ - int maxrecs; /* maximum record count at this level */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t rval; /* return value */ + struct xfs_mount *mp = ip->i_mount; + int maxrecs = mp->m_bmap_dmxr[0]; + int level; + xfs_filblks_t rval; - mp = ip->i_mount; - maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); level++) { @@ -614,7 +613,7 @@ xfs_bmap_btree_to_extents( xfs_trans_binval(tp, cbp); if (cur->bc_levels[0].bp == cbp) cur->bc_levels[0].bp = NULL; - xfs_iroot_realloc(ip, -1, whichfork); + xfs_bmap_broot_realloc(ip, whichfork, 0); ASSERT(ifp->if_broot == NULL); ifp->if_format = XFS_DINODE_FMT_EXTENTS; *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); @@ -658,12 +657,11 @@ xfs_bmap_extents_to_btree( * Make space in the inode incore. This needs to be undone if we fail * to expand the root. */ - xfs_iroot_realloc(ip, 1, whichfork); + block = xfs_bmap_broot_realloc(ip, whichfork, 1); /* * Fill in the root. */ - block = ifp->if_broot; xfs_bmbt_init_block(ip, block, NULL, 1, 1); /* * Need a cursor. Can't allocate until bb_level is filled in. @@ -745,7 +743,7 @@ xfs_bmap_extents_to_btree( out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); out_root_realloc: - xfs_iroot_realloc(ip, -1, whichfork); + xfs_bmap_broot_realloc(ip, whichfork, 0); ifp->if_format = XFS_DINODE_FMT_EXTENTS; ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -1042,7 +1040,8 @@ xfs_bmap_add_attrfork( int error; /* error return value */ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + if (!xfs_is_metadir_inode(ip)) + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); ASSERT(!xfs_inode_has_attr_fork(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1423,6 +1422,24 @@ xfs_bmap_last_offset( * Extent tree manipulation functions used during allocation. */ +static inline bool +xfs_bmap_same_rtgroup( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *left, + struct xfs_bmbt_irec *right) +{ + struct xfs_mount *mp = ip->i_mount; + + if (xfs_ifork_is_realtime(ip, whichfork) && xfs_has_rtgroups(mp)) { + if (xfs_rtb_to_rgno(mp, left->br_startblock) != + xfs_rtb_to_rgno(mp, right->br_startblock)) + return false; + } + + return true; +} + /* * Convert a delayed allocation to a real allocation. 
*/ @@ -1492,7 +1509,8 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && + xfs_bmap_same_rtgroup(bma->ip, whichfork, &LEFT, new)) state |= BMAP_LEFT_CONTIG; /* @@ -1516,7 +1534,8 @@ xfs_bmap_add_extent_delay_real( (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= XFS_MAX_BMBT_EXTLEN)) + <= XFS_MAX_BMBT_EXTLEN) && + xfs_bmap_same_rtgroup(bma->ip, whichfork, new, &RIGHT)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -2061,7 +2080,8 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && + xfs_bmap_same_rtgroup(ip, whichfork, &LEFT, new)) state |= BMAP_LEFT_CONTIG; /* @@ -2085,7 +2105,8 @@ xfs_bmap_add_extent_unwritten_real( (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= XFS_MAX_BMBT_EXTLEN)) + <= XFS_MAX_BMBT_EXTLEN) && + xfs_bmap_same_rtgroup(ip, whichfork, new, &RIGHT)) state |= BMAP_RIGHT_CONTIG; /* @@ -2549,146 +2570,6 @@ done: } /* - * Convert a hole to a delayed allocation. - */ -STATIC void -xfs_bmap_add_extent_hole_delay( - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork, - struct xfs_iext_cursor *icur, - xfs_bmbt_irec_t *new) /* new data to add to file extents */ -{ - struct xfs_ifork *ifp; /* inode fork pointer */ - xfs_bmbt_irec_t left; /* left neighbor extent entry */ - xfs_filblks_t newlen=0; /* new indirect size */ - xfs_filblks_t oldlen=0; /* old indirect size */ - xfs_bmbt_irec_t right; /* right neighbor extent entry */ - uint32_t state = xfs_bmap_fork_to_state(whichfork); - xfs_filblks_t temp; /* temp for indirect calculations */ - - ifp = xfs_ifork_ptr(ip, whichfork); - ASSERT(isnullstartblock(new->br_startblock)); - - /* - * Check and set flags if this segment has a left neighbor - */ - if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { - state |= BMAP_LEFT_VALID; - if (isnullstartblock(left.br_startblock)) - state |= BMAP_LEFT_DELAY; - } - - /* - * Check and set flags if the current (right) segment exists. - * If it doesn't exist, we're converting the hole at end-of-file. - */ - if (xfs_iext_get_extent(ifp, icur, &right)) { - state |= BMAP_RIGHT_VALID; - if (isnullstartblock(right.br_startblock)) - state |= BMAP_RIGHT_DELAY; - } - - /* - * Set contiguity flags on the left and right neighbors. - * Don't let extents get too large, even if the pieces are contiguous. 
- */ - if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && - left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) - state |= BMAP_LEFT_CONTIG; - - if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && - new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && - (!(state & BMAP_LEFT_CONTIG) || - (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) - state |= BMAP_RIGHT_CONTIG; - - /* - * Switch out based on the contiguity flags. - */ - switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { - case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with delayed allocations - * on the left and on the right. - * Merge all three into a single extent record. - */ - temp = left.br_blockcount + new->br_blockcount + - right.br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_startblock = nullstartblock(newlen); - left.br_blockcount = temp; - - xfs_iext_remove(ip, icur, state); - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_LEFT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the left. - * Merge the new allocation with the left neighbor. - */ - temp = left.br_blockcount + new->br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_blockcount = temp; - left.br_startblock = nullstartblock(newlen); - - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the right. - * Merge the new allocation with the right neighbor. - */ - temp = new->br_blockcount + right.br_blockcount; - oldlen = startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - right.br_startoff = new->br_startoff; - right.br_startblock = nullstartblock(newlen); - right.br_blockcount = temp; - xfs_iext_update_extent(ip, state, icur, &right); - break; - - case 0: - /* - * New allocation is not contiguous with another - * delayed allocation. - * Insert a new entry. - */ - oldlen = newlen = 0; - xfs_iext_insert(ip, icur, new, state); - break; - } - if (oldlen != newlen) { - ASSERT(oldlen > newlen); - xfs_add_fdblocks(ip->i_mount, oldlen - newlen); - - /* - * Nothing to do for disk quota accounting here. - */ - xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); - } -} - -/* * Convert a hole to a real allocation. 
*/ STATIC int /* error */ @@ -2745,7 +2626,8 @@ xfs_bmap_add_extent_hole_real( left.br_startoff + left.br_blockcount == new->br_startoff && left.br_startblock + left.br_blockcount == new->br_startblock && left.br_state == new->br_state && - left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && + xfs_bmap_same_rtgroup(ip, whichfork, &left, new)) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && @@ -2755,7 +2637,8 @@ xfs_bmap_add_extent_hole_real( new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || left.br_blockcount + new->br_blockcount + - right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN) && + xfs_bmap_same_rtgroup(ip, whichfork, new, &right)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -3121,8 +3004,15 @@ xfs_bmap_adjacent_valid( struct xfs_mount *mp = ap->ip->i_mount; if (XFS_IS_REALTIME_INODE(ap->ip) && - (ap->datatype & XFS_ALLOC_USERDATA)) - return x < mp->m_sb.sb_rblocks; + (ap->datatype & XFS_ALLOC_USERDATA)) { + if (!xfs_has_rtgroups(mp)) + return x < mp->m_sb.sb_rblocks; + + return xfs_rtb_to_rgno(mp, x) == xfs_rtb_to_rgno(mp, y) && + xfs_rtb_to_rgno(mp, x) < mp->m_sb.sb_rgcount && + xfs_rtb_to_rtx(mp, x) < mp->m_sb.sb_rgextents; + + } return XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && @@ -3280,7 +3170,7 @@ xfs_bmap_longest_free_extent( } longest = xfs_alloc_longest_free_extent(pag, - xfs_alloc_min_freelist(pag->pag_mount, pag), + xfs_alloc_min_freelist(pag_mount(pag), pag), xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) *blen = longest; @@ -3422,6 +3312,11 @@ xfs_bmap_compute_alignments( align = xfs_get_cowextsz_hint(ap->ip); else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); + + /* Try to align start block to any minimum allocation alignment */ + if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN)) + args->alignment = align; + if (align) { if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, &ap->offset, @@ -3531,12 +3426,12 @@ xfs_bmap_btalloc_at_eof( int error; /* - * If there are already extents in the file, try an exact EOF block - * allocation to extend the file as a contiguous extent. If that fails, - * or it's the first allocation in a file, just try for a stripe aligned - * allocation. + * If there are already extents in the file, and xfs_bmap_adjacent() has + * given a better blkno, try an exact EOF block allocation to extend the + * file as a contiguous extent. If that fails, or it's the first + * allocation in a file, just try for a stripe aligned allocation. */ - if (ap->offset) { + if (ap->eof) { xfs_extlen_t nextminlen = 0; /* @@ -3704,7 +3599,8 @@ xfs_bmap_btalloc_best_length( int error; ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino); - xfs_bmap_adjacent(ap); + if (!xfs_bmap_adjacent(ap)) + ap->eof = false; /* * Search for an allocation group with a single extent large enough for @@ -4006,144 +3902,6 @@ xfs_bmapi_read( return 0; } -/* - * Add a delayed allocation extent to an inode. Blocks are reserved from the - * global pool and the extent inserted into the inode in-core extent tree. - * - * On entry, got refers to the first extent beyond the offset of the extent to - * allocate or eof is specified if no such extent exists. On return, got refers - * to the extent record that was inserted to the inode fork. 
- * - * Note that the allocated extent may have been merged with contiguous extents - * during insertion into the inode fork. Thus, got does not reflect the current - * state of the inode fork on return. If necessary, the caller can use lastx to - * look up the updated record in the inode fork. - */ -int -xfs_bmapi_reserve_delalloc( - struct xfs_inode *ip, - int whichfork, - xfs_fileoff_t off, - xfs_filblks_t len, - xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, - struct xfs_iext_cursor *icur, - int eof) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - xfs_extlen_t alen; - xfs_extlen_t indlen; - uint64_t fdblocks; - int error; - xfs_fileoff_t aoff; - bool use_cowextszhint = - whichfork == XFS_COW_FORK && !prealloc; - -retry: - /* - * Cap the alloc length. Keep track of prealloc so we know whether to - * tag the inode before we return. - */ - aoff = off; - alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); - if (!eof) - alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); - if (prealloc && alen >= len) - prealloc = alen - len; - - /* - * If we're targetting the COW fork but aren't creating a speculative - * posteof preallocation, try to expand the reservation to align with - * the COW extent size hint if there's sufficient free space. - * - * Unlike the data fork, the CoW cancellation functions will free all - * the reservations at inactivation, so we don't require that every - * delalloc reservation have a dirty pagecache. - */ - if (use_cowextszhint) { - struct xfs_bmbt_irec prev; - xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); - - if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) - prev.br_startoff = NULLFILEOFF; - - error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, - 1, 0, &aoff, &alen); - ASSERT(!error); - } - - /* - * Make a transaction-less quota reservation for delayed allocation - * blocks. This number gets adjusted later. We return if we haven't - * allocated blocks already inside this loop. - */ - error = xfs_quota_reserve_blkres(ip, alen); - if (error) - goto out; - - /* - * Split changing sb for alen and indlen since they could be coming - * from different places. - */ - indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); - - fdblocks = indlen; - if (XFS_IS_REALTIME_INODE(ip)) { - error = xfs_dec_frextents(mp, xfs_rtb_to_rtx(mp, alen)); - if (error) - goto out_unreserve_quota; - } else { - fdblocks += alen; - } - - error = xfs_dec_fdblocks(mp, fdblocks, false); - if (error) - goto out_unreserve_frextents; - - ip->i_delayed_blks += alen; - xfs_mod_delalloc(ip, alen, indlen); - - got->br_startoff = aoff; - got->br_startblock = nullstartblock(indlen); - got->br_blockcount = alen; - got->br_state = XFS_EXT_NORM; - - xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); - - /* - * Tag the inode if blocks were preallocated. Note that COW fork - * preallocation can occur at the start or end of the extent, even when - * prealloc == 0, so we must also check the aligned offset and length. 
- */ - if (whichfork == XFS_DATA_FORK && prealloc) - xfs_inode_set_eofblocks_tag(ip); - if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) - xfs_inode_set_cowblocks_tag(ip); - - return 0; - -out_unreserve_frextents: - if (XFS_IS_REALTIME_INODE(ip)) - xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, alen)); -out_unreserve_quota: - if (XFS_IS_QUOTA_ON(mp)) - xfs_quota_unreserve_blkres(ip, alen); -out: - if (error == -ENOSPC || error == -EDQUOT) { - trace_xfs_delalloc_enospc(ip, off, len); - - if (prealloc || use_cowextszhint) { - /* retry without any preallocation */ - use_cowextszhint = false; - prealloc = 0; - goto retry; - } - } - return error; -} - static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) @@ -4532,8 +4290,9 @@ xfs_bmapi_write( * the refcount btree for orphan recovery. */ if (whichfork == XFS_COW_FORK) - xfs_refcount_alloc_cow_extent(tp, bma.blkno, - bma.length); + xfs_refcount_alloc_cow_extent(tp, + XFS_IS_REALTIME_INODE(ip), + bma.blkno, bma.length); } /* Deal with the allocated space we found. */ @@ -4708,7 +4467,8 @@ xfs_bmapi_convert_one_delalloc( *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) - xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); + xfs_refcount_alloc_cow_extent(tp, XFS_IS_REALTIME_INODE(ip), + bma.blkno, bma.length); error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); @@ -4913,7 +4673,8 @@ xfs_bmap_del_extent_delay( int whichfork, struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del) + struct xfs_bmbt_irec *del, + uint32_t bflags) /* bmapi flags */ { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); @@ -5033,10 +4794,18 @@ xfs_bmap_del_extent_delay( da_diff = da_old - da_new; fdblocks = da_diff; - if (isrt) - xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount)); - else + if (bflags & XFS_BMAPI_REMAP) { + ; + } else if (isrt) { + xfs_rtbxlen_t rtxlen; + + rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount); + if (xfs_is_zoned_inode(ip)) + xfs_zoned_add_available(mp, rtxlen); + xfs_add_frextents(mp, rtxlen); + } else { fdblocks += del->br_blockcount; + } xfs_add_fdblocks(mp, fdblocks); xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff); @@ -5113,6 +4882,34 @@ xfs_bmap_del_extent_cow( ip->i_delayed_blks -= del->br_blockcount; } +static int +xfs_bmap_free_rtblocks( + struct xfs_trans *tp, + struct xfs_bmbt_irec *del) +{ + struct xfs_rtgroup *rtg; + int error; + + rtg = xfs_rtgroup_grab(tp->t_mountp, 0); + if (!rtg) + return -EIO; + + /* + * Ensure the bitmap and summary inodes are locked and joined to the + * transaction before modifying them. + */ + if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) { + tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED; + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_BITMAP); + } + + error = xfs_rtfree_blocks(tp, rtg, del->br_startblock, + del->br_blockcount); + xfs_rtgroup_rele(rtg); + return error; +} + /* * Called by xfs_bmapi to update file extent records and the btree * after removing space. @@ -5325,20 +5122,12 @@ xfs_bmap_del_extent_real( * If we need to, add to list of extents to delete. 
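The reworked xfs_bmap_del_extent_delay() above routes a torn-down reservation to the right pool: the worst-case indirect-block reservation always returns to the data device's free-block counter, while the data blocks themselves go back to free realtime extents (and, on zoned filesystems, also to the zone-available pool) or to fdblocks; XFS_BMAPI_REMAP skips the data-block piece entirely. A toy sketch of that routing, with invented counters standing in for the percpu superblock fields and assuming the rt extent size evenly divides the length:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

struct toy_counters {
	uint64_t fdblocks;	/* free data-device blocks */
	uint64_t frextents;	/* free realtime extents */
	uint64_t zone_avail;	/* zoned-rt space available for reuse */
	uint32_t rextsize;	/* rt extent size in blocks */
};

/* Undo a delalloc reservation of @dlen data blocks and @indlen bmbt blocks. */
static void toy_undo_delalloc(struct toy_counters *c, uint64_t dlen,
		uint64_t indlen, bool isrt, bool zoned, bool remap)
{
	/* Worst-case bmbt blocks were always reserved from fdblocks. */
	c->fdblocks += indlen;
	if (remap)
		return;		/* data blocks are owned elsewhere */
	if (isrt) {
		uint64_t rtxlen = dlen / c->rextsize;

		if (zoned)
			c->zone_avail += rtxlen;
		c->frextents += rtxlen;
	} else {
		c->fdblocks += dlen;
	}
}

int main(void)
{
	struct toy_counters c = { .rextsize = 4 };

	toy_undo_delalloc(&c, 16, 3, true, true, false);
	printf("fd=%llu frext=%llu zoned=%llu\n",
	       (unsigned long long)c.fdblocks,
	       (unsigned long long)c.frextents,
	       (unsigned long long)c.zone_avail);
	return 0;
}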
*/ if (!(bflags & XFS_BMAPI_REMAP)) { + bool isrt = xfs_ifork_is_realtime(ip, whichfork); + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { - xfs_refcount_decrease_extent(tp, del); - } else if (xfs_ifork_is_realtime(ip, whichfork)) { - /* - * Ensure the bitmap and summary inodes are locked - * and joined to the transaction before modifying them. - */ - if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) { - tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED; - xfs_rtbitmap_lock(mp); - xfs_rtbitmap_trans_join(tp); - } - error = xfs_rtfree_blocks(tp, del->br_startblock, - del->br_blockcount); + xfs_refcount_decrease_extent(tp, isrt, del); + } else if (isrt && !xfs_has_rtgroups(mp)) { + error = xfs_bmap_free_rtblocks(tp, del); } else { unsigned int efi_flags = 0; @@ -5346,6 +5135,19 @@ xfs_bmap_del_extent_real( del->br_state == XFS_EXT_UNWRITTEN) efi_flags |= XFS_FREE_EXTENT_SKIP_DISCARD; + /* + * Historically, we did not use EFIs to free realtime + * extents. However, when reverse mapping is enabled, + * we must maintain the same order of operations as the + * data device, which is: Remove the file mapping, + * remove the reverse mapping, and then free the + * blocks. Reflink for realtime volumes requires the + * same sort of ordering. Both features rely on + * rtgroups, so let's gate rt EFI usage on rtgroups. + */ + if (isrt) + efi_flags |= XFS_FREE_EXTENT_REALTIME; + error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, XFS_AG_RESV_NONE, efi_flags); @@ -5602,7 +5404,8 @@ __xfs_bunmapi( delete: if (wasdel) { - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, flags); } else { error = xfs_bmap_del_extent_real(ip, tp, &icur, cur, &del, &tmp_logflags, whichfork, @@ -5694,6 +5497,8 @@ xfs_bunmapi( */ STATIC bool xfs_bmse_can_merge( + struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *left, /* preceding extent */ struct xfs_bmbt_irec *got, /* current extent to shift */ xfs_fileoff_t shift) /* shift fsb */ @@ -5709,7 +5514,8 @@ xfs_bmse_can_merge( if ((left->br_startoff + left->br_blockcount != startoff) || (left->br_startblock + left->br_blockcount != got->br_startblock) || (left->br_state != got->br_state) || - (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN)) + (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN) || + !xfs_bmap_same_rtgroup(ip, whichfork, left, got)) return false; return true; @@ -5745,7 +5551,7 @@ xfs_bmse_merge( blockcount = left->br_blockcount + got->br_blockcount; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - ASSERT(xfs_bmse_can_merge(left, got, shift)); + ASSERT(xfs_bmse_can_merge(ip, whichfork, left, got, shift)); new = *left; new.br_blockcount = blockcount; @@ -5907,7 +5713,8 @@ xfs_bmap_collapse_extents( goto del_cursor; } - if (xfs_bmse_can_merge(&prev, &got, offset_shift_fsb)) { + if (xfs_bmse_can_merge(ip, whichfork, &prev, &got, + offset_shift_fsb)) { error = xfs_bmse_merge(tp, ip, whichfork, offset_shift_fsb, &icur, &got, &prev, cur, &logflags); @@ -6043,7 +5850,8 @@ xfs_bmap_insert_extents( * never find mergeable extents in this scenario. Check anyways * and warn if we encounter two extents that could be one. */ - if (xfs_bmse_can_merge(&got, &next, offset_shift_fsb)) + if (xfs_bmse_can_merge(ip, whichfork, &got, &next, + offset_shift_fsb)) WARN_ON_ONCE(1); } @@ -6428,9 +6236,8 @@ xfs_get_extsz_hint( * No point in aligning allocations if we need to COW to actually * write to them. 
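Just below, the extent size hint helpers are rearranged so that an always-COW inode reports no allocation alignment, and so that a realtime file's COW hint falls back to the inode's own extsize rather than inheriting the rt extent size. A userspace sketch of the resulting selection logic; the toy_* names and flattened inode fields are invented, and the fallback default applied when both hints are zero is omitted:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

struct toy_inode {
	bool	always_cow;	/* every write COWs, alignment is pointless */
	bool	has_extsize;	/* XFS_DIFLAG_EXTSIZE */
	bool	has_cowextsize;	/* XFS_DIFLAG2_COWEXTSIZE */
	bool	realtime;
	uint32_t extsize;
	uint32_t cowextsize;
	uint32_t sb_rextsize;	/* rt extent size from the superblock */
};

static uint32_t toy_get_extsz_hint(const struct toy_inode *ip)
{
	if (!ip->always_cow && ip->has_extsize && ip->extsize)
		return ip->extsize;
	if (ip->realtime && ip->sb_rextsize > 1)
		return ip->sb_rextsize;
	return 0;
}

static uint32_t toy_get_cowextsz_hint(const struct toy_inode *ip)
{
	uint32_t a = ip->has_cowextsize ? ip->cowextsize : 0;
	uint32_t b;

	/* rt files must not inherit sb_rextsize as a COW hint */
	if (ip->realtime)
		b = ip->has_extsize ? ip->extsize : 0;
	else
		b = toy_get_extsz_hint(ip);
	return a > b ? a : b;
}

int main(void)
{
	struct toy_inode rt = { .realtime = true, .sb_rextsize = 16 };

	printf("alloc hint %u, cow hint %u\n",
	       toy_get_extsz_hint(&rt), toy_get_cowextsz_hint(&rt));
	return 0;
}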
*/ - if (xfs_is_always_cow_inode(ip)) - return 0; - if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) + if (!xfs_is_always_cow_inode(ip) && + (ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) return ip->i_extsize; if (XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1) @@ -6453,7 +6260,13 @@ xfs_get_cowextsz_hint( a = 0; if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) a = ip->i_cowextsize; - b = xfs_get_extsz_hint(ip); + if (XFS_IS_REALTIME_INODE(ip)) { + b = 0; + if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) + b = ip->i_extsize; + } else { + b = xfs_get_extsz_hint(ip); + } a = max(a, b); if (a == 0) diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 7592d46e97c6..d5f2729305fa 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -87,6 +87,9 @@ struct xfs_bmalloca { /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ #define XFS_BMAPI_NORMAP (1u << 10) +/* Try to align allocations to the extent size hint */ +#define XFS_BMAPI_EXTSZALIGN (1u << 11) + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -98,7 +101,8 @@ struct xfs_bmalloca { { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ - { XFS_BMAPI_NORMAP, "NORMAP" } + { XFS_BMAPI_NORMAP, "NORMAP" },\ + { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" } static inline int xfs_bmapi_aflag(int w) @@ -204,7 +208,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extnum_t nexts, int *done); void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del); + struct xfs_bmbt_irec *del, uint32_t bflags); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); @@ -219,10 +223,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, bool *done, xfs_fileoff_t stop_fsb); int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_offset); -int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, - int eof); int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, xfs_off_t offset, struct iomap *iomap, unsigned int *seq); int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, @@ -233,6 +233,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip, int fork); int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap, struct xfs_alloc_arg *args); +xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, @@ -248,7 +249,7 @@ struct xfs_bmap_intent { enum xfs_bmap_intent_type bi_type; int bi_whichfork; struct xfs_inode *bi_owner; - struct xfs_perag *bi_pag; + struct xfs_group *bi_group; struct xfs_bmbt_irec bi_bmap; }; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 3464be771f95..908d7b050e9c 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -516,6 +516,116 @@ xfs_bmbt_keys_contiguous( be64_to_cpu(key2->bmbt.br_startoff)); } +static inline void +xfs_bmbt_move_ptrs( + struct xfs_mount *mp, + struct xfs_btree_block *broot, + short old_size, + size_t new_size, + unsigned int numrecs) +{ + void *dptr; + void *sptr; + + sptr = xfs_bmap_broot_ptr_addr(mp, 
broot, 1, old_size); + dptr = xfs_bmap_broot_ptr_addr(mp, broot, 1, new_size); + memmove(dptr, sptr, numrecs * sizeof(xfs_bmbt_ptr_t)); +} + +/* + * Reallocate the space for if_broot based on the number of records. Move the + * records and pointers in if_broot to fit the new size. When shrinking this + * will eliminate holes between the records and pointers created by the caller. + * When growing this will create holes to be filled in by the caller. + * + * The caller must not request to add more records than would fit in the + * on-disk inode root. If the if_broot is currently NULL, then if we are + * adding records, one will be allocated. The caller must also not request + * that the number of records go below zero, although it can go to zero. + * + * ip -- the inode whose if_broot area is changing + * whichfork -- which inode fork to change + * new_numrecs -- the new number of records requested for the if_broot array + * + * Returns the incore btree root block. + */ +struct xfs_btree_block * +xfs_bmap_broot_realloc( + struct xfs_inode *ip, + int whichfork, + unsigned int new_numrecs) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + struct xfs_btree_block *broot; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; + + /* + * Block mapping btrees do not support storing zero records; if this + * happens, the fork is being changed to FMT_EXTENTS. Free the broot + * and get out. + */ + if (new_numrecs == 0) + return xfs_broot_realloc(ifp, 0); + + new_size = xfs_bmap_broot_space_calc(mp, new_numrecs); + + /* Handle the nop case quietly. */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; + + /* + * If there wasn't any memory allocated before, just + * allocate it now and get out. + */ + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); + + /* + * If there is already an existing if_broot, then we need + * to realloc() it and shift the pointers to their new + * location. The records don't change location because + * they are kept butted up against the btree block header. + */ + old_numrecs = xfs_bmbt_maxrecs(mp, old_size, false); + broot = xfs_broot_realloc(ifp, new_size); + ASSERT(xfs_bmap_bmdr_space(broot) <= + xfs_inode_fork_size(ip, whichfork)); + xfs_bmbt_move_ptrs(mp, broot, old_size, new_size, old_numrecs); + return broot; + } + + /* + * We're reducing, but not totally eliminating, numrecs. In this case, + * we are shrinking the if_broot buffer, so it must already exist. + */ + ASSERT(ifp->if_broot != NULL && old_size > 0 && new_size > 0); + + /* + * Shrink the btree root by moving the bmbt pointers, since they are + * not butted up against the btree block header, then reallocating + * broot. 
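xfs_bmbt_move_ptrs() above works because the bmbt root keeps records and keys butted up against the block header while the pointer array floats at a size-dependent offset, so growing slides the pointers outward after the buffer is enlarged and shrinking slides them inward before the buffer is trimmed. A self-contained sketch of that ordering under a made-up toy layout (fixed-size header, keys, and pointers):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define HDR	8	/* toy block header bytes */
#define KEYSZ	8	/* toy key size */
#define PTRSZ	8	/* toy pointer size */

/* Pointers live after all the keys; the offset depends on the buffer size. */
static size_t ptr_off(size_t bufsize)
{
	return HDR + (bufsize - HDR) / (KEYSZ + PTRSZ) * KEYSZ;
}

static void move_ptrs(char *buf, size_t oldsz, size_t newsz, int nrecs)
{
	memmove(buf + ptr_off(newsz), buf + ptr_off(oldsz), nrecs * PTRSZ);
}

int main(void)
{
	size_t oldsz = HDR + 4 * (KEYSZ + PTRSZ);
	size_t newsz = HDR + 8 * (KEYSZ + PTRSZ);
	char *buf = calloc(1, newsz);	/* pre-sized so the grow move fits */
	int nrecs = 4;

	memset(buf + ptr_off(oldsz), 0xab, nrecs * PTRSZ);	/* fake ptrs */

	/* growing: enlarge the buffer first, then slide the ptrs outward */
	move_ptrs(buf, oldsz, newsz, nrecs);

	/* shrinking: slide the ptrs inward *before* trimming the buffer */
	move_ptrs(buf, newsz, oldsz, nrecs);

	printf("ptrs back at offset %zu\n", ptr_off(oldsz));
	free(buf);
	return 0;
}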
+ */ + xfs_bmbt_move_ptrs(mp, ifp->if_broot, old_size, new_size, new_numrecs); + broot = xfs_broot_realloc(ifp, new_size); + ASSERT(xfs_bmap_bmdr_space(broot) <= + xfs_inode_fork_size(ip, whichfork)); + return broot; +} + +static struct xfs_btree_block * +xfs_bmbt_broot_realloc( + struct xfs_btree_cur *cur, + unsigned int new_numrecs) +{ + return xfs_bmap_broot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork, + new_numrecs); +} + const struct xfs_btree_ops xfs_bmbt_ops = { .name = "bmap", .type = XFS_BTREE_TYPE_INODE, @@ -543,6 +653,7 @@ const struct xfs_btree_ops xfs_bmbt_ops = { .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, .keys_contiguous = xfs_bmbt_keys_contiguous, + .broot_realloc = xfs_bmbt_broot_realloc, }; /* diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 49a3bae3f6ec..b238d559ab03 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -198,4 +198,7 @@ xfs_bmap_bmdr_space(struct xfs_btree_block *bb) return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs)); } +struct xfs_btree_block *xfs_bmap_broot_realloc(struct xfs_inode *ip, + int whichfork, unsigned int new_numrecs); + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index a5c4af148853..299ce7fd11b0 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -30,6 +30,12 @@ #include "xfs_health.h" #include "xfs_buf_mem.h" #include "xfs_btree_mem.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_rmap.h" +#include "xfs_quota.h" +#include "xfs_metafile.h" +#include "xfs_rtrefcount_btree.h" /* * Btree magic numbers. @@ -225,7 +231,7 @@ __xfs_btree_check_agblock( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); xfs_failaddr_t fa; xfs_agblock_t agbno; @@ -331,7 +337,7 @@ __xfs_btree_check_ptr( return -EFSCORRUPTED; break; case XFS_BTREE_TYPE_AG: - if (!xfs_verify_agbno(cur->bc_ag.pag, + if (!xfs_verify_agbno(to_perag(cur->bc_group), be32_to_cpu((&ptr->s)[index]))) return -EFSCORRUPTED; break; @@ -372,7 +378,7 @@ xfs_btree_check_ptr( case XFS_BTREE_TYPE_AG: xfs_err(cur->bc_mp, "AG %u: Corrupt %sbt pointer at level %d index %d.", - cur->bc_ag.pag->pag_agno, cur->bc_ops->name, + cur->bc_group->xg_gno, cur->bc_ops->name, level, index); break; } @@ -523,20 +529,8 @@ xfs_btree_del_cursor( ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 || xfs_is_shutdown(cur->bc_mp) || error != 0); - switch (cur->bc_ops->type) { - case XFS_BTREE_TYPE_AG: - if (cur->bc_ag.pag) - xfs_perag_put(cur->bc_ag.pag); - break; - case XFS_BTREE_TYPE_INODE: - /* nothing to do */ - break; - case XFS_BTREE_TYPE_MEM: - if (cur->bc_mem.pag) - xfs_perag_put(cur->bc_mem.pag); - break; - } - + if (cur->bc_group) + xfs_group_put(cur->bc_group); kmem_cache_free(cur->bc_cache, cur); } @@ -1017,22 +1011,22 @@ xfs_btree_readahead_agblock( struct xfs_btree_block *block) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = to_perag(cur->bc_group); xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); int rval = 0; if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, left), - mp->m_bsize, cur->bc_ops->buf_ops); + xfs_agbno_to_daddr(pag, left), mp->m_bsize, + cur->bc_ops->buf_ops); rval++; } if 
((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, right), - mp->m_bsize, cur->bc_ops->buf_ops); + xfs_agbno_to_daddr(pag, right), mp->m_bsize, + cur->bc_ops->buf_ops); rval++; } @@ -1091,7 +1085,7 @@ xfs_btree_ptr_to_daddr( switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_AG: - *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno, + *daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group), be32_to_cpu(ptr->s)); break; case XFS_BTREE_TYPE_INODE: @@ -1313,7 +1307,7 @@ xfs_btree_owner( case XFS_BTREE_TYPE_INODE: return cur->bc_ino.ip->i_ino; case XFS_BTREE_TYPE_AG: - return cur->bc_ag.pag->pag_agno; + return cur->bc_group->xg_gno; default: ASSERT(0); return 0; @@ -1549,12 +1543,16 @@ xfs_btree_log_recs( int first, int last) { + if (!bp) { + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); + return; + } xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_rec_offset(cur, first), xfs_btree_rec_offset(cur, last + 1) - 1); - } /* @@ -3090,6 +3088,131 @@ xfs_btree_split( #define xfs_btree_split __xfs_btree_split #endif /* __KERNEL__ */ +/* Move the records from a root leaf block to a separate block. */ +STATIC void +xfs_btree_promote_leaf_iroot( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + struct xfs_buf *cbp, + union xfs_btree_ptr *cptr, + struct xfs_btree_block *cblock) +{ + union xfs_btree_rec *rp; + union xfs_btree_rec *crp; + union xfs_btree_key *kp; + union xfs_btree_ptr *pp; + struct xfs_btree_block *broot; + int numrecs = xfs_btree_get_numrecs(block); + + /* Copy the records from the leaf broot into the new child block. */ + rp = xfs_btree_rec_addr(cur, 1, block); + crp = xfs_btree_rec_addr(cur, 1, cblock); + xfs_btree_copy_recs(cur, crp, rp, numrecs); + + /* + * Increment the tree height. + * + * Trickery here: The amount of memory that we need per record for the + * ifork's btree root block may change when we convert the broot from a + * leaf to a node block. Free the existing leaf broot so that nobody + * thinks we need to migrate node pointers when we realloc the broot + * buffer after bumping nlevels. + */ + cur->bc_ops->broot_realloc(cur, 0); + cur->bc_nlevels++; + cur->bc_levels[1].ptr = 1; + + /* + * Allocate a new node broot and initialize it to point to the new + * child block. + */ + broot = cur->bc_ops->broot_realloc(cur, 1); + xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, + cur->bc_nlevels - 1, 1, cur->bc_ino.ip->i_ino); + + pp = xfs_btree_ptr_addr(cur, 1, broot); + kp = xfs_btree_key_addr(cur, 1, broot); + xfs_btree_copy_ptrs(cur, pp, cptr, 1); + xfs_btree_get_keys(cur, cblock, kp); + + /* Attach the new block to the cursor and log it. */ + xfs_btree_setbuf(cur, 0, cbp); + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); + xfs_btree_log_recs(cur, cbp, 1, numrecs); +} + +/* + * Move the keys and pointers from a root block to a separate block. + * + * Since the keyptr size does not change, all we have to do is increase the + * tree height, copy the keyptrs to the new internal node (cblock), shrink + * the root, and copy the pointers there. 
+ */ +STATIC int +xfs_btree_promote_node_iroot( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *cbp, + union xfs_btree_ptr *cptr, + struct xfs_btree_block *cblock) +{ + union xfs_btree_key *ckp; + union xfs_btree_key *kp; + union xfs_btree_ptr *cpp; + union xfs_btree_ptr *pp; + int i; + int error; + int numrecs = xfs_btree_get_numrecs(block); + + /* + * Increase tree height, adjusting the root block level to match. + * We cannot change the root btree node size until we've copied the + * block contents to the new child block. + */ + be16_add_cpu(&block->bb_level, 1); + cur->bc_nlevels++; + cur->bc_levels[level + 1].ptr = 1; + + /* + * Adjust the root btree record count, then copy the keys from the old + * root to the new child block. + */ + xfs_btree_set_numrecs(block, 1); + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, ckp, kp, numrecs); + + /* Check the pointers and copy them to the new child block. */ + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); + for (i = 0; i < numrecs; i++) { + error = xfs_btree_debug_check_ptr(cur, pp, i, level); + if (error) + return error; + } + xfs_btree_copy_ptrs(cur, cpp, pp, numrecs); + + /* + * Set the first keyptr to point to the new child block, then shrink + * the memory buffer for the root block. + */ + error = xfs_btree_debug_check_ptr(cur, cptr, 0, level); + if (error) + return error; + xfs_btree_copy_ptrs(cur, pp, cptr, 1); + xfs_btree_get_keys(cur, cblock, kp); + + cur->bc_ops->broot_realloc(cur, 1); + + /* Attach the new block to the cursor and log it. */ + xfs_btree_setbuf(cur, level, cbp); + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); + xfs_btree_log_keys(cur, cbp, 1, numrecs); + xfs_btree_log_ptrs(cur, cbp, 1, numrecs); + return 0; +} + /* * Copy the old inode root contents into a real block and make the * broot point to it. @@ -3103,14 +3226,10 @@ xfs_btree_new_iroot( struct xfs_buf *cbp; /* buffer for cblock */ struct xfs_btree_block *block; /* btree block */ struct xfs_btree_block *cblock; /* child btree block */ - union xfs_btree_key *ckp; /* child key pointer */ - union xfs_btree_ptr *cpp; /* child ptr pointer */ - union xfs_btree_key *kp; /* pointer to btree key */ - union xfs_btree_ptr *pp; /* pointer to block addr */ + union xfs_btree_ptr aptr; union xfs_btree_ptr nptr; /* new block addr */ int level; /* btree level */ int error; /* error return code */ - int i; /* loop counter */ XFS_BTREE_STATS_INC(cur, newroot); @@ -3119,10 +3238,15 @@ xfs_btree_new_iroot( level = cur->bc_nlevels - 1; block = xfs_btree_get_iroot(cur); - pp = xfs_btree_ptr_addr(cur, 1, block); + ASSERT(level > 0 || (cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)); + if (level > 0) + aptr = *xfs_btree_ptr_addr(cur, 1, block); + else + aptr.l = cpu_to_be64(XFS_INO_TO_FSB(cur->bc_mp, + cur->bc_ino.ip->i_ino)); /* Allocate the new block. If we can't do it, we're toast. Give up. 
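In the hunk above, xfs_btree_new_iroot() must pick an allocation hint even when the root is a record-bearing leaf that owns no block pointers; in that case it aims near the block holding the inode itself instead of near the first child. A tiny sketch of that hint selection, with invented stand-ins for the pointer lookup:

#include <stdint.h>
#include <stdio.h>

/* Toy root: level 0 roots hold records and thus have no child pointers. */
struct toy_root {
	int		level;
	uint64_t	first_ptr;	/* valid only if level > 0 */
};

static uint64_t toy_alloc_hint(const struct toy_root *root, uint64_t ino_fsb)
{
	/*
	 * Node roots allocate near their first child; record-bearing leaf
	 * roots have no children, so aim near the inode instead.
	 */
	return root->level > 0 ? root->first_ptr : ino_fsb;
}

int main(void)
{
	struct toy_root node = { .level = 1, .first_ptr = 7777 };
	struct toy_root leaf = { .level = 0 };

	printf("node hint %llu\n", (unsigned long long)toy_alloc_hint(&node, 42));
	printf("leaf hint %llu\n", (unsigned long long)toy_alloc_hint(&leaf, 42));
	return 0;
}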
*/ - error = xfs_btree_alloc_block(cur, pp, &nptr, stat); + error = xfs_btree_alloc_block(cur, &aptr, &nptr, stat); if (error) goto error0; if (*stat == 0) @@ -3148,47 +3272,16 @@ xfs_btree_new_iroot( cblock->bb_u.s.bb_blkno = bno; } - be16_add_cpu(&block->bb_level, 1); - xfs_btree_set_numrecs(block, 1); - cur->bc_nlevels++; - ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); - cur->bc_levels[level + 1].ptr = 1; - - kp = xfs_btree_key_addr(cur, 1, block); - ckp = xfs_btree_key_addr(cur, 1, cblock); - xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); - - cpp = xfs_btree_ptr_addr(cur, 1, cblock); - for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { - error = xfs_btree_debug_check_ptr(cur, pp, i, level); + if (level > 0) { + error = xfs_btree_promote_node_iroot(cur, block, level, cbp, + &nptr, cblock); if (error) goto error0; + } else { + xfs_btree_promote_leaf_iroot(cur, block, cbp, &nptr, cblock); } - xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); - - error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level); - if (error) - goto error0; - - xfs_btree_copy_ptrs(cur, pp, &nptr, 1); - - xfs_iroot_realloc(cur->bc_ino.ip, - 1 - xfs_btree_get_numrecs(cblock), - cur->bc_ino.whichfork); - - xfs_btree_setbuf(cur, level, cbp); - - /* - * Do all this logging at the end so that - * the root is at the right level. - */ - xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); - xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); - xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); - - *logflags |= - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); + *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); *stat = 1; return 0; error0: @@ -3359,7 +3452,7 @@ xfs_btree_make_block_unfull( if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ - xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); + cur->bc_ops->broot_realloc(cur, numrecs + 1); *stat = 1; } else { /* A root block that needs replacing */ @@ -3569,14 +3662,31 @@ xfs_btree_insrec( xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); /* - * If we just inserted into a new tree block, we have to - * recalculate nkey here because nkey is out of date. + * Update btree keys to reflect the newly added record or keyptr. + * There are three cases here to be aware of. Normally, all we have to + * do is walk towards the root, updating keys as necessary. + * + * If the caller had us target a full block for the insertion, we dealt + * with that by calling the _make_block_unfull function. If the + * "make unfull" function splits the block, it'll hand us back the key + * and pointer of the new block. We haven't yet added the new block to + * the next level up, so if we decide to add the new record to the new + * block (bp->b_bn != old_bn), we have to update the caller's pointer + * so that the caller adds the new block with the correct key. * - * Otherwise we're just updating an existing block (having shoved - * some records into the new tree block), so use the regular key - * update mechanism. + * However, there is a third possibility-- if the selected block is the + * root block of an inode-rooted btree and cannot be expanded further, + * the "make unfull" function moves the root block contents to a new + * block and updates the root block to point to the new block. In this + * case, no block pointer is passed back because the block has already + * been added to the btree. 
When that happens, we need to use the regular + * key update function, just like the first case. This is critical for + * overlapping btrees, because the high key must be updated to reflect + * the entire tree, not just the subtree accessible through the first + * child of the root (which is now two levels down from the root). */ - if (bp && xfs_buf_daddr(bp) != old_bn) { + if (!xfs_btree_ptr_is_null(cur, &nptr) && + bp && xfs_buf_daddr(bp) != old_bn) { xfs_btree_get_keys(cur, block, lkey); } else if (xfs_btree_needs_key_update(cur, optr)) { error = xfs_btree_update_keys(cur, level); @@ -3688,6 +3798,97 @@ error0: return error; } +/* Move the records from a child leaf block to the root block. */ +STATIC void +xfs_btree_demote_leaf_child( + struct xfs_btree_cur *cur, + struct xfs_btree_block *cblock, + int numrecs) +{ + union xfs_btree_rec *rp; + union xfs_btree_rec *crp; + struct xfs_btree_block *broot; + + /* + * Decrease the tree height. + * + * Trickery here: The amount of memory that we need per record for the + * ifork's btree root block may change when we convert the broot from a + * node to a leaf. Free the old node broot so that we can get a fresh + * leaf broot. + */ + cur->bc_ops->broot_realloc(cur, 0); + cur->bc_nlevels--; + + /* + * Allocate a new leaf broot and copy the records from the old child. + * Detach the old child from the cursor. + */ + broot = cur->bc_ops->broot_realloc(cur, numrecs); + xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, 0, numrecs, + cur->bc_ino.ip->i_ino); + + rp = xfs_btree_rec_addr(cur, 1, broot); + crp = xfs_btree_rec_addr(cur, 1, cblock); + xfs_btree_copy_recs(cur, rp, crp, numrecs); + + cur->bc_levels[0].bp = NULL; +} + +/* + * Move the keyptrs from a child node block to the root block. + * + * Since the keyptr size does not change, all we have to do is grow the + * root to fit the doomed child (cblock), copy the keyptrs from the child + * into the root, and decrease the tree height. + */ +STATIC int +xfs_btree_demote_node_child( + struct xfs_btree_cur *cur, + struct xfs_btree_block *cblock, + int level, + int numrecs) +{ + struct xfs_btree_block *block; + union xfs_btree_key *ckp; + union xfs_btree_key *kp; + union xfs_btree_ptr *cpp; + union xfs_btree_ptr *pp; + int i; + int error; + + /* + * Adjust the root btree node size and the record count to match the + * doomed child so that we can copy the keyptrs ahead of changing the + * tree shape. + */ + block = cur->bc_ops->broot_realloc(cur, numrecs); + + xfs_btree_set_numrecs(block, numrecs); + ASSERT(block->bb_numrecs == cblock->bb_numrecs); + + /* Copy keys from the doomed block. */ + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, kp, ckp, numrecs); + + /* Copy pointers from the doomed block. */ + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); + for (i = 0; i < numrecs; i++) { + error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); + if (error) + return error; + } + xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + + /* Decrease tree height, adjusting the root block level to match. */ + cur->bc_levels[level - 1].bp = NULL; + be16_add_cpu(&block->bb_level, -1); + cur->bc_nlevels--; + return 0; +} + /* * Try to merge a non-leaf block back into the inode root.
* @@ -3700,34 +3901,31 @@ STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { - int whichfork = cur->bc_ino.whichfork; struct xfs_inode *ip = cur->bc_ino.ip; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; - union xfs_btree_key *kp; - union xfs_btree_key *ckp; - union xfs_btree_ptr *pp; - union xfs_btree_ptr *cpp; struct xfs_buf *cbp; int level; - int index; int numrecs; int error; #ifdef DEBUG union xfs_btree_ptr ptr; #endif - int i; ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); - ASSERT(cur->bc_nlevels > 1); + ASSERT((cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS) || + cur->bc_nlevels > 1); /* * Don't deal with the root block needs to be a leaf case. * We're just going to turn the thing back into extents anyway. */ level = cur->bc_nlevels - 1; - if (level == 1) + if (level == 1 && !(cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) + goto out0; + + /* If we're already a leaf, jump out. */ + if (level == 0) goto out0; /* @@ -3757,40 +3955,20 @@ xfs_btree_kill_iroot( ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); #endif - index = numrecs - cur->bc_ops->get_maxrecs(cur, level); - if (index) { - xfs_iroot_realloc(cur->bc_ino.ip, index, - cur->bc_ino.whichfork); - block = ifp->if_broot; - } - - be16_add_cpu(&block->bb_numrecs, index); - ASSERT(block->bb_numrecs == cblock->bb_numrecs); - - kp = xfs_btree_key_addr(cur, 1, block); - ckp = xfs_btree_key_addr(cur, 1, cblock); - xfs_btree_copy_keys(cur, kp, ckp, numrecs); - - pp = xfs_btree_ptr_addr(cur, 1, block); - cpp = xfs_btree_ptr_addr(cur, 1, cblock); - - for (i = 0; i < numrecs; i++) { - error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); + if (level > 1) { + error = xfs_btree_demote_node_child(cur, cblock, level, + numrecs); if (error) return error; - } - - xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + } else + xfs_btree_demote_leaf_child(cur, cblock, numrecs); error = xfs_btree_free_block(cur, cbp); if (error) return error; - cur->bc_levels[level - 1].bp = NULL; - be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); - cur->bc_nlevels--; out0: return 0; } @@ -3944,10 +4122,10 @@ xfs_btree_delrec( /* * We're at the root level. First, shrink the root block in-memory. * Try to get rid of the next level down. If we can't then there's - * nothing left to do. + * nothing left to do. numrecs was decremented above. 
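xfs_btree_kill_iroot() above now absorbs the root's only child through the demote helpers, and the gate on when that collapse is allowed differs by btree flavor: roots that may hold records (XFS_BTGEO_IROOT_RECORDS) can shrink all the way to a leaf root, while bmbt-style roots must stop one level short because they forbid records in the root. A compact sketch of that decision:

#include <stdio.h>
#include <stdbool.h>

/* Mirror of the new gating: which roots may absorb their only child? */
static bool toy_can_kill_iroot(int nlevels, bool iroot_records)
{
	int level = nlevels - 1;

	if (level == 0)
		return false;	/* already a (record-bearing) leaf root */
	if (level == 1 && !iroot_records)
		return false;	/* bmbt-style roots must not hold records */
	return true;
}

int main(void)
{
	printf("bmbt-style 2-level root: %d\n", toy_can_kill_iroot(2, false));
	printf("record-capable 2-level root: %d\n", toy_can_kill_iroot(2, true));
	return 0;
}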
*/ if (xfs_btree_at_iroot(cur, level)) { - xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork); + cur->bc_ops->broot_realloc(cur, numrecs); error = xfs_btree_kill_iroot(cur); if (error) @@ -4745,7 +4923,7 @@ xfs_btree_agblock_v5hdr_verify( return __this_address; if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; - if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag)) return __this_address; return NULL; } @@ -5156,7 +5334,7 @@ xfs_btree_count_blocks_helper( int level, void *data) { - xfs_extlen_t *blocks = data; + xfs_filblks_t *blocks = data; (*blocks)++; return 0; @@ -5166,7 +5344,7 @@ xfs_btree_count_blocks_helper( int xfs_btree_count_blocks( struct xfs_btree_cur *cur, - xfs_extlen_t *blocks) + xfs_filblks_t *blocks) { *blocks = 0; return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, @@ -5355,6 +5533,12 @@ xfs_btree_init_cur_caches(void) error = xfs_refcountbt_init_cur_cache(); if (error) goto err; + error = xfs_rtrmapbt_init_cur_cache(); + if (error) + goto err; + error = xfs_rtrefcountbt_init_cur_cache(); + if (error) + goto err; return 0; err: @@ -5371,6 +5555,8 @@ xfs_btree_destroy_cur_caches(void) xfs_bmbt_destroy_cur_cache(); xfs_rmapbt_destroy_cur_cache(); xfs_refcountbt_destroy_cur_cache(); + xfs_rtrmapbt_destroy_cur_cache(); + xfs_rtrefcountbt_destroy_cur_cache(); } /* Move the btree cursor before the first record. */ @@ -5399,3 +5585,67 @@ xfs_btree_goto_left_edge( return 0; } + +/* Allocate a block for an inode-rooted metadata btree. */ +int +xfs_btree_alloc_metafile_block( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + struct xfs_alloc_arg args = { + .mp = cur->bc_mp, + .tp = cur->bc_tp, + .resv = XFS_AG_RESV_METAFILE, + .minlen = 1, + .maxlen = 1, + .prod = 1, + }; + struct xfs_inode *ip = cur->bc_ino.ip; + int error; + + ASSERT(xfs_is_metadir_inode(ip)); + + xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, cur->bc_ino.whichfork); + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(cur->bc_mp, ip->i_ino)); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) { + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + + xfs_metafile_resv_alloc_space(ip, &args); + + new->l = cpu_to_be64(args.fsbno); + *stat = 1; + return 0; +} + +/* Free a block from an inode-rooted metadata btree. 
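The metafile block allocator above draws single blocks from the XFS_AG_RESV_METAFILE reservation and records the usage via xfs_metafile_resv_alloc_space(); the free side just below defers the actual extent free and returns the usage via xfs_metafile_resv_free_space(). A toy counter pair showing only that pairing; the real reservation helpers track considerably more state, so this is just the shape of the bookkeeping:

#include <stdint.h>
#include <stdio.h>

/* Toy per-metafile reservation: blocks drawn at alloc, returned at free. */
struct toy_resv {
	uint64_t reserved;	/* blocks set aside for this metadata file */
	uint64_t used;		/* blocks currently allocated from it */
};

static int toy_resv_alloc(struct toy_resv *r, uint64_t len)
{
	if (r->used + len > r->reserved)
		return -1;	/* caller would have to replenish */
	r->used += len;
	return 0;
}

static void toy_resv_free(struct toy_resv *r, uint64_t len)
{
	r->used -= len;
}

int main(void)
{
	struct toy_resv r = { .reserved = 8 };

	toy_resv_alloc(&r, 1);	/* one btree block per call, as above */
	toy_resv_free(&r, 1);
	printf("used %llu of %llu\n", (unsigned long long)r.used,
	       (unsigned long long)r.reserved);
	return 0;
}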
*/ +int +xfs_btree_free_metafile_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip = cur->bc_ino.ip; + struct xfs_trans *tp = cur->bc_tp; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + int error; + + ASSERT(xfs_is_metadir_inode(ip)); + + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); + error = xfs_free_extent_later(tp, fsbno, 1, &oinfo, XFS_AG_RESV_METAFILE, + 0); + if (error) + return error; + + xfs_metafile_resv_free_space(ip, tp, 1); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 10b7ddc3b2b3..355b304696e6 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -135,7 +135,7 @@ struct xfs_btree_ops { /* offset of btree stats array */ unsigned int statoff; - /* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */ + /* sick mask for health reporting (not for bmap btrees) */ unsigned int sick_mask; /* cursor operations */ @@ -213,11 +213,27 @@ struct xfs_btree_ops { const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask); + + /* + * Reallocate the space for if_broot to fit the number of records. + * Move the records and pointers in if_broot to fit the new size. When + * shrinking this will eliminate holes between the records and pointers + * created by the caller. When growing this will create holes to be + * filled in by the caller. + * + * The caller must not request to add more records than would fit in + * the on-disk inode root. If the if_broot is currently NULL, then if + * we are adding records, one will be allocated. The caller must also + * not request that the number of records go below zero, although it + * can go to zero. + */ + struct xfs_btree_block *(*broot_realloc)(struct xfs_btree_cur *cur, + unsigned int new_numrecs); }; /* btree geometry flags */ #define XFS_BTGEO_OVERLAPPING (1U << 0) /* overlapping intervals */ - +#define XFS_BTGEO_IROOT_RECORDS (1U << 1) /* iroot can store records */ union xfs_btree_irec { struct xfs_alloc_rec_incore a; @@ -254,6 +270,7 @@ struct xfs_btree_cur union xfs_btree_irec bc_rec; /* current insert/search record value */ uint8_t bc_nlevels; /* number of levels in the tree */ uint8_t bc_maxlevels; /* maximum levels for this btree type */ + struct xfs_group *bc_group; /* per-type information */ union { @@ -264,13 +281,11 @@ struct xfs_btree_cur struct xbtree_ifakeroot *ifake; /* for staging cursor */ } bc_ino; struct { - struct xfs_perag *pag; struct xfs_buf *agbp; struct xbtree_afakeroot *afake; /* for staging cursor */ } bc_ag; struct { struct xfbtree *xfbtree; - struct xfs_perag *pag; } bc_mem; }; @@ -282,7 +297,7 @@ struct xfs_btree_cur struct { unsigned int nr_ops; /* # record updates */ unsigned int shape_changes; /* # of extent splits */ - } bc_refc; /* refcountbt */ + } bc_refc; /* refcountbt/rtrefcountbt */ }; /* Must be at the end of the struct! 
*/ @@ -485,7 +500,7 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data); -int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); +int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_filblks_t *blocks); union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); @@ -688,4 +703,10 @@ xfs_btree_at_iroot( level == cur->bc_nlevels - 1; } +int xfs_btree_alloc_metafile_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, union xfs_btree_ptr *newp, + int *stat); +int xfs_btree_free_metafile_block(struct xfs_btree_cur *cur, + struct xfs_buf *bp); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c index 036061fe32cc..f2f7b4305413 100644 --- a/fs/xfs/libxfs/xfs_btree_mem.c +++ b/fs/xfs/libxfs/xfs_btree_mem.c @@ -18,6 +18,7 @@ #include "xfs_ag.h" #include "xfs_buf_item.h" #include "xfs_trace.h" +#include "xfs_rtgroup.h" /* Set the root of an in-memory btree. */ void @@ -57,10 +58,8 @@ xfbtree_dup_cursor( ncur->bc_flags = cur->bc_flags; ncur->bc_nlevels = cur->bc_nlevels; ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree; - - if (cur->bc_mem.pag) - ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag); - + if (cur->bc_group) + ncur->bc_group = xfs_group_hold(cur->bc_group); return ncur; } diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 694929703152..5ed84f9cc877 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -134,6 +134,7 @@ xfs_btree_stage_ifakeroot( cur->bc_ino.ifake = ifake; cur->bc_nlevels = ifake->if_levels; cur->bc_ino.forksize = ifake->if_fork_size; + cur->bc_ino.whichfork = XFS_STAGING_FORK; cur->bc_flags |= XFS_BTREE_STAGING; } @@ -573,6 +574,7 @@ xfs_btree_bload_compute_geometry( struct xfs_btree_bload *bbl, uint64_t nr_records) { + const struct xfs_btree_ops *ops = cur->bc_ops; uint64_t nr_blocks = 0; uint64_t nr_this_level; @@ -599,7 +601,7 @@ xfs_btree_bload_compute_geometry( xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &level_blocks, &dontcare64); - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { + if (ops->type == XFS_BTREE_TYPE_INODE) { /* * If all the items we want to store at this level * would fit in the inode root block, then we have our @@ -607,7 +609,9 @@ xfs_btree_bload_compute_geometry( * * Note that bmap btrees forbid records in the root. 
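The xfs_btree_count_blocks() change above, switching from xfs_extlen_t to xfs_filblks_t, widens the accumulator to 64 bits so that block counts for very large inode-rooted btrees cannot wrap a 32-bit extent length. The visitor shape with the widened counter, as a runnable toy:

#include <stdint.h>
#include <stdio.h>

typedef int (*visit_fn)(int level, void *data);

/* Stand-in for xfs_btree_visit_blocks(): invoke @fn once per btree block. */
static int toy_visit_blocks(uint64_t nblocks, visit_fn fn, void *data)
{
	for (uint64_t i = 0; i < nblocks; i++) {
		int error = fn(0, data);

		if (error)
			return error;
	}
	return 0;
}

static int toy_count_helper(int level, void *data)
{
	/* 64-bit like xfs_filblks_t, so counts past 2^32 cannot wrap */
	uint64_t *blocks = data;

	(*blocks)++;
	return 0;
}

int main(void)
{
	uint64_t blocks = 0;

	toy_visit_blocks(10, toy_count_helper, &blocks);
	printf("%llu blocks\n", (unsigned long long)blocks);
	return 0;
}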
*/ - if (level != 0 && nr_this_level <= avg_per_block) { + if ((level != 0 || + (ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) && + nr_this_level <= avg_per_block) { nr_blocks++; break; } @@ -658,7 +662,7 @@ xfs_btree_bload_compute_geometry( return -EOVERFLOW; bbl->btree_height = cur->bc_nlevels; - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) + if (ops->type == XFS_BTREE_TYPE_INODE) bbl->nr_blocks = nr_blocks - 1; else bbl->nr_blocks = nr_blocks; diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 2cd212ad2c1d..5b377cbbb1f7 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -846,6 +846,12 @@ xfs_defer_add( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + if (!ops->finish_item) { + ASSERT(ops->finish_item != NULL); + xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); + return NULL; + } + dfp = xfs_defer_find_last(tp, ops); if (!dfp || !xfs_defer_can_append(dfp, ops)) dfp = xfs_defer_alloc(&tp->t_dfops, ops); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 8b338031e487..9effd95ddcd4 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -68,9 +68,12 @@ struct xfs_defer_op_type { extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; +extern const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; +extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +extern const struct xfs_defer_op_type xfs_rtextent_free_defer_type; extern const struct xfs_defer_op_type xfs_attr_defer_type; extern const struct xfs_defer_op_type xfs_exchmaps_defer_type; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 202468223bf9..1775abcfa04d 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -197,7 +197,7 @@ xfs_da_unmount( /* * Return 1 if directory contains only "." and "..". */ -int +static bool xfs_dir_isempty( xfs_inode_t *dp) { @@ -205,9 +205,9 @@ xfs_dir_isempty( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (dp->i_disk_size == 0) /* might happen during shutdown. 
*/ - return 1; + return true; if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) - return 0; + return false; sfp = dp->i_df.if_data; return !sfp->count; } @@ -379,12 +379,11 @@ xfs_dir_cilookup_result( !(args->op_flags & XFS_DA_OP_CILOOKUP)) return -EEXIST; - args->value = kmalloc(len, + args->value = kmemdup(name, len, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL); if (!args->value) return -ENOMEM; - memcpy(args->value, name, len); args->valuelen = len; return -EEXIST; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 576068ed81fa..a6594a5a941d 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -58,7 +58,6 @@ extern void xfs_dir_startup(void); extern int xfs_da_mount(struct xfs_mount *mp); extern void xfs_da_unmount(struct xfs_mount *mp); -extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 15a362e2f5ea..dceef2abd4e2 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -16,6 +16,9 @@ #include "xfs_trans.h" #include "xfs_qm.h" #include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_metadir.h" +#include "xfs_metafile.h" int xfs_calc_dquots_per_chunk( @@ -323,3 +326,190 @@ xfs_dquot_to_disk_ts( return cpu_to_be32(t); } + +inline unsigned int +xfs_dqinode_sick_mask(xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: + return XFS_SICK_FS_UQUOTA; + case XFS_DQTYPE_GROUP: + return XFS_SICK_FS_GQUOTA; + case XFS_DQTYPE_PROJ: + return XFS_SICK_FS_PQUOTA; + } + + ASSERT(0); + return 0; +} + +/* + * Load the inode for a given type of quota, assuming that the sb fields have + * been sorted out. This is not true when switching quota types on a V4 + * filesystem, so do not use this function for that. If metadir is enabled, + * @dp must be the /quota metadir. + * + * Returns -ENOENT if the quota inode field is NULLFSINO; 0 and an inode on + * success; or a negative errno. 
+ */ +int +xfs_dqinode_load( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dqtype_t type, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip; + enum xfs_metafile_type metafile_type = xfs_dqinode_metafile_type(type); + int error; + + if (!xfs_has_metadir(mp)) { + xfs_ino_t ino; + + switch (type) { + case XFS_DQTYPE_USER: + ino = mp->m_sb.sb_uquotino; + break; + case XFS_DQTYPE_GROUP: + ino = mp->m_sb.sb_gquotino; + break; + case XFS_DQTYPE_PROJ: + ino = mp->m_sb.sb_pquotino; + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + + /* Should have set 0 to NULLFSINO when loading superblock */ + if (ino == NULLFSINO) + return -ENOENT; + + error = xfs_trans_metafile_iget(tp, ino, metafile_type, &ip); + } else { + error = xfs_metadir_load(tp, dp, xfs_dqinode_path(type), + metafile_type, &ip); + if (error == -ENOENT) + return error; + } + if (error) { + if (xfs_metadata_is_sick(error)) + xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type)); + return error; + } + + if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE)) { + xfs_irele(ip); + xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type)); + return -EFSCORRUPTED; + } + + if (XFS_IS_CORRUPT(mp, ip->i_projid != 0)) { + xfs_irele(ip); + xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type)); + return -EFSCORRUPTED; + } + + *ipp = ip; + return 0; +} + +/* Create a metadata directory quota inode. */ +int +xfs_dqinode_metadir_create( + struct xfs_inode *dp, + xfs_dqtype_t type, + struct xfs_inode **ipp) +{ + struct xfs_metadir_update upd = { + .dp = dp, + .metafile_type = xfs_dqinode_metafile_type(type), + .path = xfs_dqinode_path(type), + }; + int error; + + error = xfs_metadir_start_create(&upd); + if (error) + return error; + + error = xfs_metadir_create(&upd, S_IFREG); + if (error) + return error; + + xfs_trans_log_inode(upd.tp, upd.ip, XFS_ILOG_CORE); + + error = xfs_metadir_commit(&upd); + if (error) + return error; + + xfs_finish_inode_setup(upd.ip); + *ipp = upd.ip; + return 0; +} + +#ifndef __KERNEL__ +/* Link a metadata directory quota inode. */ +int +xfs_dqinode_metadir_link( + struct xfs_inode *dp, + xfs_dqtype_t type, + struct xfs_inode *ip) +{ + struct xfs_metadir_update upd = { + .dp = dp, + .metafile_type = xfs_dqinode_metafile_type(type), + .path = xfs_dqinode_path(type), + .ip = ip, + }; + int error; + + error = xfs_metadir_start_link(&upd); + if (error) + return error; + + error = xfs_metadir_link(&upd); + if (error) + return error; + + xfs_trans_log_inode(upd.tp, upd.ip, XFS_ILOG_CORE); + + return xfs_metadir_commit(&upd); +} +#endif /* __KERNEL__ */ + +/* Create the parent directory for all quota inodes and load it. */ +int +xfs_dqinode_mkdir_parent( + struct xfs_mount *mp, + struct xfs_inode **dpp) +{ + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_mkdir(mp->m_metadirip, "quota", dpp); +} + +/* + * Load the parent directory of all quota inodes. Pass the inode to the caller + * because quota functions (e.g. QUOTARM) can be called on the quota files even + * if quotas are not enabled. 
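xfs_dqinode_load() above resolves the quota inode in two ways: pre-metadir filesystems read one fixed superblock field per quota type, with NULLFSINO mapping to -ENOENT, while metadir filesystems walk a path under the "quota" metadata directory. A userspace sketch of just that resolution step; the toy_* names, the -2 standing in for -ENOENT, and the per-type child path names are illustrative assumptions:

#include <stdint.h>
#include <stdio.h>

#define TOY_NULLFSINO	((uint64_t)-1)

enum toy_dqtype { TOY_DQ_USER, TOY_DQ_GROUP, TOY_DQ_PROJ };

struct toy_sb {
	int	 has_metadir;
	uint64_t uquotino, gquotino, pquotino;
};

/* Resolve the quota inode: fixed sb fields on old fs, path walk on metadir. */
static int toy_dqinode_resolve(const struct toy_sb *sb, enum toy_dqtype type,
		uint64_t *ino)
{
	if (!sb->has_metadir) {
		switch (type) {
		case TOY_DQ_USER:  *ino = sb->uquotino; break;
		case TOY_DQ_GROUP: *ino = sb->gquotino; break;
		case TOY_DQ_PROJ:  *ino = sb->pquotino; break;
		}
		return *ino == TOY_NULLFSINO ? -2 /* -ENOENT */ : 0;
	}
	/* metadir case: would look up quota/<type> in the metadir tree */
	return -2;
}

int main(void)
{
	struct toy_sb sb = { .uquotino = 131, .gquotino = TOY_NULLFSINO,
			     .pquotino = TOY_NULLFSINO };
	uint64_t ino = 0;
	int err = toy_dqinode_resolve(&sb, TOY_DQ_USER, &ino);

	printf("err %d ino %llu\n", err, (unsigned long long)ino);
	return 0;
}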
+ */ +int +xfs_dqinode_load_parent( + struct xfs_trans *tp, + struct xfs_inode **dpp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_load(tp, mp->m_metadirip, "quota", XFS_METAFILE_DIR, + dpp); +} diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 7002d7676a78..a53c5d40e084 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -64,7 +64,8 @@ #define XFS_ERRTAG_WB_DELAY_MS 42 #define XFS_ERRTAG_WRITE_DELAY_MS 43 #define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 -#define XFS_ERRTAG_MAX 45 +#define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45 +#define XFS_ERRTAG_MAX 46 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -113,5 +114,6 @@ #define XFS_RANDOM_WB_DELAY_MS 3000 #define XFS_RANDOM_WRITE_DELAY_MS 3000 #define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 +#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c index 2021396651de..3f1d6a98c118 100644 --- a/fs/xfs/libxfs/xfs_exchmaps.c +++ b/fs/xfs/libxfs/xfs_exchmaps.c @@ -662,7 +662,9 @@ xfs_exchmaps_rmapbt_blocks( if (!xfs_has_rmapbt(mp)) return 0; if (XFS_IS_REALTIME_INODE(req->ip1)) - return 0; + return howmany_64(req->nr_exchanges, + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * + XFS_RTRMAPADD_SPACE_RES(mp); return howmany_64(req->nr_exchanges, XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index e1bfee0c3b1a..9566a7623365 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -174,6 +174,15 @@ typedef struct xfs_sb { xfs_lsn_t sb_lsn; /* last write sequence */ uuid_t sb_meta_uuid; /* metadata file system unique id */ + xfs_ino_t sb_metadirino; /* metadata directory tree root */ + + xfs_rgnumber_t sb_rgcount; /* number of realtime groups */ + xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */ + uint8_t sb_rgblklog; /* rt group number shift */ + uint8_t sb_pad[7]; /* zeroes */ + xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */ + xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */ + /* must be padded to 64 bit alignment */ } xfs_sb_t; @@ -259,7 +268,20 @@ struct xfs_dsb { __be64 sb_lsn; /* last write sequence */ uuid_t sb_meta_uuid; /* metadata file system unique id */ - /* must be padded to 64 bit alignment */ + __be64 sb_metadirino; /* metadata directory tree root */ + __be32 sb_rgcount; /* # of realtime groups */ + __be32 sb_rgextents; /* size of rtgroup in rtx */ + __u8 sb_rgblklog; /* rt group number shift */ + __u8 sb_pad[7]; /* zeroes */ + __be64 sb_rtstart; /* start of internal RT section (FSB) */ + __be64 sb_rtreserved; /* reserved (zoned) RT blocks */ + + /* + * The size of this structure must be padded to 64 bit alignment. + * + * NOTE: Don't forget to update secondary_sb_whack in xfs_repair when + * adding new fields here. + */ }; #define XFS_SB_CRC_OFF offsetof(struct xfs_dsb, sb_crc) @@ -278,7 +300,7 @@ struct xfs_dsb { #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) -static inline bool xfs_sb_is_v5(struct xfs_sb *sbp) +static inline bool xfs_sb_is_v5(const struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } @@ -287,12 +309,12 @@ static inline bool xfs_sb_is_v5(struct xfs_sb *sbp) * Detect a mismatched features2 field. 
Older kernels read/wrote * this into the wrong slot, so to be safe we keep them in sync. */ -static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) +static inline bool xfs_sb_has_mismatched_features2(const struct xfs_sb *sbp) { return sbp->sb_bad_features2 != sbp->sb_features2; } -static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) +static inline bool xfs_sb_version_hasmorebits(const struct xfs_sb *sbp) { return xfs_sb_is_v5(sbp) || (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); @@ -342,8 +364,8 @@ static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp) #define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL static inline bool xfs_sb_has_compat_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_compat & feature) != 0; } @@ -360,8 +382,8 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_ro_compat & feature) != 0; } @@ -374,6 +396,10 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ #define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */ #define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */ +#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */ +#define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */ +#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */ + #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE | \ XFS_SB_FEAT_INCOMPAT_SPINODES | \ @@ -382,13 +408,16 @@ xfs_sb_has_ro_compat_feature( XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR | \ XFS_SB_FEAT_INCOMPAT_NREXT64 | \ XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \ - XFS_SB_FEAT_INCOMPAT_PARENT) + XFS_SB_FEAT_INCOMPAT_PARENT | \ + XFS_SB_FEAT_INCOMPAT_METADIR | \ + XFS_SB_FEAT_INCOMPAT_ZONED | \ + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool xfs_sb_has_incompat_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_incompat & feature) != 0; } @@ -399,8 +428,8 @@ xfs_sb_has_incompat_feature( #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL static inline bool xfs_sb_has_incompat_log_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_log_incompat & feature) != 0; } @@ -420,7 +449,7 @@ xfs_sb_add_incompat_log_features( sbp->sb_features_log_incompat |= features; } -static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp) +static inline bool xfs_sb_version_haslogxattrs(const struct xfs_sb *sbp) { return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat & XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); @@ -694,20 +723,57 @@ struct xfs_agfl { /* * Realtime bitmap information is accessed by the word, which is currently - * stored in host-endian format. + * stored in host-endian format. Starting with the realtime groups feature, + * the words are stored in be32 ondisk. */ union xfs_rtword_raw { __u32 old; + __be32 rtg; }; /* * Realtime summary counts are accessed by the word, which is currently - * stored in host-endian format. + * stored in host-endian format. Starting with the realtime groups feature, + * the words are stored in be32 ondisk. 
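The union just above (and xfs_suminfo_raw right below) captures the bitmap format fork: the very same 32-bit word is host-endian on pre-rtgroups filesystems but big-endian on disk once realtime groups are enabled. A minimal sketch of reading a word under both interpretations, using ntohl()/htonl() as portable be32 helpers:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>	/* ntohl()/htonl() as be32 conversion helpers */

/* Mirror of the on-disk union: same 32 bits, two interpretations. */
union toy_rtword {
	uint32_t old;	/* pre-rtgroups: host endian */
	uint32_t rtg;	/* rtgroups: big endian on disk */
};

static uint32_t toy_rtword_get(const union toy_rtword *w, int has_rtgroups)
{
	return has_rtgroups ? ntohl(w->rtg) : w->old;
}

int main(void)
{
	union toy_rtword w = { .rtg = htonl(0xf0f0aaaa) };

	printf("rtg view: %08x\n", toy_rtword_get(&w, 1));
	w.old = 0xf0f0aaaa;
	printf("old view: %08x\n", toy_rtword_get(&w, 0));
	return 0;
}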
*/ union xfs_suminfo_raw { __u32 old; + __be32 rtg; +}; + +/* + * Realtime allocation groups break the rt section into multiple pieces that + * could be locked independently. Realtime block group numbers are 32-bit + * quantities. Block numbers within a group are also 32-bit quantities, but + * the upper bit must never be set. rtgroup 0 might have a superblock in it, + * so the minimum size of an rtgroup is 2 rtx. + */ +#define XFS_MAX_RGBLOCKS ((xfs_rgblock_t)(1U << 31) - 1) +#define XFS_MIN_RGEXTENTS ((xfs_rtxlen_t)2) +#define XFS_MAX_RGNUMBER ((xfs_rgnumber_t)(-1U)) + +#define XFS_RTSB_MAGIC 0x46726F67 /* 'Frog' */ + +/* + * Realtime superblock - on disk version. Must be padded to 64 bit alignment. + * The first block of the realtime volume contains this superblock. + */ +struct xfs_rtsb { + __be32 rsb_magicnum; /* magic number == XFS_RTSB_MAGIC */ + __le32 rsb_crc; /* superblock crc */ + + __be32 rsb_pad; /* zero */ + unsigned char rsb_fname[XFSLABEL_MAX]; /* file system name */ + + uuid_t rsb_uuid; /* user-visible file system unique id */ + uuid_t rsb_meta_uuid; /* metadata file system unique id */ + + /* must be padded to 64 bit alignment */ }; +#define XFS_RTSB_CRC_OFF offsetof(struct xfs_rtsb, rsb_crc) +#define XFS_RTSB_DADDR ((xfs_daddr_t)0) /* daddr in rt section */ + /* * XFS Timestamps * ============== @@ -790,6 +856,31 @@ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds) return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET; } +enum xfs_metafile_type { + XFS_METAFILE_UNKNOWN, /* unknown */ + XFS_METAFILE_DIR, /* metadir directory */ + XFS_METAFILE_USRQUOTA, /* user quota */ + XFS_METAFILE_GRPQUOTA, /* group quota */ + XFS_METAFILE_PRJQUOTA, /* project quota */ + XFS_METAFILE_RTBITMAP, /* rt bitmap */ + XFS_METAFILE_RTSUMMARY, /* rt summary */ + XFS_METAFILE_RTRMAP, /* rt rmap */ + XFS_METAFILE_RTREFCOUNT, /* rt refcount */ + + XFS_METAFILE_MAX +} __packed; + +#define XFS_METAFILE_TYPE_STR \ + { XFS_METAFILE_UNKNOWN, "unknown" }, \ + { XFS_METAFILE_DIR, "dir" }, \ + { XFS_METAFILE_USRQUOTA, "usrquota" }, \ + { XFS_METAFILE_GRPQUOTA, "grpquota" }, \ + { XFS_METAFILE_PRJQUOTA, "prjquota" }, \ + { XFS_METAFILE_RTBITMAP, "rtbitmap" }, \ + { XFS_METAFILE_RTSUMMARY, "rtsummary" }, \ + { XFS_METAFILE_RTRMAP, "rtrmap" }, \ + { XFS_METAFILE_RTREFCOUNT, "rtrefcount" } + /* * On-disk inode structure. 
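The realtime superblock introduced above places a CRC-protected header with the 'Frog' magic at daddr 0 of the rt section. A toy magic check against that layout, assuming only the two leading fields matter for the demo and merely noting where the CRC at XFS_RTSB_CRC_OFF would be verified; it also shows that storing the magic big-endian is exactly what makes the first four disk bytes read as ASCII 'Frog':

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define TOY_RTSB_MAGIC	0x46726F67	/* 'Frog' */

/* Just the fields a magic check needs; the real struct carries more. */
struct toy_rtsb {
	uint32_t rsb_magicnum;	/* big endian on disk */
	uint32_t rsb_crc;
};

static int toy_rtsb_verify(const struct toy_rtsb *rsb)
{
	if (ntohl(rsb->rsb_magicnum) != TOY_RTSB_MAGIC)
		return -1;	/* -EFSCORRUPTED in a real verifier */
	/* a real verifier would also check the CRC stored in rsb_crc */
	return 0;
}

int main(void)
{
	struct toy_rtsb rsb = { .rsb_magicnum = htonl(TOY_RTSB_MAGIC) };

	printf("verify: %d\n", toy_rtsb_verify(&rsb));
	printf("magic spells \"%.4s\"\n", (const char *)&rsb.rsb_magicnum);
	return 0;
}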
* @@ -812,7 +903,7 @@ struct xfs_dinode { __be16 di_mode; /* mode and type of file */ __u8 di_version; /* inode version */ __u8 di_format; /* format of di_c data */ - __be16 di_onlink; /* old number of links to file */ + __be16 di_metatype; /* XFS_METAFILE_*; was di_onlink */ __be32 di_uid; /* owner's user id */ __be32 di_gid; /* owner's group id */ __be32 di_nlink; /* number of links to file */ @@ -868,7 +959,12 @@ struct xfs_dinode { __be64 di_changecount; /* number of attribute changes */ __be64 di_lsn; /* flush sequence */ __be64 di_flags2; /* more random flags */ - __be32 di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + __be32 di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + __be32 di_used_blocks; + }; __u8 di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ @@ -917,7 +1013,8 @@ enum xfs_dinode_fmt { XFS_DINODE_FMT_LOCAL, /* bulk data */ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ - XFS_DINODE_FMT_UUID /* added long ago, but never used */ + XFS_DINODE_FMT_UUID, /* added long ago, but never used */ + XFS_DINODE_FMT_META_BTREE, /* metadata btree */ }; #define XFS_INODE_FORMAT_STR \ @@ -925,7 +1022,8 @@ enum xfs_dinode_fmt { { XFS_DINODE_FMT_LOCAL, "local" }, \ { XFS_DINODE_FMT_EXTENTS, "extent" }, \ { XFS_DINODE_FMT_BTREE, "btree" }, \ - { XFS_DINODE_FMT_UUID, "uuid" } + { XFS_DINODE_FMT_UUID, "uuid" }, \ + { XFS_DINODE_FMT_META_BTREE, "meta_btree" } /* * Max values for extnum and aextnum. @@ -1088,21 +1186,60 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) * Values for di_flags2 These start by being exposed to userspace in the upper * 16 bits of the XFS_XFLAG_s range. */ -#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ -#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ -#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ -#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ -#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ +/* use DAX for this inode */ +#define XFS_DIFLAG2_DAX_BIT 0 + +/* file's blocks may be shared */ +#define XFS_DIFLAG2_REFLINK_BIT 1 + +/* copy on write extent size hint */ +#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 -#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) -#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) -#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) -#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) -#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) +/* big timestamps */ +#define XFS_DIFLAG2_BIGTIME_BIT 3 + +/* large extent counters */ +#define XFS_DIFLAG2_NREXT64_BIT 4 + +/* + * The inode contains filesystem metadata and can be found through the metadata + * directory tree. Metadata inodes must satisfy the following constraints: + * + * - V5 filesystem (and ftype) are enabled; + * - The only valid modes are regular files and directories; + * - The access bits must be zero; + * - DMAPI event and state masks are zero; + * - The user and group IDs must be zero; + * - The project ID can be used as a u32 annotation; + * - The immutable, sync, noatime, nodump, nodefrag flags must be set. + * - The dax flag must not be set. + * - Directories must have nosymlinks set. 
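+ *
+ * (xfs_dinode_verify_metadir, added later in this patch, encodes these
+ * rules; the mode and ID constraints, for example, reduce to:
+ *
+ *	if (!S_ISDIR(mode) && !S_ISREG(mode))
+ *		return __this_address;
+ *	if (mode & 0777)
+ *		return __this_address;
+ *	if (dip->di_uid || dip->di_gid)
+ *		return __this_address;
+ *
+ * in that verifier.)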
+ * + * These requirements are chosen defensively to minimize the ability of + * userspace to read or modify the contents, should a metadata file ever + * escape to userspace. + * + * There are further constraints on the directory tree itself: + * + * - Metadata inodes must never be resolvable through the root directory; + * - They must never be accessed by userspace; + * - Metadata directory entries must have correct ftype. + * + * Superblock-rooted metadata files must have the METADATA iflag set even + * though they do not have a parent directory. + */ +#define XFS_DIFLAG2_METADATA_BIT 5 + +#define XFS_DIFLAG2_DAX (1ULL << XFS_DIFLAG2_DAX_BIT) +#define XFS_DIFLAG2_REFLINK (1ULL << XFS_DIFLAG2_REFLINK_BIT) +#define XFS_DIFLAG2_COWEXTSIZE (1ULL << XFS_DIFLAG2_COWEXTSIZE_BIT) +#define XFS_DIFLAG2_BIGTIME (1ULL << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_NREXT64 (1ULL << XFS_DIFLAG2_NREXT64_BIT) +#define XFS_DIFLAG2_METADATA (1ULL << XFS_DIFLAG2_METADATA_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_METADATA) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { @@ -1117,6 +1254,12 @@ static inline bool xfs_dinode_has_large_extent_counts( (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64)); } +static inline bool xfs_dinode_is_metadir(const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA)); +} + /* * Inode number format: * low inopblog bits - offset in block @@ -1165,6 +1308,24 @@ static inline bool xfs_dinode_has_large_extent_counts( #define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ /* + * RT bit manipulation macros. + */ +#define XFS_RTBITMAP_MAGIC 0x424D505A /* BMPZ */ +#define XFS_RTSUMMARY_MAGIC 0x53554D59 /* SUMY */ + +struct xfs_rtbuf_blkinfo { + __be32 rt_magic; /* validity check on block */ + __be32 rt_crc; /* CRC of block */ + __be64 rt_owner; /* inode that owns the block */ + __be64 rt_blkno; /* first block of the buffer */ + __be64 rt_lsn; /* sequence number of last write */ + uuid_t rt_uuid; /* filesystem we belong to */ +}; + +#define XFS_RTBUF_CRC_OFF \ + offsetof(struct xfs_rtbuf_blkinfo, rt_crc) + +/* * Dquot and dquot block format definitions */ #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ @@ -1583,6 +1744,24 @@ typedef __be32 xfs_rmap_ptr_t; XFS_IBT_BLOCK(mp) + 1) /* + * Realtime Reverse mapping btree format definitions + * + * This is a btree for reverse mapping records for realtime volumes + */ +#define XFS_RTRMAP_CRC_MAGIC 0x4d415052 /* 'MAPR' */ + +/* + * rtrmap root header, on-disk form only. + */ +struct xfs_rtrmap_root { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +}; + +/* inode-based btree pointer type */ +typedef __be64 xfs_rtrmap_ptr_t; + +/* * Reference Count Btree format definitions * */ @@ -1625,12 +1804,29 @@ struct xfs_refcount_key { __be32 rc_startblock; /* starting block number */ }; -#define MAXREFCOUNT ((xfs_nlink_t)~0U) -#define MAXREFCEXTLEN ((xfs_extlen_t)~0U) +#define XFS_REFC_REFCOUNT_MAX ((xfs_nlink_t)~0U) +#define XFS_REFC_LEN_MAX ((xfs_extlen_t)~0U) /* btree pointer type */ typedef __be32 xfs_refcount_ptr_t; +/* + * Realtime Reference Count btree format definitions + * + * This is a btree for reference count records for realtime volumes + */ +#define XFS_RTREFC_CRC_MAGIC 0x52434e54 /* 'RCNT' */ + +/* + * rt refcount root header, on-disk form only. 
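+ *
+ * As with struct xfs_bmdr_block for the bmap btree, this header sits at
+ * the start of the inode data fork and is followed directly by packed
+ * records (level 0) or by key and pointer arrays (higher levels), rather
+ * than by a full on-disk btree block header.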
+ */ +struct xfs_rtrefcount_root { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +}; + +/* inode-rooted btree pointer type */ +typedef __be64 xfs_rtrefcount_ptr_t; /* * BMAP Btree format definitions diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 860284064c5a..12463ba766da 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -187,7 +187,11 @@ struct xfs_fsop_geom { __u32 logsunit; /* log stripe unit, bytes */ uint32_t sick; /* o: unhealthy fs & rt metadata */ uint32_t checked; /* o: checked fs & rt metadata */ - __u64 reserved[17]; /* reserved space */ + __u32 rgextents; /* rt extents in a realtime group */ + __u32 rgcount; /* number of realtime groups */ + __u64 rtstart; /* start of internal rt section */ + __u64 rtreserved; /* RT (zoned) reserved blocks */ + __u64 reserved[14]; /* reserved space */ }; #define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */ @@ -198,6 +202,8 @@ struct xfs_fsop_geom { #define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */ #define XFS_FSOP_GEOM_SICK_QUOTACHECK (1 << 6) /* quota counts */ #define XFS_FSOP_GEOM_SICK_NLINKS (1 << 7) /* inode link counts */ +#define XFS_FSOP_GEOM_SICK_METADIR (1 << 8) /* metadata directory */ +#define XFS_FSOP_GEOM_SICK_METAPATH (1 << 9) /* metadir tree path */ /* Output for XFS_FS_COUNTS */ typedef struct xfs_fsop_counts { @@ -242,6 +248,8 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */ #define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */ #define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */ +#define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */ +#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */ /* * Minimum and maximum sizes need for growth checks. @@ -489,9 +497,17 @@ struct xfs_bulk_ireq { */ #define XFS_BULK_IREQ_NREXT64 (1U << 2) +/* + * Allow bulkstat to return information about metadata directories. This + * enables xfs_scrub to find them for scanning, as they are otherwise ordinary + * directories. + */ +#define XFS_BULK_IREQ_METADIR (1U << 3) + #define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ XFS_BULK_IREQ_SPECIAL | \ - XFS_BULK_IREQ_NREXT64) + XFS_BULK_IREQ_NREXT64 | \ + XFS_BULK_IREQ_METADIR) /* Operate on the root directory inode. */ #define XFS_BULK_IREQ_SPECIAL_ROOT (1) @@ -722,9 +738,13 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */ #define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */ #define XFS_SCRUB_TYPE_DIRTREE 28 /* directory tree structure */ +#define XFS_SCRUB_TYPE_METAPATH 29 /* metadata directory tree paths */ +#define XFS_SCRUB_TYPE_RGSUPER 30 /* realtime superblock */ +#define XFS_SCRUB_TYPE_RTRMAPBT 31 /* rtgroup reverse mapping btree */ +#define XFS_SCRUB_TYPE_RTREFCBT 32 /* realtime reference count btree */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 29 +#define XFS_SCRUB_TYPE_NR 33 /* * This special type code only applies to the vectored scrub implementation. @@ -803,6 +823,24 @@ struct xfs_scrub_vec_head { #define XFS_SCRUB_VEC_FLAGS_ALL (0) /* + * i: sm_ino values for XFS_SCRUB_TYPE_METAPATH to select a metadata file for + * path checking. + */ +#define XFS_SCRUB_METAPATH_PROBE (0) /* do we have a metapath scrubber? 
*/ +#define XFS_SCRUB_METAPATH_RTDIR (1) /* rtgroups metadir */ +#define XFS_SCRUB_METAPATH_RTBITMAP (2) /* per-rtg bitmap */ +#define XFS_SCRUB_METAPATH_RTSUMMARY (3) /* per-rtg summary */ +#define XFS_SCRUB_METAPATH_QUOTADIR (4) /* quota metadir */ +#define XFS_SCRUB_METAPATH_USRQUOTA (5) /* user quota */ +#define XFS_SCRUB_METAPATH_GRPQUOTA (6) /* group quota */ +#define XFS_SCRUB_METAPATH_PRJQUOTA (7) /* project quota */ +#define XFS_SCRUB_METAPATH_RTRMAPBT (8) /* realtime reverse mapping */ +#define XFS_SCRUB_METAPATH_RTREFCOUNTBT (9) /* realtime refcount */ + +/* Number of metapath sm_ino values */ +#define XFS_SCRUB_METAPATH_NR (10) + +/* * ioctl limits */ #ifdef XATTR_LIST_MAX @@ -949,6 +987,23 @@ struct xfs_getparents_by_handle { }; /* + * Output for XFS_IOC_RTGROUP_GEOMETRY + */ +struct xfs_rtgroup_geometry { + __u32 rg_number; /* i/o: rtgroup number */ + __u32 rg_length; /* o: length in blocks */ + __u32 rg_sick; /* o: sick things in rtgroup */ + __u32 rg_checked; /* o: checked metadata in rtgroup */ + __u32 rg_flags; /* i/o: flags for this rtgroup */ + __u32 rg_reserved[27]; /* o: zero */ +}; +#define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */ +#define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */ +#define XFS_RTGROUP_GEOM_SICK_SUMMARY (1U << 2) /* rtsummary */ +#define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */ +#define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */ + +/* * ioctl commands that are used by Linux filesystems */ #define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS @@ -986,6 +1041,7 @@ struct xfs_getparents_by_handle { #define XFS_IOC_GETPARENTS _IOWR('X', 62, struct xfs_getparents) #define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle) #define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head) +#define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry) /* * ioctl commands that replace IRIX syssgi()'s */ @@ -1026,6 +1082,15 @@ struct xfs_getparents_by_handle { #define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ +/* + * Devices supported by a single XFS file system. Reported in the fsmap + * fmr_device field when using internal RT devices. + */ +enum xfs_device { + XFS_DEV_DATA = 1, + XFS_DEV_LOG = 2, + XFS_DEV_RT = 3, +}; #ifndef HAVE_BBMACROS /* diff --git a/fs/xfs/libxfs/xfs_group.c b/fs/xfs/libxfs/xfs_group.c new file mode 100644 index 000000000000..e9d76bcdc820 --- /dev/null +++ b/fs/xfs/libxfs/xfs_group.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Red Hat, Inc. + */ + +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_extent_busy.h" +#include "xfs_group.h" + +/* + * Groups can have passive and active references. + * + * For passive references the code freeing a group is responsible for cleaning + * up objects that hold the passive references (e.g. cached buffers). + * Routines manipulating passive references are xfs_group_get, xfs_group_hold + * and xfs_group_put. + * + * Active references are for short-term access to the group for walking trees or + * accessing state. If a group is being shrunk or offlined, the lookup will fail + * to find that group and return NULL instead. + * Routines manipulating active references are xfs_group_grab and + * xfs_group_rele. 
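+ *
+ * For illustration (a sketch, not part of this patch; walk_trees is a
+ * placeholder), a short walk of one group brackets its accesses with an
+ * active reference:
+ *
+ *	xg = xfs_group_grab(mp, index, type);
+ *	if (!xg)
+ *		return 0;	(group was shrunk away or offlined)
+ *	error = walk_trees(xg);
+ *	xfs_group_rele(xg);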
+ */ + +struct xfs_group * +xfs_group_get( + struct xfs_mount *mp, + uint32_t index, + enum xfs_group_type type) +{ + struct xfs_group *xg; + + rcu_read_lock(); + xg = xa_load(&mp->m_groups[type].xa, index); + if (xg) { + trace_xfs_group_get(xg, _RET_IP_); + ASSERT(atomic_read(&xg->xg_ref) >= 0); + atomic_inc(&xg->xg_ref); + } + rcu_read_unlock(); + return xg; +} + +struct xfs_group * +xfs_group_hold( + struct xfs_group *xg) +{ + ASSERT(atomic_read(&xg->xg_ref) > 0 || + atomic_read(&xg->xg_active_ref) > 0); + + trace_xfs_group_hold(xg, _RET_IP_); + atomic_inc(&xg->xg_ref); + return xg; +} + +void +xfs_group_put( + struct xfs_group *xg) +{ + trace_xfs_group_put(xg, _RET_IP_); + + ASSERT(atomic_read(&xg->xg_ref) > 0); + atomic_dec(&xg->xg_ref); +} + +struct xfs_group * +xfs_group_grab( + struct xfs_mount *mp, + uint32_t index, + enum xfs_group_type type) +{ + struct xfs_group *xg; + + rcu_read_lock(); + xg = xa_load(&mp->m_groups[type].xa, index); + if (xg) { + trace_xfs_group_grab(xg, _RET_IP_); + if (!atomic_inc_not_zero(&xg->xg_active_ref)) + xg = NULL; + } + rcu_read_unlock(); + return xg; +} + +/* + * Iterate to the next group. To start the iteration at @start_index, a %NULL + * @xg is passed, else the previous group returned from this function. The + * caller should break out of the loop when this returns %NULL. If the caller + * wants to break out of a loop that did not finish it needs to release the + * active reference to @xg using xfs_group_rele() itself. + */ +struct xfs_group * +xfs_group_next_range( + struct xfs_mount *mp, + struct xfs_group *xg, + uint32_t start_index, + uint32_t end_index, + enum xfs_group_type type) +{ + uint32_t index = start_index; + + if (xg) { + index = xg->xg_gno + 1; + xfs_group_rele(xg); + } + if (index > end_index) + return NULL; + return xfs_group_grab(mp, index, type); +} + +/* + * Find the next group after @xg, or the first group if @xg is NULL. 
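+ *
+ * The calling convention matches xfs_group_next_range above; a sketch of
+ * the intended loop (process is a placeholder):
+ *
+ *	struct xfs_group *xg = NULL;
+ *
+ *	while ((xg = xfs_group_grab_next_mark(mp, xg, mark, type))) {
+ *		error = process(xg);
+ *		if (error) {
+ *			xfs_group_rele(xg);	(early break drops the ref)
+ *			break;
+ *		}
+ *	}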
+ */ +struct xfs_group * +xfs_group_grab_next_mark( + struct xfs_mount *mp, + struct xfs_group *xg, + xa_mark_t mark, + enum xfs_group_type type) +{ + unsigned long index = 0; + + if (xg) { + index = xg->xg_gno + 1; + xfs_group_rele(xg); + } + + rcu_read_lock(); + xg = xa_find(&mp->m_groups[type].xa, &index, ULONG_MAX, mark); + if (xg) { + trace_xfs_group_grab_next_tag(xg, _RET_IP_); + if (!atomic_inc_not_zero(&xg->xg_active_ref)) + xg = NULL; + } + rcu_read_unlock(); + return xg; +} + +void +xfs_group_rele( + struct xfs_group *xg) +{ + trace_xfs_group_rele(xg, _RET_IP_); + atomic_dec(&xg->xg_active_ref); +} + +void +xfs_group_free( + struct xfs_mount *mp, + uint32_t index, + enum xfs_group_type type, + void (*uninit)(struct xfs_group *xg)) +{ + struct xfs_group *xg = xa_erase(&mp->m_groups[type].xa, index); + + XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_ref) != 0); + + xfs_defer_drain_free(&xg->xg_intents_drain); +#ifdef __KERNEL__ + kfree(xg->xg_busy_extents); +#endif + + if (uninit) + uninit(xg); + + /* drop the mount's active reference */ + xfs_group_rele(xg); + XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) != 0); + kfree_rcu_mightsleep(xg); +} + +int +xfs_group_insert( + struct xfs_mount *mp, + struct xfs_group *xg, + uint32_t index, + enum xfs_group_type type) +{ + int error; + + xg->xg_mount = mp; + xg->xg_gno = index; + xg->xg_type = type; + +#ifdef __KERNEL__ + xg->xg_busy_extents = xfs_extent_busy_alloc(); + if (!xg->xg_busy_extents) + return -ENOMEM; + spin_lock_init(&xg->xg_state_lock); + xfs_hooks_init(&xg->xg_rmap_update_hooks); +#endif + xfs_defer_drain_init(&xg->xg_intents_drain); + + /* Active ref owned by mount indicates group is online. */ + atomic_set(&xg->xg_active_ref, 1); + + error = xa_insert(&mp->m_groups[type].xa, index, xg, GFP_KERNEL); + if (error) { + WARN_ON_ONCE(error == -EBUSY); + goto out_drain; + } + + return 0; +out_drain: + xfs_defer_drain_free(&xg->xg_intents_drain); +#ifdef __KERNEL__ + kfree(xg->xg_busy_extents); +#endif + return error; +} + +struct xfs_group * +xfs_group_get_by_fsb( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + enum xfs_group_type type) +{ + return xfs_group_get(mp, xfs_fsb_to_gno(mp, fsbno, type), type); +} diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h new file mode 100644 index 000000000000..4423932a2313 --- /dev/null +++ b/fs/xfs/libxfs/xfs_group.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2018 Red Hat, Inc. + */ +#ifndef __LIBXFS_GROUP_H +#define __LIBXFS_GROUP_H 1 + +struct xfs_group { + struct xfs_mount *xg_mount; + uint32_t xg_gno; + enum xfs_group_type xg_type; + atomic_t xg_ref; /* passive reference count */ + atomic_t xg_active_ref; /* active reference count */ + + /* Precalculated geometry info */ + uint32_t xg_block_count; /* max usable gbno */ + uint32_t xg_min_gbno; /* min usable gbno */ + +#ifdef __KERNEL__ + /* -- kernel only structures below this line -- */ + + union { + /* + * For perags and non-zoned RT groups: + * Track freed but not yet committed extents. + */ + struct xfs_extent_busy_tree *xg_busy_extents; + + /* + * For zoned RT groups: + * List of groups that need a zone reset. + * + * The zonegc code forces a log flush of the rtrmap inode before + * resetting the write pointer, so there is no need for + * individual busy extent tracking. + */ + struct xfs_group *xg_next_reset; + }; + + /* + * Bitsets of per-ag metadata that have been checked and/or are sick. + * Callers should hold xg_state_lock before accessing this field. 
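+ *
+ * A reader pairs the two loads under the lock, e.g. (sketch):
+ *
+ *	spin_lock(&xg->xg_state_lock);
+ *	sick = xg->xg_sick;
+ *	checked = xg->xg_checked;
+ *	spin_unlock(&xg->xg_state_lock);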
+ */ + uint16_t xg_checked; + uint16_t xg_sick; + spinlock_t xg_state_lock; + + /* + * We use xfs_drain to track the number of deferred log intent items + * that have been queued (but not yet processed) so that waiters (e.g. + * scrub) will not lock resources when other threads are in the middle + * of processing a chain of intent items only to find momentary + * inconsistencies. + */ + struct xfs_defer_drain xg_intents_drain; + + /* + * Hook to feed rmapbt updates to an active online repair. + */ + struct xfs_hooks xg_rmap_update_hooks; +#endif /* __KERNEL__ */ +}; + +struct xfs_group *xfs_group_get(struct xfs_mount *mp, uint32_t index, + enum xfs_group_type type); +struct xfs_group *xfs_group_get_by_fsb(struct xfs_mount *mp, + xfs_fsblock_t fsbno, enum xfs_group_type type); +struct xfs_group *xfs_group_hold(struct xfs_group *xg); +void xfs_group_put(struct xfs_group *xg); + +struct xfs_group *xfs_group_grab(struct xfs_mount *mp, uint32_t index, + enum xfs_group_type type); +struct xfs_group *xfs_group_next_range(struct xfs_mount *mp, + struct xfs_group *xg, uint32_t start_index, uint32_t end_index, + enum xfs_group_type type); +struct xfs_group *xfs_group_grab_next_mark(struct xfs_mount *mp, + struct xfs_group *xg, xa_mark_t mark, enum xfs_group_type type); +void xfs_group_rele(struct xfs_group *xg); + +void xfs_group_free(struct xfs_mount *mp, uint32_t index, + enum xfs_group_type type, void (*uninit)(struct xfs_group *xg)); +int xfs_group_insert(struct xfs_mount *mp, struct xfs_group *xg, + uint32_t index, enum xfs_group_type); + +#define xfs_group_set_mark(_xg, _mark) \ + xa_set_mark(&(_xg)->xg_mount->m_groups[(_xg)->xg_type].xa, \ + (_xg)->xg_gno, (_mark)) +#define xfs_group_clear_mark(_xg, _mark) \ + xa_clear_mark(&(_xg)->xg_mount->m_groups[(_xg)->xg_type].xa, \ + (_xg)->xg_gno, (_mark)) +#define xfs_group_marked(_mp, _type, _mark) \ + xa_marked(&(_mp)->m_groups[(_type)].xa, (_mark)) + +static inline xfs_agblock_t +xfs_group_max_blocks( + struct xfs_group *xg) +{ + return xg->xg_mount->m_groups[xg->xg_type].blocks; +} + +static inline xfs_fsblock_t +xfs_group_start_fsb( + struct xfs_group *xg) +{ + return ((xfs_fsblock_t)xg->xg_gno) << + xg->xg_mount->m_groups[xg->xg_type].blklog; +} + +static inline xfs_fsblock_t +xfs_gbno_to_fsb( + struct xfs_group *xg, + xfs_agblock_t gbno) +{ + return xfs_group_start_fsb(xg) | gbno; +} + +static inline xfs_daddr_t +xfs_gbno_to_daddr( + struct xfs_group *xg, + xfs_agblock_t gbno) +{ + struct xfs_mount *mp = xg->xg_mount; + struct xfs_groups *g = &mp->m_groups[xg->xg_type]; + xfs_fsblock_t fsbno; + + if (g->has_daddr_gaps) + fsbno = xfs_gbno_to_fsb(xg, gbno); + else + fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno; + + return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno); +} + +static inline uint32_t +xfs_fsb_to_gno( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + enum xfs_group_type type) +{ + if (!mp->m_groups[type].blklog) + return 0; + return fsbno >> mp->m_groups[type].blklog; +} + +static inline xfs_agblock_t +xfs_fsb_to_gbno( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + enum xfs_group_type type) +{ + return fsbno & mp->m_groups[type].blkmask; +} + +static inline bool +xfs_verify_gbno( + struct xfs_group *xg, + uint32_t gbno) +{ + if (gbno >= xg->xg_block_count) + return false; + if (gbno < xg->xg_min_gbno) + return false; + return true; +} + +static inline bool +xfs_verify_gbext( + struct xfs_group *xg, + uint32_t gbno, + uint32_t glen) +{ + uint32_t end; + + if (!xfs_verify_gbno(xg, gbno)) + return false; + if (glen == 0 || 
check_add_overflow(gbno, glen - 1, &end)) + return false; + if (!xfs_verify_gbno(xg, end)) + return false; + return true; +} + +#endif /* __LIBXFS_GROUP_H */ diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index b0edb4288e59..b31000f7190c 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -6,6 +6,8 @@ #ifndef __XFS_HEALTH_H__ #define __XFS_HEALTH_H__ +struct xfs_group; + /* * In-Core Filesystem Health Assessments * ===================================== @@ -52,6 +54,7 @@ struct xfs_inode; struct xfs_fsop_geom; struct xfs_btree_cur; struct xfs_da_args; +struct xfs_rtgroup; /* Observable health issues for metadata spanning the entire filesystem. */ #define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */ @@ -60,10 +63,15 @@ struct xfs_da_args; #define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */ #define XFS_SICK_FS_QUOTACHECK (1 << 4) /* quota counts */ #define XFS_SICK_FS_NLINKS (1 << 5) /* inode link counts */ +#define XFS_SICK_FS_METADIR (1 << 6) /* metadata directory tree */ +#define XFS_SICK_FS_METAPATH (1 << 7) /* metadata directory tree path */ -/* Observable health issues for realtime volume metadata. */ -#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */ -#define XFS_SICK_RT_SUMMARY (1 << 1) /* realtime summary */ +/* Observable health issues for realtime group metadata. */ +#define XFS_SICK_RG_SUPER (1 << 0) /* rt group superblock */ +#define XFS_SICK_RG_BITMAP (1 << 1) /* rt group bitmap */ +#define XFS_SICK_RG_SUMMARY (1 << 2) /* rt groups summary */ +#define XFS_SICK_RG_RMAPBT (1 << 3) /* reverse mappings */ +#define XFS_SICK_RG_REFCNTBT (1 << 4) /* reference counts */ /* Observable health issues for AG metadata. */ #define XFS_SICK_AG_SB (1 << 0) /* superblock */ @@ -103,10 +111,15 @@ struct xfs_da_args; XFS_SICK_FS_GQUOTA | \ XFS_SICK_FS_PQUOTA | \ XFS_SICK_FS_QUOTACHECK | \ - XFS_SICK_FS_NLINKS) + XFS_SICK_FS_NLINKS | \ + XFS_SICK_FS_METADIR | \ + XFS_SICK_FS_METAPATH) -#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \ - XFS_SICK_RT_SUMMARY) +#define XFS_SICK_RG_PRIMARY (XFS_SICK_RG_SUPER | \ + XFS_SICK_RG_BITMAP | \ + XFS_SICK_RG_SUMMARY | \ + XFS_SICK_RG_RMAPBT | \ + XFS_SICK_RG_REFCNTBT) #define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \ XFS_SICK_AG_AGF | \ @@ -136,26 +149,26 @@ struct xfs_da_args; /* Secondary state related to (but not primary evidence of) health problems. */ #define XFS_SICK_FS_SECONDARY (0) -#define XFS_SICK_RT_SECONDARY (0) +#define XFS_SICK_RG_SECONDARY (0) #define XFS_SICK_AG_SECONDARY (0) #define XFS_SICK_INO_SECONDARY (XFS_SICK_INO_FORGET) /* Evidence of health problems elsewhere. */ #define XFS_SICK_FS_INDIRECT (0) -#define XFS_SICK_RT_INDIRECT (0) +#define XFS_SICK_RG_INDIRECT (0) #define XFS_SICK_AG_INDIRECT (XFS_SICK_AG_INODES) #define XFS_SICK_INO_INDIRECT (0) /* All health masks. 
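 *
 * For example (sketch; repair_the_bitmap is a placeholder), a scrub-style
 * caller might probe one realtime group with the helpers defined below:
 *
 *	if (xfs_rtgroup_has_sickness(rtg, XFS_SICK_RG_BITMAP))
 *		error = repair_the_bitmap(rtg);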
*/ -#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \ +#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \ XFS_SICK_FS_SECONDARY | \ XFS_SICK_FS_INDIRECT) -#define XFS_SICK_RT_ALL (XFS_SICK_RT_PRIMARY | \ - XFS_SICK_RT_SECONDARY | \ - XFS_SICK_RT_INDIRECT) +#define XFS_SICK_RG_ALL (XFS_SICK_RG_PRIMARY | \ + XFS_SICK_RG_SECONDARY | \ + XFS_SICK_RG_INDIRECT) -#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \ +#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \ XFS_SICK_AG_SECONDARY | \ XFS_SICK_AG_INDIRECT) @@ -189,18 +202,17 @@ void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask); void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick, unsigned int *checked); -void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask); -void xfs_rt_mark_corrupt(struct xfs_mount *mp, unsigned int mask); -void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask); -void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick, - unsigned int *checked); +void xfs_rgno_mark_sick(struct xfs_mount *mp, xfs_rgnumber_t rgno, + unsigned int mask); void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int mask); -void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask); -void xfs_ag_mark_corrupt(struct xfs_perag *pag, unsigned int mask); -void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask); -void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick, +void xfs_group_mark_sick(struct xfs_group *xg, unsigned int mask); +#define xfs_ag_mark_sick(pag, mask) \ + xfs_group_mark_sick(pag_group(pag), (mask)) +void xfs_group_mark_corrupt(struct xfs_group *xg, unsigned int mask); +void xfs_group_mark_healthy(struct xfs_group *xg, unsigned int mask); +void xfs_group_measure_sickness(struct xfs_group *xg, unsigned int *sick, unsigned int *checked); void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask); @@ -227,22 +239,25 @@ xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask) } static inline bool -xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask) +xfs_group_has_sickness( + struct xfs_group *xg, + unsigned int mask) { - unsigned int sick, checked; + unsigned int sick, checked; - xfs_rt_measure_sickness(mp, &sick, &checked); + xfs_group_measure_sickness(xg, &sick, &checked); return sick & mask; } -static inline bool -xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask) -{ - unsigned int sick, checked; +#define xfs_ag_has_sickness(pag, mask) \ + xfs_group_has_sickness(pag_group(pag), (mask)) +#define xfs_ag_is_healthy(pag) \ + (!xfs_ag_has_sickness((pag), UINT_MAX)) - xfs_ag_measure_sickness(pag, &sick, &checked); - return sick & mask; -} +#define xfs_rtgroup_has_sickness(rtg, mask) \ + xfs_group_has_sickness(rtg_group(rtg), (mask)) +#define xfs_rtgroup_is_healthy(rtg) \ + (!xfs_rtgroup_has_sickness((rtg), UINT_MAX)) static inline bool xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask) @@ -260,18 +275,6 @@ xfs_fs_is_healthy(struct xfs_mount *mp) } static inline bool -xfs_rt_is_healthy(struct xfs_mount *mp) -{ - return !xfs_rt_has_sickness(mp, -1U); -} - -static inline bool -xfs_ag_is_healthy(struct xfs_perag *pag) -{ - return !xfs_ag_has_sickness(pag, -1U); -} - -static inline bool xfs_inode_is_healthy(struct xfs_inode *ip) { return !xfs_inode_has_sickness(ip, -1U); @@ -279,6 +282,8 @@ xfs_inode_is_healthy(struct xfs_inode *ip) void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo); void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); +void 
xfs_rtgroup_geom_health(struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo); void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs); #define xfs_metadata_is_sick(error) \ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 271855227514..0c47b5c6ca7d 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -142,7 +142,7 @@ xfs_inobt_complain_bad_rec( xfs_warn(mp, "%sbt record corruption in AG %d detected at %pS!", - cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa); + cur->bc_ops->name, cur->bc_group->xg_gno, fa); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, @@ -170,7 +170,7 @@ xfs_inobt_get_rec( return error; xfs_inobt_btrec_to_irec(mp, rec, irec); - fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec); + fa = xfs_inobt_check_irec(to_perag(cur->bc_group), irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, irec); @@ -275,8 +275,10 @@ xfs_check_agi_freecount( } } while (i == 1); - if (!xfs_is_shutdown(cur->bc_mp)) - ASSERT(freecount == cur->bc_ag.pag->pagi_freecount); + if (!xfs_is_shutdown(cur->bc_mp)) { + ASSERT(freecount == + to_perag(cur->bc_group)->pagi_freecount); + } } return 0; } @@ -362,7 +364,7 @@ xfs_ialloc_inode_init( (j * M_IGEO(mp)->blocks_per_cluster)); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * M_IGEO(mp)->blocks_per_cluster, - XBF_UNMAPPED, &fbuf); + 0, &fbuf); if (error) return error; @@ -551,7 +553,7 @@ xfs_inobt_insert_sprec( struct xfs_buf *agbp, struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; int error; int i; @@ -606,15 +608,12 @@ xfs_inobt_insert_sprec( goto error; } - trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino, - rec.ir_holemask, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_pre(pag, &rec, nrec); /* merge to nrec to output the updated record */ __xfs_inobt_rec_merge(nrec, &rec); - trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_post(pag, nrec); error = xfs_inobt_rec_check_count(mp, nrec); if (error) @@ -648,7 +647,7 @@ xfs_finobt_insert_sprec( struct xfs_buf *agbp, struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; int error; int i; @@ -768,8 +767,7 @@ xfs_ialloc_ag_alloc( /* Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; error = xfs_alloc_vextent_exact_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - args.agbno)); + xfs_agbno_to_fsb(pag, args.agbno)); if (error) return error; @@ -811,8 +809,8 @@ xfs_ialloc_ag_alloc( */ args.minleft = igeo->inobt_maxlevels; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - be32_to_cpu(agi->agi_root))); + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); if (error) return error; } @@ -824,8 +822,8 @@ xfs_ialloc_ag_alloc( if (isaligned && args.fsbno == NULLFSBLOCK) { args.alignment = igeo->cluster_align; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - be32_to_cpu(agi->agi_root))); + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); if (error) return error; } @@ -855,13 +853,14 @@ sparse_alloc: * the end of the AG. 
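 *
 * For example (illustrative numbers only): with sb_inoalignmt == 4,
 * igeo->ialloc_blks == 16 and a 1000-block AG, max_agbno below works out
 * to round_down(1000, 4) - 16 == 984.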
*/ args.min_agbno = args.mp->m_sb.sb_inoalignmt; - args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, + args.max_agbno = round_down(xfs_ag_block_count(args.mp, + pag_agno(pag)), args.mp->m_sb.sb_inoalignmt) - igeo->ialloc_blks; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - be32_to_cpu(agi->agi_root))); + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); if (error) return error; @@ -884,7 +883,7 @@ sparse_alloc: * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag_agno(pag), args.agbno, args.len, get_random_u32()); if (error) @@ -915,8 +914,7 @@ sparse_alloc: if (error == -EFSCORRUPTED) { xfs_alert(args.mp, "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", - XFS_AGINO_TO_INO(args.mp, pag->pag_agno, - rec.ir_startino), + xfs_agino_to_ino(pag, rec.ir_startino), rec.ir_holemask, rec.ir_count); xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); } @@ -1076,7 +1074,7 @@ xfs_dialloc_check_ino( if (error) return -EAGAIN; - error = xfs_imap_to_bp(pag->pag_mount, tp, &imap, &bp); + error = xfs_imap_to_bp(pag_mount(pag), tp, &imap, &bp); if (error) return -EAGAIN; @@ -1127,7 +1125,7 @@ xfs_dialloc_ag_inobt( /* * If in the same AG as the parent, try to get near the parent. */ - if (pagno == pag->pag_agno) { + if (pagno == pag_agno(pag)) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ @@ -1335,7 +1333,7 @@ alloc_inode: ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + ino = xfs_agino_to_ino(pag, rec.ir_startino + offset); if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { error = xfs_dialloc_check_ino(pag, tp, ino); @@ -1604,7 +1602,7 @@ xfs_dialloc_ag( * parent. If so, find the closest available inode to the parent. If * not, consider the agi hint or find the first free inode in the AG. */ - if (pag->pag_agno == pagno) + if (pag_agno(pag) == pagno) error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); else error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); @@ -1616,7 +1614,7 @@ xfs_dialloc_ag( ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + ino = xfs_agino_to_ino(pag, rec.ir_startino + offset); if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { error = xfs_dialloc_check_ino(pag, tp, ino); @@ -1845,6 +1843,40 @@ out_release: } /* + * Pick an AG for the new inode. + * + * Directories, symlinks, and regular files frequently allocate at least one + * block, so factor that potential expansion when we examine whether an AG has + * enough space for file creation. Try to keep metadata files all in the same + * AG. 
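+ *
+ * For directories, the rotor below spreads new inodes across AGs:
+ * (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi cycles through
+ * AGs 0, 1, ..., m_maxagi - 1 on successive directory creations.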
+ */ +static inline xfs_agnumber_t +xfs_dialloc_pick_ag( + struct xfs_mount *mp, + struct xfs_inode *dp, + umode_t mode) +{ + xfs_agnumber_t start_agno; + + if (!dp) + return 0; + if (xfs_is_metadir_inode(dp)) { + if (mp->m_sb.sb_logstart) + return XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); + return 0; + } + + if (S_ISDIR(mode)) + return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi; + + start_agno = XFS_INO_TO_AGNO(mp, dp->i_ino); + if (start_agno >= mp->m_maxagi) + start_agno = 0; + + return start_agno; +} + +/* * Allocate an on-disk inode. * * Mode is used to tell whether the new inode is a directory and hence where to @@ -1859,31 +1891,19 @@ xfs_dialloc( xfs_ino_t *new_ino) { struct xfs_mount *mp = (*tpp)->t_mountp; + struct xfs_perag *pag; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_ino_t ino = NULLFSINO; xfs_ino_t parent = args->pip ? args->pip->i_ino : 0; - umode_t mode = args->mode & S_IFMT; xfs_agnumber_t agno; - int error = 0; xfs_agnumber_t start_agno; - struct xfs_perag *pag; - struct xfs_ino_geometry *igeo = M_IGEO(mp); + umode_t mode = args->mode & S_IFMT; bool ok_alloc = true; bool low_space = false; int flags; - xfs_ino_t ino = NULLFSINO; + int error = 0; - /* - * Directories, symlinks, and regular files frequently allocate at least - * one block, so factor that potential expansion when we examine whether - * an AG has enough space for file creation. - */ - if (S_ISDIR(mode)) - start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) % - mp->m_maxagi; - else { - start_agno = XFS_INO_TO_AGNO(mp, parent); - if (start_agno >= mp->m_maxagi) - start_agno = 0; - } + start_agno = xfs_dialloc_pick_ag(mp, args->pip, mode); /* * If we have already hit the ceiling of inode blocks then clear @@ -1907,7 +1927,7 @@ xfs_dialloc( * that we can immediately allocate, but then we allow allocation on the * second pass if we fail to find an AG with free inodes in it. 
*/ - if (percpu_counter_read_positive(&mp->m_fdblocks) < + if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) < mp->m_low_space[XFS_LOWSP_1_PCNT]) { ok_alloc = false; low_space = true; @@ -1974,7 +1994,7 @@ retry: static int xfs_difree_inode_chunk( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_inobt_rec_incore *rec) { struct xfs_mount *mp = tp->t_mountp; @@ -1988,8 +2008,7 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - return xfs_free_extent_later(tp, - XFS_AGB_TO_FSB(mp, agno, sagbno), + return xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0); } @@ -2035,9 +2054,9 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - error = xfs_free_extent_later(tp, - XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0); + error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, agbno), + contigblk, &XFS_RMAP_OINFO_INODES, + XFS_AG_RESV_NONE, 0); if (error) return error; @@ -2059,7 +2078,7 @@ xfs_difree_inobt( struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_agi *agi = agbp->b_addr; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; @@ -2124,8 +2143,7 @@ xfs_difree_inobt( if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { xic->deleted = true; - xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, - rec.ir_startino); + xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* @@ -2148,7 +2166,7 @@ xfs_difree_inobt( goto error0; } - error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); + error = xfs_difree_inode_chunk(tp, pag, &rec); if (error) goto error0; } else { @@ -2194,7 +2212,7 @@ xfs_difree_finobt( xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int offset = agino - ibtrec->ir_startino; @@ -2317,24 +2335,24 @@ xfs_difree( /* * Break up inode number into its components. 
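 *
 * Specifically, the checks below recompute agino = XFS_INO_TO_AGINO(mp,
 * inode) and agbno = XFS_AGINO_TO_AGBNO(mp, agino); each is a shift and
 * mask on the sb_inopblog/sb_agblklog geometry described by the inode
 * number format comment in xfs_format.h.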
*/ - if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) { - xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).", - __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno); + if (pag_agno(pag) != XFS_INO_TO_AGNO(mp, inode)) { + xfs_warn(mp, "%s: agno != pag_agno(pag) (%d != %d).", + __func__, XFS_INO_TO_AGNO(mp, inode), pag_agno(pag)); ASSERT(0); return -EINVAL; } agino = XFS_INO_TO_AGINO(mp, inode); - if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { - xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", + if (inode != xfs_agino_to_ino(pag, agino)) { + xfs_warn(mp, "%s: inode != xfs_agino_to_ino() (%llu != %llu).", __func__, (unsigned long long)inode, - (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); + (unsigned long long)xfs_agino_to_ino(pag, agino)); ASSERT(0); return -EINVAL; } agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agbno >= mp->m_sb.sb_agblocks) { - xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", - __func__, agbno, mp->m_sb.sb_agblocks); + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) { + xfs_warn(mp, "%s: agbno >= xfs_ag_block_count (%d >= %d).", + __func__, agbno, xfs_ag_block_count(mp, pag_agno(pag))); ASSERT(0); return -EINVAL; } @@ -2380,7 +2398,7 @@ xfs_imap_lookup( xfs_agblock_t *offset_agbno, int flags) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; @@ -2391,7 +2409,7 @@ xfs_imap_lookup( if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", - __func__, error, pag->pag_agno); + __func__, error, pag_agno(pag)); return error; } @@ -2441,7 +2459,7 @@ xfs_imap( struct xfs_imap *imap, /* location map structure */ uint flags) /* flags for inode btree lookup */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); xfs_agblock_t agbno; /* block number of inode in the alloc group */ xfs_agino_t agino; /* inode number within alloc group */ xfs_agblock_t chunk_agbno; /* first block in inode chunk */ @@ -2457,8 +2475,8 @@ xfs_imap( */ agino = XFS_INO_TO_AGINO(mp, ino); agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agbno >= mp->m_sb.sb_agblocks || - ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag)) || + ino != xfs_agino_to_ino(pag, agino)) { error = -EINVAL; #ifdef DEBUG /* @@ -2467,17 +2485,18 @@ xfs_imap( */ if (flags & XFS_IGET_UNTRUSTED) return error; - if (agbno >= mp->m_sb.sb_agblocks) { + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) { xfs_alert(mp, "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", __func__, (unsigned long long)agbno, - (unsigned long)mp->m_sb.sb_agblocks); + (unsigned long)xfs_ag_block_count(mp, + pag_agno(pag))); } - if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + if (ino != xfs_agino_to_ino(pag, agino)) { xfs_alert(mp, - "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", + "%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)", __func__, ino, - XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); + xfs_agino_to_ino(pag, agino)); } xfs_stack_trace(); #endif /* DEBUG */ @@ -2507,7 +2526,7 @@ xfs_imap( offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno); + imap->im_blkno = xfs_agbno_to_daddr(pag, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); @@ -2537,7 +2556,7 @@ out_map: offset = ((agbno - cluster_agbno) * 
mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno); + imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); @@ -2733,13 +2752,13 @@ xfs_read_agi( xfs_buf_flags_t flags, struct xfs_buf **agibpp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); int error; - trace_xfs_read_agi(pag->pag_mount, pag->pag_agno); + trace_xfs_read_agi(pag); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops); if (xfs_metadata_is_sick(error)) xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); @@ -2767,7 +2786,7 @@ xfs_ialloc_read_agi( struct xfs_agi *agi; int error; - trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno); + trace_xfs_ialloc_read_agi(pag); error = xfs_read_agi(pag, tp, (flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, @@ -2787,7 +2806,7 @@ xfs_ialloc_read_agi( * we are in the middle of a forced shutdown. */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - xfs_is_shutdown(pag->pag_mount)); + xfs_is_shutdown(pag_mount(pag))); if (agibpp) *agibpp = agibp; else @@ -2887,7 +2906,7 @@ xfs_ialloc_count_inodes_rec( xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); - fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, &irec); @@ -3126,13 +3145,13 @@ xfs_ialloc_check_shrink( int has; int error; - if (!xfs_has_sparseinodes(pag->pag_mount)) + if (!xfs_has_sparseinodes(pag_mount(pag))) return 0; cur = xfs_inobt_init_cursor(pag, tp, agibp); /* Look up the inobt record that would correspond to the new EOFS. 
*/ - agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length); + agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has); if (error || !has) goto out; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 401b42d52af6..6f270d8f4270 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -37,7 +37,7 @@ STATIC struct xfs_btree_cur * xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { - return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp, + return xfs_inobt_init_cursor(to_perag(cur->bc_group), cur->bc_tp, cur->bc_ag.agbp); } @@ -45,7 +45,7 @@ STATIC struct xfs_btree_cur * xfs_finobt_dup_cursor( struct xfs_btree_cur *cur) { - return xfs_finobt_init_cursor(cur->bc_ag.pag, cur->bc_tp, + return xfs_finobt_init_cursor(to_perag(cur->bc_group), cur->bc_tp, cur->bc_ag.agbp); } @@ -112,7 +112,7 @@ __xfs_inobt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - args.pag = cur->bc_ag.pag; + args.pag = to_perag(cur->bc_group); args.oinfo = XFS_RMAP_OINFO_INOBT; args.minlen = 1; args.maxlen = 1; @@ -120,7 +120,7 @@ __xfs_inobt_alloc_block( args.resv = resv; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, sbno)); + xfs_agbno_to_fsb(args.pag, sbno)); if (error) return error; @@ -248,7 +248,7 @@ xfs_inobt_init_ptr_from_cur( { struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_root; } @@ -260,7 +260,8 @@ xfs_finobt_init_ptr_from_cur( { struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agi->agi_seqno)); + ptr->s = agi->agi_free_root; } @@ -478,12 +479,12 @@ xfs_inobt_init_cursor( struct xfs_trans *tp, struct xfs_buf *agbp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; cur = xfs_btree_alloc_cursor(mp, tp, &xfs_inobt_ops, M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agi *agi = agbp->b_addr; @@ -504,12 +505,12 @@ xfs_finobt_init_cursor( struct xfs_trans *tp, struct xfs_buf *agbp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; cur = xfs_btree_alloc_cursor(mp, tp, &xfs_finobt_ops, M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agi *agi = agbp->b_addr; @@ -715,8 +716,8 @@ static xfs_extlen_t xfs_inobt_max_size( struct xfs_perag *pag) { - struct xfs_mount *mp = pag->pag_mount; - xfs_agblock_t agblocks = pag->block_count; + struct xfs_mount *mp = pag_mount(pag); + xfs_agblock_t agblocks = pag_group(pag)->xg_block_count; /* Bail out if we're uninitialized, which can happen in mkfs. */ if (M_IGEO(mp)->inobt_mxr[0] == 0) @@ -727,7 +728,7 @@ xfs_inobt_max_size( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. 
*/ - if (xfs_ag_contains_log(mp, pag->pag_agno)) + if (xfs_ag_contains_log(mp, pag_agno(pag))) agblocks -= mp->m_sb.sb_logblocks; return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, @@ -743,6 +744,7 @@ xfs_finobt_count_blocks( { struct xfs_buf *agbp = NULL; struct xfs_btree_cur *cur; + xfs_filblks_t blocks; int error; error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); @@ -750,9 +752,10 @@ xfs_finobt_count_blocks( return error; cur = xfs_finobt_init_cursor(pag, tp, agbp); - error = xfs_btree_count_blocks(cur, tree_blocks); + error = xfs_btree_count_blocks(cur, &blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); + *tree_blocks = blocks; return error; } @@ -791,10 +794,10 @@ xfs_finobt_calc_reserves( xfs_extlen_t tree_len = 0; int error; - if (!xfs_has_finobt(pag->pag_mount)) + if (!xfs_has_finobt(pag_mount(pag))) return 0; - if (xfs_has_inobtcounts(pag->pag_mount)) + if (xfs_has_inobtcounts(pag_mount(pag))) error = xfs_finobt_read_blocks(pag, tp, &tree_len); else error = xfs_finobt_count_blocks(pag, tp, &tree_len); diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 79babeac9d75..aa13fc00afd7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -19,6 +19,7 @@ #include "xfs_ialloc.h" #include "xfs_dir2.h" #include "xfs_health.h" +#include "xfs_metafile.h" #include <linux/iversion.h> @@ -136,7 +137,7 @@ xfs_imap_to_bp( int error; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops); + imap->im_len, 0, bpp, &xfs_inode_buf_ops); if (xfs_metadata_is_sick(error)) xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno), XFS_SICK_AG_INODES); @@ -209,12 +210,15 @@ xfs_inode_from_disk( * They will also be unconditionally written back to disk as v2 inodes. 
*/ if (unlikely(from->di_version == 1)) { - set_nlink(inode, be16_to_cpu(from->di_onlink)); + /* di_metatype used to be di_onlink */ + set_nlink(inode, be16_to_cpu(from->di_metatype)); ip->i_projid = 0; } else { set_nlink(inode, be32_to_cpu(from->di_nlink)); ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | be16_to_cpu(from->di_projid_lo); + if (xfs_dinode_is_metadir(from)) + ip->i_metatype = be16_to_cpu(from->di_metatype); } i_uid_write(inode, be32_to_cpu(from->di_uid)); @@ -248,7 +252,10 @@ xfs_inode_from_disk( be64_to_cpu(from->di_changecount)); ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); ip->i_diflags2 = be64_to_cpu(from->di_flags2); + /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = be32_to_cpu(from->di_cowextsize); + BUILD_BUG_ON(sizeof(from->di_cowextsize) != + sizeof(from->di_used_blocks)); } error = xfs_iformat_data_fork(ip, from); @@ -315,7 +322,10 @@ xfs_inode_to_disk( struct inode *inode = VFS_I(ip); to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); - to->di_onlink = 0; + if (xfs_is_metadir_inode(ip)) + to->di_metatype = cpu_to_be16(ip->i_metatype); + else + to->di_metatype = 0; to->di_format = xfs_ifork_format(&ip->i_df); to->di_uid = cpu_to_be32(i_uid_read(inode)); @@ -342,6 +352,7 @@ xfs_inode_to_disk( to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); to->di_flags2 = cpu_to_be64(ip->i_diflags2); + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = cpu_to_be32(ip->i_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); @@ -434,6 +445,30 @@ xfs_dinode_verify_fork( if (di_nextents > max_extents) return __this_address; break; + case XFS_DINODE_FMT_META_BTREE: + if (!xfs_has_metadir(mp)) + return __this_address; + if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA))) + return __this_address; + switch (be16_to_cpu(dip->di_metatype)) { + case XFS_METAFILE_RTRMAP: + /* + * growfs must create the rtrmap inodes before adding a + * realtime volume to the filesystem, so we cannot use + * the rtrmapbt predicate here. + */ + if (!xfs_has_rmapbt(mp)) + return __this_address; + break; + case XFS_METAFILE_RTREFCOUNT: + /* same comment about growfs and rmap inodes applies */ + if (!xfs_has_reflink(mp)) + return __this_address; + break; + default: + return __this_address; + } + break; default: return __this_address; } @@ -453,6 +488,10 @@ xfs_dinode_verify_forkoff( if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3)) return __this_address; break; + case XFS_DINODE_FMT_META_BTREE: + if (!xfs_has_metadir(mp) || !xfs_has_parent(mp)) + return __this_address; + fallthrough; case XFS_DINODE_FMT_LOCAL: /* fall through ... */ case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ case XFS_DINODE_FMT_BTREE: @@ -483,6 +522,69 @@ xfs_dinode_verify_nrext64( return NULL; } +/* + * Validate all the picky requirements we have for a file that claims to be + * filesystem metadata. 
+ */ +xfs_failaddr_t +xfs_dinode_verify_metadir( + struct xfs_mount *mp, + struct xfs_dinode *dip, + uint16_t mode, + uint16_t flags, + uint64_t flags2) +{ + if (!xfs_has_metadir(mp)) + return __this_address; + + /* V5 filesystem only */ + if (dip->di_version < 3) + return __this_address; + + if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX) + return __this_address; + + /* V3 inode fields that are always zero */ + if ((flags2 & XFS_DIFLAG2_NREXT64) && dip->di_nrext64_pad) + return __this_address; + if (!(flags2 & XFS_DIFLAG2_NREXT64) && dip->di_flushiter) + return __this_address; + + /* Metadata files can only be directories or regular files */ + if (!S_ISDIR(mode) && !S_ISREG(mode)) + return __this_address; + + /* They must have zero access permissions */ + if (mode & 0777) + return __this_address; + + /* DMAPI event and state masks are zero */ + if (dip->di_dmevmask || dip->di_dmstate) + return __this_address; + + /* + * User and group IDs must be zero. The project ID is used for + * grouping inodes. Metadata inodes are never accounted to quotas. + */ + if (dip->di_uid || dip->di_gid) + return __this_address; + + /* Mandatory inode flags must be set */ + if (S_ISDIR(mode)) { + if ((flags & XFS_METADIR_DIFLAGS) != XFS_METADIR_DIFLAGS) + return __this_address; + } else { + if ((flags & XFS_METAFILE_DIFLAGS) != XFS_METAFILE_DIFLAGS) + return __this_address; + } + + /* dax flags2 must not be set */ + if (flags2 & XFS_DIFLAG2_DAX) + return __this_address; + + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -523,8 +625,11 @@ xfs_dinode_verify( * di_nlink==0 on a V1 inode. V2/3 inodes would get written out with * di_onlink==0, so we can check that. */ - if (dip->di_version >= 2) { - if (dip->di_onlink) + if (dip->di_version == 2) { + if (dip->di_metatype) + return __this_address; + } else if (dip->di_version >= 3) { + if (!xfs_dinode_is_metadir(dip) && dip->di_metatype) return __this_address; } @@ -546,7 +651,8 @@ xfs_dinode_verify( if (dip->di_nlink) return __this_address; } else { - if (dip->di_onlink) + /* di_metatype used to be di_onlink */ + if (dip->di_metatype) return __this_address; } } @@ -563,9 +669,6 @@ xfs_dinode_verify( if (mode && nextents + naextents > nblocks) return __this_address; - if (nextents + naextents == 0 && nblocks != 0) - return __this_address; - if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; @@ -649,20 +752,40 @@ xfs_dinode_verify( return __this_address; /* don't let reflink and realtime mix */ - if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME) && + !xfs_has_rtreflink(mp)) return __this_address; - /* COW extent size hint validation */ - fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), - mode, flags, flags2); - if (fa) - return fa; + if (xfs_has_zoned(mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) { + if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents) + return __this_address; + } else { + /* COW extent size hint validation */ + fa = xfs_inode_validate_cowextsize(mp, + be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) + return fa; + } /* bigtime iflag can only happen on bigtime filesystems */ if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) return __this_address; + if (flags2 & XFS_DIFLAG2_METADATA) { + fa = xfs_dinode_verify_metadir(mp, dip, mode, flags, flags2); + if (fa) + return fa; + } + + /* metadata inodes containing btrees always 
have zero extent count */ + if (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK) != XFS_DINODE_FMT_META_BTREE) { + if (nextents + naextents == 0 && nblocks != 0) + return __this_address; + } + return NULL; } @@ -798,11 +921,29 @@ xfs_inode_validate_cowextsize( bool rt_flag; bool hint_flag; uint32_t cowextsize_bytes; + uint32_t blocksize_bytes; rt_flag = (flags & XFS_DIFLAG_REALTIME); hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); + /* + * Similar to extent size hints, a directory can be configured to + * propagate realtime status and a CoW extent size hint to newly + * created files even if there is no realtime device, and the hints on + * disk can become misaligned if the sysadmin changes the rt extent + * size while adding the realtime device. + * + * Therefore, we can only enforce the rextsize alignment check against + * regular realtime files, and rely on callers to decide when alignment + * checks are appropriate, and fix things up as needed. + */ + + if (rt_flag) + blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); + else + blocksize_bytes = mp->m_sb.sb_blocksize; + if (hint_flag && !xfs_has_reflink(mp)) return __this_address; @@ -816,16 +957,13 @@ xfs_inode_validate_cowextsize( if (mode && !hint_flag && cowextsize != 0) return __this_address; - if (hint_flag && rt_flag) - return __this_address; - - if (cowextsize_bytes % mp->m_sb.sb_blocksize) + if (cowextsize_bytes % blocksize_bytes) return __this_address; if (cowextsize > XFS_MAX_BMBT_EXTLEN) return __this_address; - if (cowextsize > mp->m_sb.sb_agblocks / 2) + if (!rt_flag && cowextsize > mp->m_sb.sb_agblocks / 2) return __this_address; return NULL; diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 585ed5a110af..8d43d2641c73 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -28,6 +28,9 @@ int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); +xfs_failaddr_t xfs_dinode_verify_metadir(struct xfs_mount *mp, + struct xfs_dinode *dip, uint16_t mode, uint16_t flags, + uint64_t flags2); xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, uint32_t extsize, uint16_t mode, uint16_t flags); xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 1158ca48626b..4f99b90add55 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -27,6 +27,8 @@ #include "xfs_errortag.h" #include "xfs_health.h" #include "xfs_symlink_remote.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" struct kmem_cache *xfs_ifork_cache; @@ -178,7 +180,7 @@ xfs_iformat_btree( struct xfs_mount *mp = ip->i_mount; xfs_bmdr_block_t *dfp; struct xfs_ifork *ifp; - /* REFERENCED */ + struct xfs_btree_block *broot; int nrecs; int size; int level; @@ -211,16 +213,13 @@ xfs_iformat_btree( return -EFSCORRUPTED; } - ifp->if_broot_bytes = size; - ifp->if_broot = kmalloc(size, - GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); - ASSERT(ifp->if_broot != NULL); + broot = xfs_broot_alloc(ifp, size); /* * Copy and convert from the on-disk structure * to the in-memory structure. 
*/ xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), - ifp->if_broot, size); + broot, size); ifp->if_bytes = 0; ifp->if_data = NULL; @@ -270,6 +269,16 @@ xfs_iformat_data_fork( return xfs_iformat_extents(ip, dip, XFS_DATA_FORK); case XFS_DINODE_FMT_BTREE: return xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + case XFS_DINODE_FMT_META_BTREE: + switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + return xfs_iformat_rtrmap(ip, dip); + case XFS_METAFILE_RTREFCOUNT: + return xfs_iformat_rtrefcount(ip, dip); + default: + break; + } + fallthrough; default: xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); @@ -363,135 +372,68 @@ xfs_iformat_attr_fork( } /* - * Reallocate the space for if_broot based on the number of records - * being added or deleted as indicated in rec_diff. Move the records - * and pointers in if_broot to fit the new size. When shrinking this - * will eliminate holes between the records and pointers created by - * the caller. When growing this will create holes to be filled in - * by the caller. - * - * The caller must not request to add more records than would fit in - * the on-disk inode root. If the if_broot is currently NULL, then - * if we are adding records, one will be allocated. The caller must also - * not request that the number of records go below zero, although - * it can go to zero. - * - * ip -- the inode whose if_broot area is changing - * ext_diff -- the change in the number of records, positive or negative, - * requested for the if_broot array. + * Allocate the if_broot component of an inode fork so that it is @new_size + * bytes in size, using __GFP_NOLOCKDEP like all the other code that + * initializes a broot during inode load. Returns if_broot. */ -void -xfs_iroot_realloc( - xfs_inode_t *ip, - int rec_diff, - int whichfork) +struct xfs_btree_block * +xfs_broot_alloc( + struct xfs_ifork *ifp, + size_t new_size) { - struct xfs_mount *mp = ip->i_mount; - int cur_max; - struct xfs_ifork *ifp; - struct xfs_btree_block *new_broot; - int new_max; - size_t new_size; - char *np; - char *op; + ASSERT(ifp->if_broot == NULL); - /* - * Handle the degenerate case quietly. - */ - if (rec_diff == 0) { - return; - } + ifp->if_broot = kmalloc(new_size, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); + ifp->if_broot_bytes = new_size; + return ifp->if_broot; +} - ifp = xfs_ifork_ptr(ip, whichfork); - if (rec_diff > 0) { - /* - * If there wasn't any memory allocated before, just - * allocate it now and get out. - */ - if (ifp->if_broot_bytes == 0) { - new_size = xfs_bmap_broot_space_calc(mp, rec_diff); - ifp->if_broot = kmalloc(new_size, - GFP_KERNEL | __GFP_NOFAIL); - ifp->if_broot_bytes = (int)new_size; - return; - } +/* + * Reallocate the if_broot component of an inode fork so that it is @new_size + * bytes in size. Returns if_broot. + */ +struct xfs_btree_block * +xfs_broot_realloc( + struct xfs_ifork *ifp, + size_t new_size) +{ + /* No size change? No action needed. */ + if (new_size == ifp->if_broot_bytes) + return ifp->if_broot; - /* - * If there is already an existing if_broot, then we need - * to realloc() it and shift the pointers to their new - * location. The records don't change location because - * they are kept butted up against the btree block header. 
- */ - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false); - new_max = cur_max + rec_diff; - new_size = xfs_bmap_broot_space_calc(mp, new_max); - ifp->if_broot = krealloc(ifp->if_broot, new_size, - GFP_KERNEL | __GFP_NOFAIL); - op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, - (int)new_size); - ifp->if_broot_bytes = (int)new_size; - ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= - xfs_inode_fork_size(ip, whichfork)); - memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t)); - return; + /* New size is zero, free it. */ + if (new_size == 0) { + ifp->if_broot_bytes = 0; + kfree(ifp->if_broot); + ifp->if_broot = NULL; + return NULL; } /* - * rec_diff is less than 0. In this case, we are shrinking the - * if_broot buffer. It must already exist. If we go to zero - * records, just get rid of the root and clear the status bit. + * Shrinking the iroot means we allocate a new smaller object and copy + * it. We don't trust krealloc not to nop on realloc-down. */ - ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false); - new_max = cur_max + rec_diff; - ASSERT(new_max >= 0); - if (new_max > 0) - new_size = xfs_bmap_broot_space_calc(mp, new_max); - else - new_size = 0; - if (new_size > 0) { - new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); - /* - * First copy over the btree block header. - */ - memcpy(new_broot, ifp->if_broot, - xfs_bmbt_block_len(ip->i_mount)); - } else { - new_broot = NULL; + if (ifp->if_broot_bytes > 0 && ifp->if_broot_bytes > new_size) { + struct xfs_btree_block *old_broot = ifp->if_broot; + + ifp->if_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); + ifp->if_broot_bytes = new_size; + memcpy(ifp->if_broot, old_broot, new_size); + kfree(old_broot); + return ifp->if_broot; } /* - * Only copy the keys and pointers if there are any. + * Growing the iroot means we can krealloc. This may get us the same + * object. */ - if (new_max > 0) { - /* - * First copy the keys. - */ - op = (char *)xfs_bmbt_key_addr(mp, ifp->if_broot, 1); - np = (char *)xfs_bmbt_key_addr(mp, new_broot, 1); - memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t)); - - /* - * Then copy the pointers. - */ - op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)xfs_bmap_broot_ptr_addr(mp, new_broot, 1, - (int)new_size); - memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); - } - kfree(ifp->if_broot); - ifp->if_broot = new_broot; - ifp->if_broot_bytes = (int)new_size; - if (ifp->if_broot) - ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= - xfs_inode_fork_size(ip, whichfork)); - return; + ifp->if_broot = krealloc(ifp->if_broot, new_size, + GFP_KERNEL | __GFP_NOFAIL); + ifp->if_broot_bytes = new_size; + return ifp->if_broot; } - /* * This is called when the amount of space needed for if_data * is increased or decreased. 
The change in size is indicated by @@ -671,6 +613,25 @@ xfs_iflush_fork( } break; + case XFS_DINODE_FMT_META_BTREE: + ASSERT(whichfork == XFS_DATA_FORK); + + if (!(iip->ili_fields & brootflag[whichfork])) + break; + + switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + xfs_iflush_rtrmap(ip, dip); + break; + case XFS_METAFILE_RTREFCOUNT: + xfs_iflush_rtrefcount(ip, dip); + break; + default: + ASSERT(0); + break; + } + break; + default: ASSERT(0); break; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 2373d12fd474..69ed0919d60b 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -170,7 +170,11 @@ void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, void xfs_idestroy_fork(struct xfs_ifork *ifp); void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, int whichfork); -void xfs_iroot_realloc(struct xfs_inode *, int, int); +struct xfs_btree_block *xfs_broot_alloc(struct xfs_ifork *ifp, + size_t new_size); +struct xfs_btree_block *xfs_broot_realloc(struct xfs_ifork *ifp, + size_t new_size); + int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, int); diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index cc38e1c3c3e1..48fe49a5f050 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -224,6 +224,8 @@ xfs_inode_inherit_flags2( } if (pip->i_diflags2 & XFS_DIFLAG2_DAX) ip->i_diflags2 |= XFS_DIFLAG2_DAX; + if (xfs_is_metadir_inode(pip)) + ip->i_diflags2 |= XFS_DIFLAG2_METADATA; /* Don't let invalid cowextsize hints propagate. */ failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize, @@ -320,6 +322,7 @@ xfs_inode_init( if (xfs_has_v3inodes(mp)) { inode_set_iversion(inode, 1); + /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = 0; times |= XFS_ICHGTIME_CREATE; } @@ -442,8 +445,8 @@ xfs_iunlink_update_bucket( ASSERT(xfs_verify_agino_or_null(pag, new_agino)); old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); - trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, - old_value, new_agino); + trace_xfs_iunlink_update_bucket(pag, bucket_index, old_value, + new_agino); /* * We should never find the head of the list already set to the value diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 3e6682ed656b..0d637c276db0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -248,6 +248,12 @@ typedef struct xfs_trans_header { #define XFS_LI_ATTRD 0x1247 /* attr set/remove done */ #define XFS_LI_XMI 0x1248 /* mapping exchange intent */ #define XFS_LI_XMD 0x1249 /* mapping exchange done */ +#define XFS_LI_EFI_RT 0x124a /* realtime extent free intent */ +#define XFS_LI_EFD_RT 0x124b /* realtime extent free done */ +#define XFS_LI_RUI_RT 0x124c /* realtime rmap update intent */ +#define XFS_LI_RUD_RT 0x124d /* realtime rmap update done */ +#define XFS_LI_CUI_RT 0x124e /* realtime refcount update intent */ +#define XFS_LI_CUD_RT 0x124f /* realtime refcount update done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -267,7 +273,13 @@ typedef struct xfs_trans_header { { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \ { XFS_LI_ATTRD, "XFS_LI_ATTRD" }, \ { XFS_LI_XMI, "XFS_LI_XMI" }, \ - { XFS_LI_XMD, "XFS_LI_XMD" } + { XFS_LI_XMD, "XFS_LI_XMD" }, \ + { XFS_LI_EFI_RT, "XFS_LI_EFI_RT" }, \ + { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" }, \ + { XFS_LI_RUI_RT, "XFS_LI_RUI_RT" }, \ + { 
XFS_LI_RUD_RT, "XFS_LI_RUD_RT" }, \ + { XFS_LI_CUI_RT, "XFS_LI_CUI_RT" }, \ + { XFS_LI_CUD_RT, "XFS_LI_CUD_RT" } /* * Inode Log Item Format definitions. @@ -347,12 +359,6 @@ struct xfs_inode_log_format_32 { */ #define XFS_ILOG_IVERSION 0x8000 -#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ - XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ - XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \ - XFS_ILOG_AOWNER) - #define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT) @@ -404,7 +410,7 @@ struct xfs_log_dinode { uint16_t di_mode; /* mode and type of file */ int8_t di_version; /* inode version */ int8_t di_format; /* format of di_c data */ - uint8_t di_pad3[2]; /* unused in v2/3 inodes */ + uint16_t di_metatype; /* metadata type, if DIFLAG2_METADATA */ uint32_t di_uid; /* owner's user id */ uint32_t di_gid; /* owner's group id */ uint32_t di_nlink; /* number of links to file */ @@ -469,7 +475,12 @@ struct xfs_log_dinode { xfs_lsn_t di_lsn; uint64_t di_flags2; /* more random flags */ - uint32_t di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + uint32_t di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + uint32_t di_used_blocks; + }; uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 521d327e4c89..66c7916fb5cd 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -77,6 +77,12 @@ extern const struct xlog_recover_item_ops xlog_attri_item_ops; extern const struct xlog_recover_item_ops xlog_attrd_item_ops; extern const struct xlog_recover_item_ops xlog_xmi_item_ops; extern const struct xlog_recover_item_ops xlog_xmd_item_ops; +extern const struct xlog_recover_item_ops xlog_rtefi_item_ops; +extern const struct xlog_recover_item_ops xlog_rtefd_item_ops; +extern const struct xlog_recover_item_ops xlog_rtrui_item_ops; +extern const struct xlog_recover_item_ops xlog_rtrud_item_ops; +extern const struct xlog_recover_item_ops xlog_rtcui_item_ops; +extern const struct xlog_recover_item_ops xlog_rtcud_item_ops; /* * Macros, structures, prototypes for internal log manager use. diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index d3bd6a86c8fe..34bba96d30ca 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -91,6 +91,7 @@ xfs_log_calc_trans_resv_for_minlogblocks( */ if (xfs_want_minlogsize_fixes(&mp->m_sb)) { xfs_trans_resv_calc(mp, resv); + resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend; return; } @@ -107,6 +108,9 @@ xfs_log_calc_trans_resv_for_minlogblocks( xfs_trans_resv_calc(mp, resv); + /* Copy the dynamic transaction reservation types from the running fs */ + resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend; + if (xfs_has_reflink(mp)) { /* * In the early days of reflink, typical log operation counts diff --git a/fs/xfs/libxfs/xfs_metadir.c b/fs/xfs/libxfs/xfs_metadir.c new file mode 100644 index 000000000000..178e89711cb7 --- /dev/null +++ b/fs/xfs/libxfs/xfs_metadir.c @@ -0,0 +1,485 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_metafile.h"
+#include "xfs_metadir.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_ialloc.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_parent.h"
+#include "xfs_health.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_alloc.h"
+
+/*
+ * Metadata Directory Tree
+ * =======================
+ *
+ * These functions provide an abstraction layer for looking up, creating, and
+ * deleting metadata inodes that live within a special metadata directory tree.
+ *
+ * This code does not manage the five existing metadata inodes: real time
+ * bitmap & summary; and the user, group, and project quota files.  All other
+ * metadata inodes must use only the xfs_meta{dir,file}_* functions.
+ *
+ * Callers wishing to create or hardlink a metadata inode must create an
+ * xfs_metadir_update structure, call the appropriate xfs_metadir* function,
+ * and then call xfs_metadir_commit or xfs_metadir_cancel to commit or cancel
+ * the update.  Files in the metadata directory tree currently cannot be
+ * unlinked.
+ *
+ * When the metadir feature is enabled, all metadata inodes must have the
+ * "metadata" inode flag set to prevent them from being exposed to the outside
+ * world.
+ *
+ * Callers must take the ILOCK of any inode in the metadata directory tree to
+ * synchronize access to that inode.  It is never necessary to take the IOLOCK
+ * or the MMAPLOCK since metadata inodes must not be exposed to user space.
+ */
+
+static inline void
+xfs_metadir_set_xname(
+	struct xfs_name		*xname,
+	const char		*path,
+	unsigned char		ftype)
+{
+	xname->name = (const unsigned char *)path;
+	xname->len = strlen(path);
+	xname->type = ftype;
+}
+
+/*
+ * Given a parent directory @dp and a metadata inode path component @xname,
+ * look up the inode number in the directory, returning it in @ino.
+ * @xname.type must match the directory entry's ftype.
+ *
+ * Caller must hold ILOCK_EXCL.
+ */
+static inline int
+xfs_metadir_lookup(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	struct xfs_name		*xname,
+	xfs_ino_t		*ino)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_da_args	args = {
+		.trans		= tp,
+		.dp		= dp,
+		.geo		= mp->m_dir_geo,
+		.name		= xname->name,
+		.namelen	= xname->len,
+		.hashval	= xfs_dir2_hashname(mp, xname),
+		.whichfork	= XFS_DATA_FORK,
+		.op_flags	= XFS_DA_OP_OKNOENT,
+		.owner		= dp->i_ino,
+	};
+	int			error;
+
+	if (!S_ISDIR(VFS_I(dp)->i_mode)) {
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+		return -EFSCORRUPTED;
+	}
+	if (xfs_is_shutdown(mp))
+		return -EIO;
+
+	error = xfs_dir_lookup_args(&args);
+	if (error)
+		return error;
+
+	if (!xfs_verify_ino(mp, args.inumber)) {
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+		return -EFSCORRUPTED;
+	}
+	if (xname->type != XFS_DIR3_FT_UNKNOWN && xname->type != args.filetype) {
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+		return -EFSCORRUPTED;
+	}
+
+	trace_xfs_metadir_lookup(dp, xname, args.inumber);
+	*ino = args.inumber;
+	return 0;
+}
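For a sense of how callers drive the lookup path, a minimal sketch follows. It is not part of the patch: the example_* name is invented, @dp is assumed to be the already-loaded "quota" metadata directory, and the "project" component is borrowed from xfs_dqinode_path() in xfs_quota_defs.h further down.

	/*
	 * Hypothetical sketch: read the project quota inode out of the
	 * metadata directory tree.  xfs_metadir_load() takes the ILOCK,
	 * resolves one path component, and igets the child with the
	 * expected metafile type.
	 */
	static int
	example_load_prjquota(
		struct xfs_trans	*tp,
		struct xfs_inode	*dp,
		struct xfs_inode	**ipp)
	{
		/* -ENOENT here means the component has not been created. */
		return xfs_metadir_load(tp, dp, "project",
				XFS_METAFILE_PRJQUOTA, ipp);
	}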
+
+/*
+ * Look up and read a metadata inode from the metadata directory.  If the path
+ * component doesn't exist, return -ENOENT.
+ */
+int
+xfs_metadir_load(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	const char		*path,
+	enum xfs_metafile_type	metafile_type,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_name		xname;
+	xfs_ino_t		ino;
+	int			error;
+
+	xfs_metadir_set_xname(&xname, path, XFS_DIR3_FT_UNKNOWN);
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+	error = xfs_metadir_lookup(tp, dp, &xname, &ino);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	if (error)
+		return error;
+	return xfs_trans_metafile_iget(tp, ino, metafile_type, ipp);
+}
+
+/*
+ * Unlock and release resources after committing (or cancelling) a metadata
+ * directory tree operation.  The caller retains its reference to @upd->ip
+ * and must release it explicitly.
+ */
+static inline void
+xfs_metadir_teardown(
+	struct xfs_metadir_update *upd,
+	int			error)
+{
+	trace_xfs_metadir_teardown(upd, error);
+
+	if (upd->ppargs) {
+		xfs_parent_finish(upd->dp->i_mount, upd->ppargs);
+		upd->ppargs = NULL;
+	}
+
+	if (upd->ip) {
+		if (upd->ip_locked)
+			xfs_iunlock(upd->ip, XFS_ILOCK_EXCL);
+		upd->ip_locked = false;
+	}
+
+	if (upd->dp_locked)
+		xfs_iunlock(upd->dp, XFS_ILOCK_EXCL);
+	upd->dp_locked = false;
+}
+
+/*
+ * Begin the process of creating a metadata file by allocating transactions
+ * and taking whatever resources we're going to need.
+ */
+int
+xfs_metadir_start_create(
+	struct xfs_metadir_update *upd)
+{
+	struct xfs_mount	*mp = upd->dp->i_mount;
+	int			error;
+
+	ASSERT(upd->dp != NULL);
+	ASSERT(upd->ip == NULL);
+	ASSERT(xfs_has_metadir(mp));
+	ASSERT(upd->metafile_type != XFS_METAFILE_UNKNOWN);
+
+	error = xfs_parent_start(mp, &upd->ppargs);
+	if (error)
+		return error;
+
+	/*
+	 * If we ever need the ability to create rt metadata files on a
+	 * pre-metadir filesystem, we'll need to dqattach the parent here.
+	 * Currently we assume that mkfs will create the files and quotacheck
+	 * will account for them.
+	 */
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
+			xfs_create_space_res(mp, MAXNAMELEN), 0, 0, &upd->tp);
+	if (error)
+		goto out_teardown;
+
+	/*
+	 * Lock the parent directory if there is one.  We can't ijoin it to
+	 * the transaction until after the child file has been created.
+	 */
+	xfs_ilock(upd->dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+	upd->dp_locked = true;
+
+	trace_xfs_metadir_start_create(upd);
+	return 0;
+out_teardown:
+	xfs_metadir_teardown(upd, error);
+	return error;
+}
+
+/*
+ * Create a metadata inode with the given @mode, and insert it into the
+ * metadata directory tree at the given @upd->path.  The path up to the final
+ * component must already exist.  The final path component must not exist.
+ *
+ * The new metadata inode will be attached to the update structure @upd->ip,
+ * with the ILOCK held until the caller releases it.
+ *
+ * NOTE: This function may return a new inode to the caller even if it returns
+ * a negative error code.  If an inode is passed back, the caller must finish
+ * setting up the inode before releasing it.
+ */
+int
+xfs_metadir_create(
+	struct xfs_metadir_update *upd,
+	umode_t			mode)
+{
+	struct xfs_icreate_args	args = {
+		.pip		= upd->dp,
+		.mode		= mode,
+	};
+	struct xfs_name		xname;
+	struct xfs_dir_update	du = {
+		.dp		= upd->dp,
+		.name		= &xname,
+		.ppargs		= upd->ppargs,
+	};
+	struct xfs_mount	*mp = upd->dp->i_mount;
+	xfs_ino_t		ino;
+	unsigned int		resblks;
+	int			error;
+
+	xfs_assert_ilocked(upd->dp, XFS_ILOCK_EXCL);
+
+	/* Check that the name does not already exist in the directory. */
+	xfs_metadir_set_xname(&xname, upd->path, XFS_DIR3_FT_UNKNOWN);
+	error = xfs_metadir_lookup(upd->tp, upd->dp, &xname, &ino);
+	switch (error) {
+	case -ENOENT:
+		break;
+	case 0:
+		error = -EEXIST;
+		fallthrough;
+	default:
+		return error;
+	}
+
+	/*
+	 * A newly created regular or special file just has one directory
+	 * entry pointing to it, but a directory also has the "." entry
+	 * pointing to itself.
+	 */
+	error = xfs_dialloc(&upd->tp, &args, &ino);
+	if (error)
+		return error;
+	error = xfs_icreate(upd->tp, ino, &args, &upd->ip);
+	if (error)
+		return error;
+	du.ip = upd->ip;
+	xfs_metafile_set_iflag(upd->tp, upd->ip, upd->metafile_type);
+	upd->ip_locked = true;
+
+	/*
+	 * Join the directory inode to the transaction.  We do not do it
+	 * earlier because xfs_dialloc rolls the transaction.
+	 */
+	xfs_trans_ijoin(upd->tp, upd->dp, 0);
+
+	/* Create the entry. */
+	if (S_ISDIR(args.mode))
+		resblks = xfs_mkdir_space_res(mp, xname.len);
+	else
+		resblks = xfs_create_space_res(mp, xname.len);
+	xname.type = xfs_mode_to_ftype(args.mode);
+
+	trace_xfs_metadir_try_create(upd);
+
+	error = xfs_dir_create_child(upd->tp, resblks, &du);
+	if (error)
+		return error;
+
+	/* Metadir files are not accounted to quota. */
+
+	trace_xfs_metadir_create(upd);
+
+	return 0;
+}
+
+#ifndef __KERNEL__
+/*
+ * Begin the process of linking a metadata file by allocating transactions
+ * and locking whatever resources we're going to need.
+ */
+int
+xfs_metadir_start_link(
+	struct xfs_metadir_update *upd)
+{
+	struct xfs_mount	*mp = upd->dp->i_mount;
+	unsigned int		resblks;
+	int			nospace_error = 0;
+	int			error;
+
+	ASSERT(upd->dp != NULL);
+	ASSERT(upd->ip != NULL);
+	ASSERT(xfs_has_metadir(mp));
+
+	error = xfs_parent_start(mp, &upd->ppargs);
+	if (error)
+		return error;
+
+	resblks = xfs_link_space_res(mp, MAXNAMELEN);
+	error = xfs_trans_alloc_dir(upd->dp, &M_RES(mp)->tr_link, upd->ip,
+			&resblks, &upd->tp, &nospace_error);
+	if (error)
+		goto out_teardown;
+	if (!resblks) {
+		/* We don't allow reservationless updates. */
+		xfs_trans_cancel(upd->tp);
+		upd->tp = NULL;
+		xfs_iunlock(upd->dp, XFS_ILOCK_EXCL);
+		xfs_iunlock(upd->ip, XFS_ILOCK_EXCL);
+		error = nospace_error;
+		goto out_teardown;
+	}
+
+	upd->dp_locked = true;
+	upd->ip_locked = true;
+
+	trace_xfs_metadir_start_link(upd);
+	return 0;
+out_teardown:
+	xfs_metadir_teardown(upd, error);
+	return error;
+}
+
+/*
+ * Link the metadata directory given by @path to the inode @upd->ip.
+ * The path (up to the final component) must already exist, but the final
+ * component must not already exist.
+ */
+int
+xfs_metadir_link(
+	struct xfs_metadir_update *upd)
+{
+	struct xfs_name		xname;
+	struct xfs_dir_update	du = {
+		.dp		= upd->dp,
+		.name		= &xname,
+		.ip		= upd->ip,
+		.ppargs		= upd->ppargs,
+	};
+	struct xfs_mount	*mp = upd->dp->i_mount;
+	xfs_ino_t		ino;
+	unsigned int		resblks;
+	int			error;
+
+	xfs_assert_ilocked(upd->dp, XFS_ILOCK_EXCL);
+	xfs_assert_ilocked(upd->ip, XFS_ILOCK_EXCL);
+
+	/* Look up the name in the current directory. */
+	xfs_metadir_set_xname(&xname, upd->path,
+			xfs_mode_to_ftype(VFS_I(upd->ip)->i_mode));
+	error = xfs_metadir_lookup(upd->tp, upd->dp, &xname, &ino);
+	switch (error) {
+	case -ENOENT:
+		break;
+	case 0:
+		error = -EEXIST;
+		fallthrough;
+	default:
+		return error;
+	}
+
+	resblks = xfs_link_space_res(mp, xname.len);
+	error = xfs_dir_add_child(upd->tp, resblks, &du);
+	if (error)
+		return error;
+
+	trace_xfs_metadir_link(upd);
+
+	return 0;
+}
+#endif /* ! __KERNEL__ */
+
+/* Commit a metadir update and unlock/drop all resources. */
+int
+xfs_metadir_commit(
+	struct xfs_metadir_update *upd)
+{
+	int			error;
+
+	trace_xfs_metadir_commit(upd);
+
+	error = xfs_trans_commit(upd->tp);
+	upd->tp = NULL;
+
+	xfs_metadir_teardown(upd, error);
+	return error;
+}
+
+/* Cancel a metadir update and unlock/drop all resources. */
+void
+xfs_metadir_cancel(
+	struct xfs_metadir_update *upd,
+	int			error)
+{
+	trace_xfs_metadir_cancel(upd);
+
+	xfs_trans_cancel(upd->tp);
+	upd->tp = NULL;
+
+	xfs_metadir_teardown(upd, error);
+}
+
+/* Create a metadata directory for the last component of the path. */
+int
+xfs_metadir_mkdir(
+	struct xfs_inode	*dp,
+	const char		*path,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_metadir_update upd = {
+		.dp		= dp,
+		.path		= path,
+		.metafile_type	= XFS_METAFILE_DIR,
+	};
+	int			error;
+
+	if (xfs_is_shutdown(dp->i_mount))
+		return -EIO;
+
+	/* Allocate a transaction to create the last directory. */
+	error = xfs_metadir_start_create(&upd);
+	if (error)
+		return error;
+
+	/* Create the subdirectory and take our reference. */
+	error = xfs_metadir_create(&upd, S_IFDIR);
+	if (error)
+		goto out_cancel;
+
+	error = xfs_metadir_commit(&upd);
+	if (error)
+		goto out_irele;
+
+	xfs_finish_inode_setup(upd.ip);
+	*ipp = upd.ip;
+	return 0;
+
+out_cancel:
+	xfs_metadir_cancel(&upd, error);
+out_irele:
+	/* Have to finish setting up the inode to ensure it's deleted. */
+	if (upd.ip) {
+		xfs_finish_inode_setup(upd.ip);
+		xfs_irele(upd.ip);
+	}
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_metadir.h b/fs/xfs/libxfs/xfs_metadir.h
new file mode 100644
index 000000000000..bfecac7d3d14
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_metadir.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_METADIR_H__
+#define __XFS_METADIR_H__
+
+/* Cleanup widget for metadata inode creation and deletion. */
+struct xfs_metadir_update {
+	/* Parent directory */
+	struct xfs_inode	*dp;
+
+	/* Path to metadata file */
+	const char		*path;
+
+	/* Parent pointer update context */
+	struct xfs_parent_args	*ppargs;
+
+	/* Child metadata file */
+	struct xfs_inode	*ip;
+
+	struct xfs_trans	*tp;
+
+	enum xfs_metafile_type	metafile_type;
+
+	unsigned int		dp_locked:1;
+	unsigned int		ip_locked:1;
+};
+
+int xfs_metadir_load(struct xfs_trans *tp, struct xfs_inode *dp,
+		const char *path, enum xfs_metafile_type metafile_type,
+		struct xfs_inode **ipp);
+
+int xfs_metadir_start_create(struct xfs_metadir_update *upd);
+int xfs_metadir_create(struct xfs_metadir_update *upd, umode_t mode);
+
+int xfs_metadir_start_link(struct xfs_metadir_update *upd);
+int xfs_metadir_link(struct xfs_metadir_update *upd);
+
+int xfs_metadir_commit(struct xfs_metadir_update *upd);
+void xfs_metadir_cancel(struct xfs_metadir_update *upd, int error);
+
+int xfs_metadir_mkdir(struct xfs_inode *dp, const char *path,
+		struct xfs_inode **ipp);
+
+#endif /* __XFS_METADIR_H__ */
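xfs_metadir_mkdir() above shows the directory case; a regular metadata file follows the same start/create/commit lifecycle the header declares. The sketch below is illustrative only: the function name is invented and XFS_METAFILE_USRQUOTA is an assumed type, but every call it makes appears in this patch.

	/*
	 * Hypothetical sketch: create one regular metadata file at the
	 * last component of @path, mirroring xfs_metadir_mkdir() but
	 * with S_IFREG.
	 */
	static int
	example_create_metafile(
		struct xfs_inode	*dp,
		const char		*path,
		struct xfs_inode	**ipp)
	{
		struct xfs_metadir_update upd = {
			.dp		= dp,
			.path		= path,
			.metafile_type	= XFS_METAFILE_USRQUOTA, /* assumed */
		};
		int			error;

		error = xfs_metadir_start_create(&upd);
		if (error)
			return error;

		/* No access bits, per the dinode verifier's rules. */
		error = xfs_metadir_create(&upd, S_IFREG);
		if (error)
			goto out_cancel;

		error = xfs_metadir_commit(&upd);
		if (error)
			goto out_irele;

		xfs_finish_inode_setup(upd.ip);
		*ipp = upd.ip;
		return 0;

	out_cancel:
		xfs_metadir_cancel(&upd, error);
	out_irele:
		/* xfs_metadir_create may pass back an inode on failure. */
		if (upd.ip) {
			xfs_finish_inode_setup(upd.ip);
			xfs_irele(upd.ip);
		}
		return error;
	}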
diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c
new file mode 100644
index 000000000000..225923e463c4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_metafile.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_metafile.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
+
+static const struct {
+	enum xfs_metafile_type	mtype;
+	const char		*name;
+} xfs_metafile_type_strs[] = { XFS_METAFILE_TYPE_STR };
+
+const char *
+xfs_metafile_type_str(enum xfs_metafile_type metatype)
+{
+	unsigned int	i;
+
+	for (i = 0; i < ARRAY_SIZE(xfs_metafile_type_strs); i++) {
+		if (xfs_metafile_type_strs[i].mtype == metatype)
+			return xfs_metafile_type_strs[i].name;
+	}
+
+	return NULL;
+}
+
+/* Set up an inode to be recognized as a metadata directory inode. */
+void
+xfs_metafile_set_iflag(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	enum xfs_metafile_type	metafile_type)
+{
+	VFS_I(ip)->i_mode &= ~0777;
+	VFS_I(ip)->i_uid = GLOBAL_ROOT_UID;
+	VFS_I(ip)->i_gid = GLOBAL_ROOT_GID;
+	if (S_ISDIR(VFS_I(ip)->i_mode))
+		ip->i_diflags |= XFS_METADIR_DIFLAGS;
+	else
+		ip->i_diflags |= XFS_METAFILE_DIFLAGS;
+	ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
+	ip->i_diflags2 |= XFS_DIFLAG2_METADATA;
+	ip->i_metatype = metafile_type;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/* Clear the metadata directory inode flag. */
+void
+xfs_metafile_clear_iflag(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	ASSERT(xfs_is_metadir_inode(ip));
+	ASSERT(VFS_I(ip)->i_nlink == 0);
+
+	ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
+ * Is the metafile reservation at or beneath a certain threshold?
+ */
+static inline bool
+xfs_metafile_resv_can_cover(
+	struct xfs_mount	*mp,
+	int64_t			rhs)
+{
+	/*
+	 * The amount of space that can be allocated to this metadata file is
+	 * the remaining reservation for the particular metadata file + the
+	 * global free block count.  Take care of the first case to avoid
+	 * touching the per-cpu counter.
+	 */
+	if (mp->m_metafile_resv_avail >= rhs)
+		return true;
+
+	/*
+	 * There aren't enough blocks left in the inode's reservation, but it
+	 * isn't critical unless there also isn't enough free space.
+	 */
+	return xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
+			rhs - mp->m_metafile_resv_avail, 2048) >= 0;
+}
+
+/*
+ * Is the metafile reservation critically low on blocks?  For now we'll define
+ * that as the number of blocks we can get our hands on being less than 10% of
+ * what we reserved or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_metafile_resv_critical(
+	struct xfs_mount	*mp)
+{
+	ASSERT(xfs_has_metadir(mp));
+
+	trace_xfs_metafile_resv_critical(mp, 0);
+
+	if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels))
+		return true;
+
+	if (!xfs_metafile_resv_can_cover(mp,
+			div_u64(mp->m_metafile_resv_target, 10)))
+		return true;
+
+	return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
+}
+
+/* Allocate a block from the metadata file's reservation. */
+void
+xfs_metafile_resv_alloc_space(
+	struct xfs_inode	*ip,
+	struct xfs_alloc_arg	*args)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	int64_t			len = args->len;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+	ASSERT(args->resv == XFS_AG_RESV_METAFILE);
+
+	trace_xfs_metafile_resv_alloc_space(mp, args->len);
+
+	/*
+	 * Allocate the blocks from the metadata inode's block reservation
+	 * and update the ondisk sb counter.
+	 */
+	mutex_lock(&mp->m_metafile_resv_lock);
+	if (mp->m_metafile_resv_avail > 0) {
+		int64_t		from_resv;
+
+		from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail);
+		mp->m_metafile_resv_avail -= from_resv;
+		xfs_mod_delalloc(ip, 0, -from_resv);
+		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS,
+				-from_resv);
+		len -= from_resv;
+	}
+
+	/*
+	 * Any allocation in excess of the reservation requires in-core and
+	 * on-disk fdblocks updates.  If we can grab @len blocks from the
+	 * in-core fdblocks then all we need to do is update the on-disk
+	 * superblock; if not, then try to steal some from the transaction's
+	 * block reservation.  Overruns are only expected for rmap btrees.
+	 */
+	if (len) {
+		unsigned int	field;
+		int		error;
+
+		error = xfs_dec_fdblocks(ip->i_mount, len, true);
+		if (error)
+			field = XFS_TRANS_SB_FDBLOCKS;
+		else
+			field = XFS_TRANS_SB_RES_FDBLOCKS;
+
+		xfs_trans_mod_sb(args->tp, field, -len);
+	}
+
+	mp->m_metafile_resv_used += args->len;
+	mutex_unlock(&mp->m_metafile_resv_lock);
+
+	ip->i_nblocks += args->len;
+	xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE);
+}
+
+/* Free a block to the metadata file's reservation. */
+void
+xfs_metafile_resv_free_space(
+	struct xfs_inode	*ip,
+	struct xfs_trans	*tp,
+	xfs_filblks_t		len)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	int64_t			to_resv;
+
+	ASSERT(xfs_is_metadir_inode(ip));
+
+	trace_xfs_metafile_resv_free_space(mp, len);
+
+	ip->i_nblocks -= len;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	mutex_lock(&mp->m_metafile_resv_lock);
+	mp->m_metafile_resv_used -= len;
+
+	/*
+	 * Add the freed blocks back into the inode's delalloc reservation
+	 * until it reaches the maximum size.  Update the ondisk fdblocks only.
+	 */
+	to_resv = mp->m_metafile_resv_target -
+		(mp->m_metafile_resv_used + mp->m_metafile_resv_avail);
+	if (to_resv > 0) {
+		to_resv = min_t(int64_t, to_resv, len);
+		mp->m_metafile_resv_avail += to_resv;
+		xfs_mod_delalloc(ip, 0, to_resv);
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv);
+		len -= to_resv;
+	}
+	mutex_unlock(&mp->m_metafile_resv_lock);
+
+	/*
+	 * Everything else goes back to the filesystem, so update the in-core
+	 * and on-disk counters.
+	 */
+	if (len)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len);
+}
+
+static void
+__xfs_metafile_resv_free(
+	struct xfs_mount	*mp)
+{
+	if (mp->m_metafile_resv_avail) {
+		xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail);
+		xfs_add_fdblocks(mp, mp->m_metafile_resv_avail);
+	}
+	mp->m_metafile_resv_avail = 0;
+	mp->m_metafile_resv_used = 0;
+	mp->m_metafile_resv_target = 0;
+}
+
+/* Release unused metafile space reservation. */
+void
+xfs_metafile_resv_free(
+	struct xfs_mount	*mp)
+{
+	if (!xfs_has_metadir(mp))
+		return;
+
+	trace_xfs_metafile_resv_free(mp, 0);
+
+	mutex_lock(&mp->m_metafile_resv_lock);
+	__xfs_metafile_resv_free(mp);
+	mutex_unlock(&mp->m_metafile_resv_lock);
+}
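Before the init function below, the interplay of the three counters is easier to see with concrete numbers. This standalone model (editorial; the block counts are invented) replays the arithmetic of resv_init, alloc_space, and free_space: avail holds reserved-but-unused blocks, used holds blocks owned by the btrees, and their sum is refilled back toward the target on free.

	#include <stdio.h>
	#include <stdint.h>

	static int64_t min64(int64_t a, int64_t b) { return a < b ? a : b; }

	int main(void)
	{
		/* assumed: 100-block target from calc_reserves, 40 in use */
		int64_t target = 100, used = 40;
		int64_t avail = target - used;	/* hidden_space = 60 */
		int64_t len, from_resv, to_resv;

		/* alloc_space for 10 blocks: drain the reservation first */
		len = 10;
		from_resv = min64(len, avail);
		avail -= from_resv;		/* avail = 50 */
		used += len;			/* used = 50 */

		/* free_space of 30 blocks: refill avail up to the target */
		len = 30;
		used -= len;			/* used = 20 */
		to_resv = target - (used + avail);	/* short by 30 */
		if (to_resv > 0) {
			to_resv = min64(to_resv, len);
			avail += to_resv;	/* avail = 80 */
			len -= to_resv;		/* 0 back to fdblocks */
		}

		printf("target=%lld used=%lld avail=%lld to_fdblocks=%lld\n",
				(long long)target, (long long)used,
				(long long)avail, (long long)len);
		return 0;
	}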
+
+/* Set up a metafile space reservation. */
+int
+xfs_metafile_resv_init(
+	struct xfs_mount	*mp)
+{
+	struct xfs_rtgroup	*rtg = NULL;
+	xfs_filblks_t		used = 0, target = 0;
+	xfs_filblks_t		hidden_space;
+	xfs_rfsblock_t		dblocks_avail = mp->m_sb.sb_dblocks / 4;
+	int			error = 0;
+
+	if (!xfs_has_metadir(mp))
+		return 0;
+
+	/*
+	 * Free any previous reservation to have a clean slate.
+	 */
+	mutex_lock(&mp->m_metafile_resv_lock);
+	__xfs_metafile_resv_free(mp);
+
+	/*
+	 * Currently the only btree metafiles that require reservations are the
+	 * rtrmap and the rtrefcount.  Anything new will have to be added here
+	 * as well.
+	 */
+	while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+		if (xfs_has_rtrmapbt(mp)) {
+			used += rtg_rmap(rtg)->i_nblocks;
+			target += xfs_rtrmapbt_calc_reserves(mp);
+		}
+		if (xfs_has_rtreflink(mp)) {
+			used += rtg_refcount(rtg)->i_nblocks;
+			target += xfs_rtrefcountbt_calc_reserves(mp);
+		}
+	}
+
+	if (!target)
+		goto out_unlock;
+
+	/*
+	 * Space taken by the per-AG metadata btrees is accounted on-disk as
+	 * used space.  We therefore only hide the space that is reserved but
+	 * not used by the trees.
+	 */
+	if (used > target)
+		target = used;
+	else if (target > dblocks_avail)
+		target = dblocks_avail;
+	hidden_space = target - used;
+
+	error = xfs_dec_fdblocks(mp, hidden_space, true);
+	if (error) {
+		trace_xfs_metafile_resv_init_error(mp, 0);
+		goto out_unlock;
+	}
+
+	xfs_mod_sb_delalloc(mp, hidden_space);
+
+	mp->m_metafile_resv_target = target;
+	mp->m_metafile_resv_used = used;
+	mp->m_metafile_resv_avail = hidden_space;
+
+	trace_xfs_metafile_resv_init(mp, target);
+
+out_unlock:
+	mutex_unlock(&mp->m_metafile_resv_lock);
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h
new file mode 100644
index 000000000000..ae6f9e779b98
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_metafile.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_METAFILE_H__
+#define __XFS_METAFILE_H__
+
+const char *xfs_metafile_type_str(enum xfs_metafile_type metatype);
+
+/* All metadata files must have these flags set. */
+#define XFS_METAFILE_DIFLAGS	(XFS_DIFLAG_IMMUTABLE | \
+				 XFS_DIFLAG_SYNC | \
+				 XFS_DIFLAG_NOATIME | \
+				 XFS_DIFLAG_NODUMP | \
+				 XFS_DIFLAG_NODEFRAG)
+
+/* All metadata directories must have these flags set. */
+#define XFS_METADIR_DIFLAGS	(XFS_METAFILE_DIFLAGS | \
+				 XFS_DIFLAG_NOSYMLINKS)
+
+void xfs_metafile_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip,
+		enum xfs_metafile_type metafile_type);
+void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
+
+/* Space reservations for metadata inodes. */
+struct xfs_alloc_arg;
+
+bool xfs_metafile_resv_critical(struct xfs_mount *mp);
+void xfs_metafile_resv_alloc_space(struct xfs_inode *ip,
+		struct xfs_alloc_arg *args);
+void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp,
+		xfs_filblks_t len);
+void xfs_metafile_resv_free(struct xfs_mount *mp);
+int xfs_metafile_resv_init(struct xfs_mount *mp);
+
+/* Code specific to kernel/userspace; must be provided externally. 
*/ + +int xfs_trans_metafile_iget(struct xfs_trans *tp, xfs_ino_t ino, + enum xfs_metafile_type metafile_type, struct xfs_inode **ipp); +int xfs_metafile_iget(struct xfs_mount *mp, xfs_ino_t ino, + enum xfs_metafile_type metafile_type, struct xfs_inode **ipp); + +#endif /* __XFS_METAFILE_H__ */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 23c133fd36f5..5ed44fdf7491 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -19,40 +19,46 @@ static_assert((value) == (expected), \ "XFS: value of " #value " is wrong, expected " #expected) +#define XFS_CHECK_SB_OFFSET(field, offset) \ + XFS_CHECK_OFFSET(struct xfs_dsb, field, offset); \ + XFS_CHECK_OFFSET(struct xfs_sb, field, offset); + static inline void __init xfs_check_ondisk_structs(void) { - /* ag/file structures */ + /* file structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_acl, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12); - XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); - XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); - XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); - XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48); - XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64); - XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136); - XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264); XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56); + XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8); + + /* space btrees */ + XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); + XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); + XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344); + XFS_CHECK_STRUCT_SIZE(struct xfs_alloc_rec, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); - XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8); - XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_bmdr_key_t, 8); /* dir/attr trees */ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80); @@ -67,33 +73,38 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_entry_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_hdr_t, 32); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_hdr, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_map, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_name_local, 4); /* realtime structures 
*/ + XFS_CHECK_STRUCT_SIZE(struct xfs_rtsb, 56); XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4); XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo, 48); + XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root, 4); + XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtrefcount_root, 4); /* - * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to - * 4 bytes anyway so it's not obviously a problem. Hence for the moment - * we don't check this structure. This can be re-instated when the attr - * definitions are updated to use c99 VLA definitions. + * m68k has problems with struct xfs_attr_leaf_name_remote, but we pad + * it to 4 bytes anyway so it's not obviously a problem. Hence for the + * moment we don't check this structure. This can be re-instated when + * the attr definitions are updated to use c99 VLA definitions. * - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_name_remote, 12); */ - XFS_CHECK_OFFSET(struct xfs_dsb, sb_crc, 224); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, valuelen, 0); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, namelen, 2); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, nameval, 3); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, valueblk, 0); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, valuelen, 4); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, namelen, 8); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, name, 9); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leafblock, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr, 4); XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize, 0); XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count, 2); @@ -101,27 +112,41 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen, 1); XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags, 2); XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval, 3); - XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); - XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); - XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0); - XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); - XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0); - XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1); - XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_blkinfo, 12); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_intnode, 16); 
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_hdr, 16); + XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, freetag, 0); + XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, length, 2); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_tail, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_entry, 3); + XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, namelen, 0); + XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, offset, 1); + XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, name, 3); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_hdr, 10); XFS_CHECK_STRUCT_SIZE(struct xfs_parent_rec, 12); + /* ondisk dir/attr structures from xfs/122 */ + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_entry, 3); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_unused, 6); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_tail, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_entry, 3); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_hdr, 10); + /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); @@ -157,6 +182,11 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16); XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16); + /* ondisk log structures from xfs/122 */ + XFS_CHECK_STRUCT_SIZE(struct xfs_unmount_log_format, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_xmd_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_xmi_log_format, 88); + /* parent pointer ioctls */ XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_rec, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_getparents, 40); @@ -201,6 +231,72 @@ xfs_check_ondisk_structs(void) XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4); XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT, 16299260424LL); + + /* superblock field checks we got from xfs/122 */ + XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304); + XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304); + XFS_CHECK_SB_OFFSET(sb_magicnum, 0); + XFS_CHECK_SB_OFFSET(sb_blocksize, 4); + XFS_CHECK_SB_OFFSET(sb_dblocks, 8); + XFS_CHECK_SB_OFFSET(sb_rblocks, 16); + XFS_CHECK_SB_OFFSET(sb_rextents, 24); + XFS_CHECK_SB_OFFSET(sb_uuid, 32); + XFS_CHECK_SB_OFFSET(sb_logstart, 48); + XFS_CHECK_SB_OFFSET(sb_rootino, 56); + XFS_CHECK_SB_OFFSET(sb_rbmino, 64); + XFS_CHECK_SB_OFFSET(sb_rsumino, 72); + XFS_CHECK_SB_OFFSET(sb_rextsize, 80); + XFS_CHECK_SB_OFFSET(sb_agblocks, 84); + XFS_CHECK_SB_OFFSET(sb_agcount, 88); + XFS_CHECK_SB_OFFSET(sb_rbmblocks, 92); + XFS_CHECK_SB_OFFSET(sb_logblocks, 96); + XFS_CHECK_SB_OFFSET(sb_versionnum, 100); + XFS_CHECK_SB_OFFSET(sb_sectsize, 102); + XFS_CHECK_SB_OFFSET(sb_inodesize, 104); + XFS_CHECK_SB_OFFSET(sb_inopblock, 106); + XFS_CHECK_SB_OFFSET(sb_blocklog, 120); + XFS_CHECK_SB_OFFSET(sb_fname[12], 120); + 
XFS_CHECK_SB_OFFSET(sb_sectlog, 121); + XFS_CHECK_SB_OFFSET(sb_inodelog, 122); + XFS_CHECK_SB_OFFSET(sb_inopblog, 123); + XFS_CHECK_SB_OFFSET(sb_agblklog, 124); + XFS_CHECK_SB_OFFSET(sb_rextslog, 125); + XFS_CHECK_SB_OFFSET(sb_inprogress, 126); + XFS_CHECK_SB_OFFSET(sb_imax_pct, 127); + XFS_CHECK_SB_OFFSET(sb_icount, 128); + XFS_CHECK_SB_OFFSET(sb_ifree, 136); + XFS_CHECK_SB_OFFSET(sb_fdblocks, 144); + XFS_CHECK_SB_OFFSET(sb_frextents, 152); + XFS_CHECK_SB_OFFSET(sb_uquotino, 160); + XFS_CHECK_SB_OFFSET(sb_gquotino, 168); + XFS_CHECK_SB_OFFSET(sb_qflags, 176); + XFS_CHECK_SB_OFFSET(sb_flags, 178); + XFS_CHECK_SB_OFFSET(sb_shared_vn, 179); + XFS_CHECK_SB_OFFSET(sb_inoalignmt, 180); + XFS_CHECK_SB_OFFSET(sb_unit, 184); + XFS_CHECK_SB_OFFSET(sb_width, 188); + XFS_CHECK_SB_OFFSET(sb_dirblklog, 192); + XFS_CHECK_SB_OFFSET(sb_logsectlog, 193); + XFS_CHECK_SB_OFFSET(sb_logsectsize, 194); + XFS_CHECK_SB_OFFSET(sb_logsunit, 196); + XFS_CHECK_SB_OFFSET(sb_features2, 200); + XFS_CHECK_SB_OFFSET(sb_bad_features2, 204); + XFS_CHECK_SB_OFFSET(sb_features_compat, 208); + XFS_CHECK_SB_OFFSET(sb_features_ro_compat, 212); + XFS_CHECK_SB_OFFSET(sb_features_incompat, 216); + XFS_CHECK_SB_OFFSET(sb_features_log_incompat, 220); + XFS_CHECK_SB_OFFSET(sb_crc, 224); + XFS_CHECK_SB_OFFSET(sb_spino_align, 228); + XFS_CHECK_SB_OFFSET(sb_pquotino, 232); + XFS_CHECK_SB_OFFSET(sb_lsn, 240); + XFS_CHECK_SB_OFFSET(sb_meta_uuid, 248); + XFS_CHECK_SB_OFFSET(sb_metadirino, 264); + XFS_CHECK_SB_OFFSET(sb_rgcount, 272); + XFS_CHECK_SB_OFFSET(sb_rgextents, 276); + XFS_CHECK_SB_OFFSET(sb_rgblklog, 280); + XFS_CHECK_SB_OFFSET(sb_pad, 281); + XFS_CHECK_SB_OFFSET(sb_rtstart, 288); + XFS_CHECK_SB_OFFSET(sb_rtreserved, 296); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index fb05f44f6c75..763d941a8420 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -143,4 +143,47 @@ time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq, __be32 dtimer); __be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer); +static inline const char * +xfs_dqinode_path(xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: + return "user"; + case XFS_DQTYPE_GROUP: + return "group"; + case XFS_DQTYPE_PROJ: + return "project"; + } + + ASSERT(0); + return NULL; +} + +static inline enum xfs_metafile_type +xfs_dqinode_metafile_type(xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: + return XFS_METAFILE_USRQUOTA; + case XFS_DQTYPE_GROUP: + return XFS_METAFILE_GRPQUOTA; + case XFS_DQTYPE_PROJ: + return XFS_METAFILE_PRJQUOTA; + } + + ASSERT(0); + return XFS_METAFILE_UNKNOWN; +} + +unsigned int xfs_dqinode_sick_mask(xfs_dqtype_t type); + +int xfs_dqinode_load(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dqtype_t type, struct xfs_inode **ipp); +int xfs_dqinode_metadir_create(struct xfs_inode *dp, xfs_dqtype_t type, + struct xfs_inode **ipp); +int xfs_dqinode_metadir_link(struct xfs_inode *dp, xfs_dqtype_t type, + struct xfs_inode *ip); +int xfs_dqinode_mkdir_parent(struct xfs_mount *mp, struct xfs_inode **dpp); +int xfs_dqinode_load_parent(struct xfs_trans *tp, struct xfs_inode **dpp); + #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 198b84117df1..cebe83f7842a 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -25,6 +25,9 @@ #include "xfs_ag.h" #include "xfs_health.h" #include "xfs_refcount_item.h" +#include "xfs_rtgroup.h" +#include 
"xfs_rtalloc.h" +#include "xfs_rtrefcount_btree.h" struct kmem_cache *xfs_refcount_intent_cache; @@ -128,7 +131,7 @@ xfs_refcount_check_irec( struct xfs_perag *pag, const struct xfs_refcount_irec *irec) { - if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) + if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX) return __this_address; if (!xfs_refcount_check_domain(irec)) @@ -138,12 +141,43 @@ xfs_refcount_check_irec( if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount)) return __this_address; - if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) + if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX) return __this_address; return NULL; } +xfs_failaddr_t +xfs_rtrefcount_check_irec( + struct xfs_rtgroup *rtg, + const struct xfs_refcount_irec *irec) +{ + if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX) + return __this_address; + + if (!xfs_refcount_check_domain(irec)) + return __this_address; + + /* check for valid extent range, including overflow */ + if (!xfs_verify_rgbext(rtg, irec->rc_startblock, irec->rc_blockcount)) + return __this_address; + + if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX) + return __this_address; + + return NULL; +} + +static inline xfs_failaddr_t +xfs_refcount_check_btrec( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec) +{ + if (xfs_btree_is_rtrefcount(cur->bc_ops)) + return xfs_rtrefcount_check_irec(to_rtg(cur->bc_group), irec); + return xfs_refcount_check_irec(to_perag(cur->bc_group), irec); +} + static inline int xfs_refcount_complain_bad_rec( struct xfs_btree_cur *cur, @@ -152,9 +186,15 @@ xfs_refcount_complain_bad_rec( { struct xfs_mount *mp = cur->bc_mp; - xfs_warn(mp, + if (xfs_btree_is_rtrefcount(cur->bc_ops)) { + xfs_warn(mp, + "RT Refcount BTree record corruption in rtgroup %u detected at %pS!", + cur->bc_group->xg_gno, fa); + } else { + xfs_warn(mp, "Refcount BTree record corruption in AG %d detected at %pS!", - cur->bc_ag.pag->pag_agno, fa); + cur->bc_group->xg_gno, fa); + } xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); @@ -180,7 +220,7 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec); + fa = xfs_refcount_check_btrec(cur, irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, irec); @@ -853,9 +893,9 @@ xfs_refc_merge_refcount( const struct xfs_refcount_irec *irec, enum xfs_refc_adjust_op adjust) { - /* Once a record hits MAXREFCOUNT, it is pinned there forever */ - if (irec->rc_refcount == MAXREFCOUNT) - return MAXREFCOUNT; + /* Once a record hits XFS_REFC_REFCOUNT_MAX, it is pinned forever */ + if (irec->rc_refcount == XFS_REFC_REFCOUNT_MAX) + return XFS_REFC_REFCOUNT_MAX; return irec->rc_refcount + adjust; } @@ -898,7 +938,7 @@ xfs_refc_want_merge_center( * hence we need to catch u32 addition overflows here. */ ulen += cleft->rc_blockcount + right->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; *ulenp = ulen; @@ -933,7 +973,7 @@ xfs_refc_want_merge_left( * hence we need to catch u32 addition overflows here. */ ulen += cleft->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; return true; @@ -967,7 +1007,7 @@ xfs_refc_want_merge_right( * hence we need to catch u32 addition overflows here. 
*/ ulen += cright->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; return true; @@ -1065,7 +1105,7 @@ xfs_refcount_still_have_space( */ overhead = xfs_allocfree_block_count(cur->bc_mp, cur->bc_refc.shape_changes); - overhead += cur->bc_mp->m_refc_maxlevels; + overhead += cur->bc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; /* @@ -1085,6 +1125,22 @@ xfs_refcount_still_have_space( cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } +/* Schedule an extent free. */ +static int +xrefc_free_extent( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *rec) +{ + unsigned int flags = 0; + + if (xfs_btree_is_rtrefcount(cur->bc_ops)) + flags |= XFS_FREE_EXTENT_REALTIME; + + return xfs_free_extent_later(cur->bc_tp, + xfs_gbno_to_fsb(cur->bc_group, rec->rc_startblock), + rec->rc_blockcount, NULL, XFS_AG_RESV_NONE, flags); +} + /* * Adjust the refcounts of middle extents. At this point we should have * split extents that crossed the adjustment range; merged with adjacent @@ -1101,7 +1157,6 @@ xfs_refcount_adjust_extents( struct xfs_refcount_irec ext, tmp; int error; int found_rec, found_tmp; - xfs_fsblock_t fsbno; /* Merging did all the work already. */ if (*aglen == 0) @@ -1117,7 +1172,7 @@ xfs_refcount_adjust_extents( if (error) goto out_error; if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_startblock = xfs_group_max_blocks(cur->bc_group); ext.rc_blockcount = 0; ext.rc_refcount = 0; ext.rc_domain = XFS_REFC_DOMAIN_SHARED; @@ -1154,12 +1209,7 @@ xfs_refcount_adjust_extents( goto out_error; } } else { - fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_ag.pag->pag_agno, - tmp.rc_startblock); - error = xfs_free_extent_later(cur->bc_tp, fsbno, - tmp.rc_blockcount, NULL, - XFS_AG_RESV_NONE, 0); + error = xrefc_free_extent(cur, &tmp); if (error) goto out_error; } @@ -1197,7 +1247,7 @@ xfs_refcount_adjust_extents( * Adjust the reference count and either update the tree * (incr) or free the blocks (decr). */ - if (ext.rc_refcount == MAXREFCOUNT) + if (ext.rc_refcount == XFS_REFC_REFCOUNT_MAX) goto skip; ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur, &ext); @@ -1217,12 +1267,7 @@ xfs_refcount_adjust_extents( } goto advloop; } else { - fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_ag.pag->pag_agno, - ext.rc_startblock); - error = xfs_free_extent_later(cur->bc_tp, fsbno, - ext.rc_blockcount, NULL, - XFS_AG_RESV_NONE, 0); + error = xrefc_free_extent(cur, &ext); if (error) goto out_error; } @@ -1312,7 +1357,7 @@ xfs_refcount_continue_op( xfs_agblock_t new_agbno) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, ri->ri_blockcount))) { @@ -1320,10 +1365,10 @@ xfs_refcount_continue_op( return -EFSCORRUPTED; } - ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + ri->ri_startblock = xfs_agbno_to_fsb(pag, new_agbno); ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount)); - ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); + ASSERT(pag_agno(pag) == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); return 0; } @@ -1360,7 +1405,7 @@ xfs_refcount_finish_one( * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. 
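 */

/*
 * A condensed, plain-C sketch of the cursor caching pattern above (types
 * and helpers here are made-up stand-ins): the finisher keeps one btree
 * cursor alive across deferred items and only rebuilds it when the next
 * intent targets a different group.
 */
#include <stdlib.h>

struct group { unsigned int index; };
struct cursor { struct group *grp; };

static struct cursor *cursor_create(struct group *grp)
{
	struct cursor *cur = calloc(1, sizeof(*cur));

	if (cur)
		cur->grp = grp;
	return cur;
}

static struct cursor *cursor_for_group(struct cursor **pcur, struct group *grp)
{
	if (*pcur && (*pcur)->grp != grp) {	/* cached cursor is stale */
		free(*pcur);
		*pcur = NULL;
	}
	if (!*pcur)
		*pcur = cursor_create(grp);	/* rebuild for this group */
	return *pcur;
}

/*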
*/ - if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { + if (rcur != NULL && rcur->bc_group != ri->ri_group) { nr_ops = rcur->bc_refc.nr_ops; shape_changes = rcur->bc_refc.shape_changes; xfs_btree_del_cursor(rcur, 0); @@ -1368,13 +1413,14 @@ xfs_refcount_finish_one( *pcur = NULL; } if (rcur == NULL) { - error = xfs_alloc_read_agf(ri->ri_pag, tp, + struct xfs_perag *pag = to_perag(ri->ri_group); + + error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING, &agbp); if (error) return error; - *pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, - ri->ri_pag); + *pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); rcur->bc_refc.nr_ops = nr_ops; rcur->bc_refc.shape_changes = shape_changes; } @@ -1418,12 +1464,122 @@ xfs_refcount_finish_one( } /* + * Set up a continuation of a deferred rtrefcount operation by updating the + * intent. Checks to make sure we're not going to run off the end of the + * rtgroup. + */ +static inline int +xfs_rtrefcount_continue_op( + struct xfs_btree_cur *cur, + struct xfs_refcount_intent *ri, + xfs_agblock_t new_agbno) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + + if (XFS_IS_CORRUPT(mp, !xfs_verify_rgbext(rtg, new_agbno, + ri->ri_blockcount))) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + + ri->ri_startblock = xfs_rgbno_to_rtb(rtg, new_agbno); + + ASSERT(xfs_verify_rtbext(mp, ri->ri_startblock, ri->ri_blockcount)); + return 0; +} + +/* + * Process one of the deferred realtime refcount operations. We pass back the + * btree cursor to maintain our lock on the btree between calls. + */ +int +xfs_rtrefcount_finish_one( + struct xfs_trans *tp, + struct xfs_refcount_intent *ri, + struct xfs_btree_cur **pcur) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + struct xfs_btree_cur *rcur = *pcur; + int error = 0; + xfs_rgblock_t bno; + unsigned long nr_ops = 0; + int shape_changes = 0; + + bno = xfs_rtb_to_rgbno(mp, ri->ri_startblock); + + trace_xfs_refcount_deferred(mp, ri); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + return -EIO; + + /* + * If we haven't gotten a cursor or the cursor rtgroup doesn't match + * the startblock, get one now.
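 */

/*
 * Plain-C sketch of the continue-op step above, with simplified stand-in
 * types: a partially finished intent is re-pointed at its unprocessed tail,
 * but only after checking that the tail still fits inside the group.
 */
#include <errno.h>
#include <stdint.h>

struct intent { uint64_t startblock; uint32_t blockcount; };

static int continue_op(struct intent *ri, uint64_t group_start_fsb,
		       uint32_t group_blocks, uint32_t new_gbno)
{
	/* corruption if the remainder would run past the end of the group */
	if (new_gbno >= group_blocks ||
	    group_blocks - new_gbno < ri->blockcount)
		return -EINVAL;		/* stands in for -EFSCORRUPTED */

	ri->startblock = group_start_fsb + new_gbno;	/* back to fs units */
	return 0;
}

/*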
+ */ + if (rcur != NULL && rcur->bc_group != ri->ri_group) { + nr_ops = rcur->bc_refc.nr_ops; + shape_changes = rcur->bc_refc.shape_changes; + xfs_btree_del_cursor(rcur, 0); + rcur = NULL; + *pcur = NULL; + } + if (rcur == NULL) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_REFCOUNT); + *pcur = rcur = xfs_rtrefcountbt_init_cursor(tp, rtg); + + rcur->bc_refc.nr_ops = nr_ops; + rcur->bc_refc.shape_changes = shape_changes; + } + + switch (ri->ri_type) { + case XFS_REFCOUNT_INCREASE: + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_INCREASE); + if (error) + return error; + if (ri->ri_blockcount > 0) + error = xfs_rtrefcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_DECREASE: + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_DECREASE); + if (error) + return error; + if (ri->ri_blockcount > 0) + error = xfs_rtrefcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_ALLOC_COW: + error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); + if (error) + return error; + ri->ri_blockcount = 0; + break; + case XFS_REFCOUNT_FREE_COW: + error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); + if (error) + return error; + ri->ri_blockcount = 0; + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + if (!error && ri->ri_blockcount > 0) + trace_xfs_refcount_finish_one_leftover(mp, ri); + return error; +} + +/* * Record a refcount intent for later processing. */ static void __xfs_refcount_add( struct xfs_trans *tp, enum xfs_refcount_intent_type type, + bool isrt, xfs_fsblock_t startblock, xfs_extlen_t blockcount) { @@ -1435,6 +1591,7 @@ __xfs_refcount_add( ri->ri_type = type; ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; + ri->ri_realtime = isrt; xfs_refcount_defer_add(tp, ri); } @@ -1445,12 +1602,13 @@ __xfs_refcount_add( void xfs_refcount_increase_extent( struct xfs_trans *tp, + bool isrt, struct xfs_bmbt_irec *PREV) { if (!xfs_has_reflink(tp->t_mountp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, + __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, isrt, PREV->br_startblock, PREV->br_blockcount); } @@ -1460,12 +1618,13 @@ xfs_refcount_increase_extent( void xfs_refcount_decrease_extent( struct xfs_trans *tp, + bool isrt, struct xfs_bmbt_irec *PREV) { if (!xfs_has_reflink(tp->t_mountp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, + __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, isrt, PREV->br_startblock, PREV->br_blockcount); } @@ -1667,7 +1826,7 @@ xfs_refcount_adjust_cow_extents( goto out_error; } if (!found_rec) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_startblock = xfs_group_max_blocks(cur->bc_group); ext.rc_blockcount = 0; ext.rc_refcount = 0; ext.rc_domain = XFS_REFC_DOMAIN_COW; @@ -1821,6 +1980,7 @@ __xfs_refcount_cow_free( void xfs_refcount_alloc_cow_extent( struct xfs_trans *tp, + bool isrt, xfs_fsblock_t fsb, xfs_extlen_t len) { @@ -1829,17 +1989,17 @@ xfs_refcount_alloc_cow_extent( if (!xfs_has_reflink(mp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); + __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, isrt, fsb, len); /* Add rmap entry */ - xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), - XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); + xfs_rmap_alloc_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. 
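 */

/*
 * Sketch of the CoW staging intent pairing visible above, with hypothetical
 * logging helpers standing in for the real deferred-op machinery: staging an
 * allocation queues the refcount update and then the matching OWN_COW rmap
 * update, while forgetting a staged extent queues them in the reverse order.
 */
#include <stdint.h>
#include <stdio.h>

static void queue_refcount(const char *op, uint64_t fsb, uint32_t len)
{
	printf("refcount %-9s fsb=%llu len=%u\n", op,
	       (unsigned long long)fsb, len);
}

static void queue_rmap(const char *op, uint64_t fsb, uint32_t len)
{
	printf("rmap     %-9s fsb=%llu len=%u\n", op,
	       (unsigned long long)fsb, len);
}

static void cow_stage(uint64_t fsb, uint32_t len)
{
	queue_refcount("alloc_cow", fsb, len);
	queue_rmap("map_cow", fsb, len);
}

static void cow_forget(uint64_t fsb, uint32_t len)
{
	queue_rmap("unmap_cow", fsb, len);	/* rmap goes first on free */
	queue_refcount("free_cow", fsb, len);
}

/*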
*/ void xfs_refcount_free_cow_extent( struct xfs_trans *tp, + bool isrt, xfs_fsblock_t fsb, xfs_extlen_t len) { @@ -1849,9 +2009,8 @@ xfs_refcount_free_cow_extent( return; /* Remove rmap entry */ - xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), - XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); - __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); + xfs_rmap_free_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW); + __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, isrt, fsb, len); } struct xfs_refcount_recovery { @@ -1880,7 +2039,7 @@ xfs_refcount_recover_extent( INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL || + if (xfs_refcount_check_btrec(cur, &rr->rr_rrec) != NULL || XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { xfs_btree_mark_sick(cur); @@ -1895,12 +2054,13 @@ xfs_refcount_recover_extent( /* Find and remove leftover CoW reservations. */ int xfs_refcount_recover_cow_leftovers( - struct xfs_mount *mp, - struct xfs_perag *pag) + struct xfs_group *xg) { + struct xfs_mount *mp = xg->xg_mount; + bool isrt = xg->xg_type == XG_TYPE_RTG; struct xfs_trans *tp; struct xfs_btree_cur *cur; - struct xfs_buf *agbp; + struct xfs_buf *agbp = NULL; struct xfs_refcount_recovery *rr, *n; struct list_head debris; union xfs_btree_irec low = { @@ -1913,10 +2073,19 @@ xfs_refcount_recover_cow_leftovers( xfs_fsblock_t fsb; int error; - /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */ + /* reflink filesystems must not have groups larger than 2^31-1 blocks */ + BUILD_BUG_ON(XFS_MAX_RGBLOCKS >= XFS_REFC_COWFLAG); BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG); - if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS) - return -EOPNOTSUPP; + + if (isrt) { + if (!xfs_has_rtgroups(mp)) + return 0; + if (xfs_group_max_blocks(xg) >= XFS_MAX_RGBLOCKS) + return -EOPNOTSUPP; + } else { + if (xfs_group_max_blocks(xg) > XFS_MAX_CRC_AG_BLOCKS) + return -EOPNOTSUPP; + } INIT_LIST_HEAD(&debris); @@ -1934,16 +2103,24 @@ xfs_refcount_recover_cow_leftovers( if (error) return error; - error = xfs_alloc_read_agf(pag, tp, 0, &agbp); - if (error) - goto out_trans; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); + if (isrt) { + xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); + cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(xg)); + } else { + error = xfs_alloc_read_agf(to_perag(xg), tp, 0, &agbp); + if (error) + goto out_trans; + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, to_perag(xg)); + } /* Find all the leftover CoW staging extents. */ error = xfs_btree_query_range(cur, &low, &high, xfs_refcount_recover_extent, &debris); xfs_btree_del_cursor(cur, error); - xfs_trans_brelse(tp, agbp); + if (agbp) + xfs_trans_brelse(tp, agbp); + else + xfs_rtgroup_unlock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); xfs_trans_cancel(tp); if (error) goto out_free; @@ -1956,15 +2133,15 @@ xfs_refcount_recover_cow_leftovers( goto out_free; /* Free the orphan record */ - fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, - rr->rr_rrec.rc_startblock); - xfs_refcount_free_cow_extent(tp, fsb, + fsb = xfs_gbno_to_fsb(xg, rr->rr_rrec.rc_startblock); + xfs_refcount_free_cow_extent(tp, isrt, fsb, rr->rr_rrec.rc_blockcount); /* Free the block. */ error = xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL, - XFS_AG_RESV_NONE, 0); + XFS_AG_RESV_NONE, + isrt ? 
XFS_FREE_EXTENT_REALTIME : 0); if (error) goto out_trans; @@ -2029,7 +2206,7 @@ xfs_refcount_query_range_helper( xfs_failaddr_t fa; xfs_refcount_btrec_to_irec(rec, &irec); - fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_refcount_check_btrec(cur, &irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 68acb0b1b4a8..f2e299a716a4 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -12,6 +12,7 @@ struct xfs_perag; struct xfs_btree_cur; struct xfs_bmbt_irec; struct xfs_refcount_irec; +struct xfs_rtgroup; extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); @@ -56,10 +57,11 @@ enum xfs_refcount_intent_type { struct xfs_refcount_intent { struct list_head ri_list; - struct xfs_perag *ri_pag; + struct xfs_group *ri_group; enum xfs_refcount_intent_type ri_type; xfs_extlen_t ri_blockcount; xfs_fsblock_t ri_startblock; + bool ri_realtime; }; /* Check that the refcount is appropriate for the record domain. */ @@ -74,24 +76,25 @@ xfs_refcount_check_domain( return true; } -void xfs_refcount_increase_extent(struct xfs_trans *tp, +void xfs_refcount_increase_extent(struct xfs_trans *tp, bool isrt, struct xfs_bmbt_irec *irec); -void xfs_refcount_decrease_extent(struct xfs_trans *tp, +void xfs_refcount_decrease_extent(struct xfs_trans *tp, bool isrt, struct xfs_bmbt_irec *irec); -extern int xfs_refcount_finish_one(struct xfs_trans *tp, +int xfs_refcount_finish_one(struct xfs_trans *tp, + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); +int xfs_rtrefcount_finish_one(struct xfs_trans *tp, struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_end_of_shared); -void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, - xfs_extlen_t len); -void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, - xfs_extlen_t len); -extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, - struct xfs_perag *pag); +void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt, + xfs_fsblock_t fsb, xfs_extlen_t len); +void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt, + xfs_fsblock_t fsb, xfs_extlen_t len); +int xfs_refcount_recover_cow_leftovers(struct xfs_group *xg); /* * While we're adjusting the refcounts records of an extent, we have @@ -120,6 +123,8 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag, const struct xfs_refcount_irec *irec); +xfs_failaddr_t xfs_rtrefcount_check_irec(struct xfs_rtgroup *rtg, + const struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 795928d1a66d..54505fee1852 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -30,7 +30,7 @@ xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.pag); + cur->bc_ag.agbp, to_perag(cur->bc_group)); } STATIC void @@ -68,21 +68,20 @@ xfs_refcountbt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = 
cur->bc_tp; args.mp = cur->bc_mp; - args.pag = cur->bc_ag.pag; + args.pag = to_perag(cur->bc_group); args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; args.resv = XFS_AG_RESV_METADATA; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, - xfs_refc_block(args.mp))); + xfs_agbno_to_fsb(args.pag, xfs_refc_block(args.mp))); if (error) goto out_error; if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; } - ASSERT(args.agno == cur->bc_ag.pag->pag_agno); + ASSERT(args.agno == cur->bc_group->xg_gno); ASSERT(args.len == 1); new->s = cpu_to_be32(args.agbno); @@ -170,7 +169,7 @@ xfs_refcountbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_refcount_root; } @@ -362,11 +361,11 @@ xfs_refcountbt_init_cursor( { struct xfs_btree_cur *cur; - ASSERT(pag->pag_agno < mp->m_sb.sb_agcount); + ASSERT(pag_agno(pag) < mp->m_sb.sb_agcount); cur = xfs_btree_alloc_cursor(mp, tp, &xfs_refcountbt_ops, mp->m_refc_maxlevels, xfs_refcountbt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_refc.nr_ops = 0; cur->bc_refc.shape_changes = 0; cur->bc_ag.agbp = agbp; @@ -515,7 +514,7 @@ xfs_refcountbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (xfs_ag_contains_log(mp, pag->pag_agno)) + if (xfs_ag_contains_log(mp, pag_agno(pag))) agblocks -= mp->m_sb.sb_logblocks; *ask += xfs_refcountbt_max_size(mp, agblocks); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 6ef4687b3aba..3cdf50563fec 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -25,6 +25,8 @@ #include "xfs_ag.h" #include "xfs_health.h" #include "xfs_rmap_item.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" struct kmem_cache *xfs_rmap_intent_cache; @@ -213,7 +215,7 @@ xfs_rmap_check_irec( struct xfs_perag *pag, const struct xfs_rmap_irec *irec) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); bool is_inode; bool is_unwritten; bool is_bmbt; @@ -264,14 +266,78 @@ xfs_rmap_check_irec( return NULL; } +static xfs_failaddr_t +xfs_rtrmap_check_meta_irec( + struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (irec->rm_offset != 0) + return __this_address; + if (irec->rm_flags & XFS_RMAP_UNWRITTEN) + return __this_address; + + switch (irec->rm_owner) { + case XFS_RMAP_OWN_FS: + if (irec->rm_startblock != 0) + return __this_address; + if (irec->rm_blockcount != mp->m_sb.sb_rextsize) + return __this_address; + return NULL; + case XFS_RMAP_OWN_COW: + if (!xfs_has_rtreflink(mp)) + return __this_address; + if (!xfs_verify_rgbext(rtg, irec->rm_startblock, + irec->rm_blockcount)) + return __this_address; + return NULL; + default: + return __this_address; + } + + return NULL; +} + +static xfs_failaddr_t +xfs_rtrmap_check_inode_irec( + struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (!xfs_verify_ino(mp, irec->rm_owner)) + return __this_address; + if (!xfs_verify_rgbext(rtg, irec->rm_startblock, irec->rm_blockcount)) + return __this_address; + if (!xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount)) + return __this_address; + return NULL; +} + +xfs_failaddr_t 
+xfs_rtrmap_check_irec( + struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec) +{ + if (irec->rm_blockcount == 0) + return __this_address; + if (irec->rm_flags & (XFS_RMAP_BMBT_BLOCK | XFS_RMAP_ATTR_FORK)) + return __this_address; + if (XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)) + return xfs_rtrmap_check_meta_irec(rtg, irec); + return xfs_rtrmap_check_inode_irec(rtg, irec); +} + static inline xfs_failaddr_t xfs_rmap_check_btrec( struct xfs_btree_cur *cur, const struct xfs_rmap_irec *irec) { - if (xfs_btree_is_mem_rmap(cur->bc_ops)) - return xfs_rmap_check_irec(cur->bc_mem.pag, irec); - return xfs_rmap_check_irec(cur->bc_ag.pag, irec); + if (xfs_btree_is_rtrmap(cur->bc_ops) || + xfs_btree_is_mem_rtrmap(cur->bc_ops)) + return xfs_rtrmap_check_irec(to_rtg(cur->bc_group), irec); + return xfs_rmap_check_irec(to_perag(cur->bc_group), irec); } static inline int @@ -285,10 +351,14 @@ xfs_rmap_complain_bad_rec( if (xfs_btree_is_mem_rmap(cur->bc_ops)) xfs_warn(mp, "In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa); + else if (xfs_btree_is_rtrmap(cur->bc_ops)) + xfs_warn(mp, + "RT Reverse Mapping BTree record corruption in rtgroup %u detected at %pS!", + cur->bc_group->xg_gno, fa); else xfs_warn(mp, "Reverse Mapping BTree record corruption in AG %d detected at %pS!", - cur->bc_ag.pag->pag_agno, fa); + cur->bc_group->xg_gno, fa); xfs_warn(mp, "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, @@ -527,7 +597,7 @@ xfs_rmap_free_check_owner( struct xfs_btree_cur *cur, uint64_t ltoff, struct xfs_rmap_irec *rec, - xfs_filblks_t len, + xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags) @@ -835,7 +905,7 @@ xfs_rmap_hook_enable(void) static inline void xfs_rmap_update_hook( struct xfs_trans *tp, - struct xfs_perag *pag, + struct xfs_group *xg, enum xfs_rmap_intent_type op, xfs_agblock_t startblock, xfs_extlen_t blockcount, @@ -850,27 +920,27 @@ xfs_rmap_update_hook( .oinfo = *oinfo, /* struct copy */ }; - if (pag) - xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p); + if (xg) + xfs_hooks_call(&xg->xg_rmap_update_hooks, op, &p); } } /* Call the specified function during a reverse mapping update. */ int xfs_rmap_hook_add( - struct xfs_perag *pag, + struct xfs_group *xg, struct xfs_rmap_hook *hook) { - return xfs_hooks_add(&pag->pag_rmap_update_hooks, &hook->rmap_hook); + return xfs_hooks_add(&xg->xg_rmap_update_hooks, &hook->rmap_hook); } /* Stop calling the specified function during a reverse mapping update. */ void xfs_rmap_hook_del( - struct xfs_perag *pag, + struct xfs_group *xg, struct xfs_rmap_hook *hook) { - xfs_hooks_del(&pag->pag_rmap_update_hooks, &hook->rmap_hook); + xfs_hooks_del(&xg->xg_rmap_update_hooks, &hook->rmap_hook); } /* Configure rmap update hook functions. 
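 */

/*
 * A hedged, kernel-context usage sketch for the per-group hook API above;
 * the consumer names are invented, and the payload passed via 'data' is
 * assumed to be the update description that xfs_hooks_call() publishes.
 */
static int example_rmap_watcher(struct notifier_block *nb, unsigned long op,
				void *data)
{
	/* 'op' is the rmap intent type; 'data' describes the update */
	return NOTIFY_DONE;
}

static int example_attach_watcher(struct xfs_group *xg,
				  struct xfs_rmap_hook *hook)
{
	xfs_rmap_hook_setup(hook, example_rmap_watcher);
	return xfs_rmap_hook_add(xg, hook);	/* undo: xfs_rmap_hook_del() */
}

/*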
*/ @@ -905,7 +975,8 @@ xfs_rmap_free( return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); - xfs_rmap_update_hook(tp, pag, XFS_RMAP_UNMAP, bno, len, false, oinfo); + xfs_rmap_update_hook(tp, pag_group(pag), XFS_RMAP_UNMAP, bno, len, + false, oinfo); error = xfs_rmap_unmap(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -1149,7 +1220,8 @@ xfs_rmap_alloc( return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); - xfs_rmap_update_hook(tp, pag, XFS_RMAP_MAP, bno, len, false, oinfo); + xfs_rmap_update_hook(tp, pag_group(pag), XFS_RMAP_MAP, bno, len, false, + oinfo); error = xfs_rmap_map(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -2556,6 +2628,47 @@ __xfs_rmap_finish_intent( } } +static int +xfs_rmap_finish_init_cursor( + struct xfs_trans *tp, + struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur) +{ + struct xfs_perag *pag = to_perag(ri->ri_group); + struct xfs_buf *agbp = NULL; + int error; + + /* + * Refresh the freelist before we start changing the rmapbt, because a + * shape change could cause us to allocate blocks. + */ + error = xfs_free_extent_fix_freelist(tp, pag, &agbp); + if (error) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); + return error; + } + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); + return -EFSCORRUPTED; + } + *pcur = xfs_rmapbt_init_cursor(tp->t_mountp, tp, agbp, pag); + return 0; +} + +static int +xfs_rtrmap_finish_init_cursor( + struct xfs_trans *tp, + struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur) +{ + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + *pcur = xfs_rtrmapbt_init_cursor(tp, rtg); + return 0; +} + /* * Process one of the deferred rmap operations. We pass back the * btree cursor to maintain our lock on the rmapbt between calls. @@ -2571,8 +2684,6 @@ xfs_rmap_finish_one( { struct xfs_owner_info oinfo; struct xfs_mount *mp = tp->t_mountp; - struct xfs_btree_cur *rcur = *pcur; - struct xfs_buf *agbp = NULL; xfs_agblock_t bno; bool unwritten; int error = 0; @@ -2586,41 +2697,31 @@ xfs_rmap_finish_one( * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ - if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { - xfs_btree_del_cursor(rcur, 0); - rcur = NULL; + if (*pcur != NULL && (*pcur)->bc_group != ri->ri_group) { + xfs_btree_del_cursor(*pcur, 0); *pcur = NULL; } - if (rcur == NULL) { - /* - * Refresh the freelist before we start changing the - * rmapbt, because a shape change could cause us to - * allocate blocks. 
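 */

/*
 * Reduced sketch (stand-in names and a stubbed helper) of the split added
 * above: per-AG groups must top up the free list first, since a btree shape
 * change can allocate blocks, while rtgroups only lock their metadata and
 * join it to the transaction; a missing AGF buffer after a successful fix
 * is treated as corruption.
 */
#include <errno.h>
#include <stddef.h>

struct buf;
struct ag;

static int fix_freelist(struct ag *pag, struct buf **agbp)
{
	*agbp = NULL;	/* stub; the real helper returns the AGF buffer */
	return 0;
}

static int init_ag_side(struct ag *pag)
{
	struct buf *agbp = NULL;
	int error = fix_freelist(pag, &agbp);	/* may allocate blocks */

	if (error)
		return error;
	if (!agbp)
		return -EINVAL;		/* stands in for -EFSCORRUPTED */
	/* ... build the per-AG cursor against agbp ... */
	return 0;
}

/*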
- */ - error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp); - if (error) { - xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL); + if (*pcur == NULL) { + if (ri->ri_group->xg_type == XG_TYPE_RTG) + error = xfs_rtrmap_finish_init_cursor(tp, ri, pcur); + else + error = xfs_rmap_finish_init_cursor(tp, ri, pcur); + if (error) return error; - } - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { - xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL); - return -EFSCORRUPTED; - } - - *pcur = rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag); } xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, ri->ri_bmap.br_startoff); unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN; - bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); - error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno, + bno = xfs_fsb_to_gbno(mp, ri->ri_bmap.br_startblock, + ri->ri_group->xg_type); + error = __xfs_rmap_finish_intent(*pcur, ri->ri_type, bno, ri->ri_bmap.br_blockcount, &oinfo, unwritten); if (error) return error; - xfs_rmap_update_hook(tp, ri->ri_pag, ri->ri_type, bno, + xfs_rmap_update_hook(tp, ri->ri_group, ri->ri_type, bno, ri->ri_bmap.br_blockcount, unwritten, &oinfo); return 0; } @@ -2645,6 +2746,7 @@ __xfs_rmap_add( struct xfs_trans *tp, enum xfs_rmap_intent_type type, uint64_t owner, + bool isrt, int whichfork, struct xfs_bmbt_irec *bmap) { @@ -2656,6 +2758,7 @@ __xfs_rmap_add( ri->ri_owner = owner; ri->ri_whichfork = whichfork; ri->ri_bmap = *bmap; + ri->ri_realtime = isrt; xfs_rmap_defer_add(tp, ri); } @@ -2669,6 +2772,7 @@ xfs_rmap_map_extent( struct xfs_bmbt_irec *PREV) { enum xfs_rmap_intent_type type = XFS_RMAP_MAP; + bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; @@ -2676,7 +2780,7 @@ xfs_rmap_map_extent( if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) type = XFS_RMAP_MAP_SHARED; - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); } /* Unmap an extent out of a file. */ @@ -2688,6 +2792,7 @@ xfs_rmap_unmap_extent( struct xfs_bmbt_irec *PREV) { enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP; + bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; @@ -2695,7 +2800,7 @@ xfs_rmap_unmap_extent( if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) type = XFS_RMAP_UNMAP_SHARED; - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); } /* @@ -2713,6 +2818,7 @@ xfs_rmap_convert_extent( struct xfs_bmbt_irec *PREV) { enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT; + bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (!xfs_rmap_update_is_needed(mp, whichfork)) return; @@ -2720,15 +2826,15 @@ xfs_rmap_convert_extent( if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) type = XFS_RMAP_CONVERT_SHARED; - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); } /* Schedule the creation of an rmap for non-file data. 
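 */

/*
 * Simplified sketch of the intent tagging above: whether a mapping lives on
 * the realtime device is derived once from the inode fork and stored in the
 * intent, so the finishing step can route to the rtgroup btree without
 * re-deriving it. Types and the fork encoding below are illustrative.
 */
#include <stdbool.h>
#include <stdint.h>

#define DATA_FORK 0

struct rmap_intent {
	uint64_t owner;
	bool	 realtime;	/* routes finish to the rt rmap btree */
};

static bool fork_is_realtime(bool inode_rt_flag, int whichfork)
{
	return inode_rt_flag && whichfork == DATA_FORK;
}

static void tag_map_intent(struct rmap_intent *ri, uint64_t ino,
			   bool inode_rt_flag, int whichfork)
{
	ri->owner = ino;
	ri->realtime = fork_is_realtime(inode_rt_flag, whichfork);
}

/*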
*/ void xfs_rmap_alloc_extent( struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, + bool isrt, + xfs_fsblock_t fsbno, xfs_extlen_t len, uint64_t owner) { @@ -2737,20 +2843,20 @@ xfs_rmap_alloc_extent( if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) return; - bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); + bmap.br_startblock = fsbno; bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, isrt, XFS_DATA_FORK, &bmap); } /* Schedule the deletion of an rmap for non-file data. */ void xfs_rmap_free_extent( struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, + bool isrt, + xfs_fsblock_t fsbno, xfs_extlen_t len, uint64_t owner) { @@ -2759,12 +2865,12 @@ xfs_rmap_free_extent( if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) return; - bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); + bmap.br_startblock = fsbno; bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, isrt, XFS_DATA_FORK, &bmap); } /* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index b783dd4dd95d..5f39f6e53cd1 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -7,6 +7,7 @@ #define __XFS_RMAP_H__ struct xfs_perag; +struct xfs_rtgroup; static inline void xfs_rmap_ino_bmbt_owner( @@ -173,7 +174,8 @@ struct xfs_rmap_intent { int ri_whichfork; uint64_t ri_owner; struct xfs_bmbt_irec ri_bmap; - struct xfs_perag *ri_pag; + struct xfs_group *ri_group; + bool ri_realtime; }; /* functions for updating the rmapbt based on bmbt map/unmap operations */ @@ -184,10 +186,10 @@ void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); -void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); +void xfs_rmap_alloc_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno, + xfs_extlen_t len, uint64_t owner); +void xfs_rmap_free_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno, + xfs_extlen_t len, uint64_t owner); int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur); @@ -206,6 +208,8 @@ xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag, const struct xfs_rmap_irec *irec); +xfs_failaddr_t xfs_rtrmap_check_irec(struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec); int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, enum xbtree_recpacking *outcome); @@ -264,8 +268,8 @@ struct xfs_rmap_hook { void xfs_rmap_hook_disable(void); void xfs_rmap_hook_enable(void); -int xfs_rmap_hook_add(struct xfs_perag *pag, struct xfs_rmap_hook *hook); -void xfs_rmap_hook_del(struct xfs_perag *pag, struct xfs_rmap_hook *hook); +int xfs_rmap_hook_add(struct xfs_group *xg, struct xfs_rmap_hook *hook); +void xfs_rmap_hook_del(struct xfs_group *xg, struct xfs_rmap_hook *hook); 
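
/*
 * Usage sketch for the revised non-file rmap helpers declared above:
 * callers now pass a filesystem-wide block number plus an isrt flag rather
 * than pre-split (agno, agbno) coordinates; the group split happens when
 * the intent is finished. The wrapper below is hypothetical.
 */
static inline void example_note_cow_staging(struct xfs_trans *tp, bool isrt,
		xfs_fsblock_t fsb, xfs_extlen_t len)
{
	xfs_rmap_alloc_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW);
}
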
void xfs_rmap_hook_setup(struct xfs_rmap_hook *hook, notifier_fn_t mod_fn); #endif diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index ac2f1f499b76..2cab694ac58a 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -57,7 +57,7 @@ xfs_rmapbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.pag); + cur->bc_ag.agbp, to_perag(cur->bc_group)); } STATIC void @@ -66,14 +66,15 @@ xfs_rmapbt_set_root( const union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_ag.agbp; - struct xfs_agf *agf = agbp->b_addr; + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag = to_perag(cur->bc_group); ASSERT(ptr->s != 0); agf->agf_rmap_root = ptr->s; be32_add_cpu(&agf->agf_rmap_level, inc); - cur->bc_ag.pag->pagf_rmap_level += inc; + pag->pagf_rmap_level += inc; xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -87,7 +88,7 @@ xfs_rmapbt_alloc_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); struct xfs_alloc_arg args = { .len = 1 }; int error; xfs_agblock_t bno; @@ -102,7 +103,7 @@ xfs_rmapbt_alloc_block( return 0; } - xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false); + xfs_extent_busy_reuse(pag_group(pag), bno, 1, false); new->s = cpu_to_be32(bno); be32_add_cpu(&agf->agf_rmap_blocks, 1); @@ -125,7 +126,7 @@ xfs_rmapbt_free_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); xfs_agblock_t bno; int error; @@ -136,7 +137,7 @@ xfs_rmapbt_free_block( if (error) return error; - xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1, + xfs_extent_busy_insert(cur->bc_tp, pag_group(pag), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); @@ -227,7 +228,7 @@ xfs_rmapbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_rmap_root; } @@ -538,7 +539,7 @@ xfs_rmapbt_init_cursor( cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops, mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agf *agf = agbp->b_addr; @@ -647,14 +648,13 @@ xfs_rmapbt_mem_cursor( struct xfbtree *xfbt) { struct xfs_btree_cur *cur; - struct xfs_mount *mp = pag->pag_mount; - cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops, + cur = xfs_btree_alloc_cursor(pag_mount(pag), tp, &xfs_rmapbt_mem_ops, xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache); cur->bc_mem.xfbtree = xfbt; cur->bc_nlevels = xfbt->nlevels; - cur->bc_mem.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); return cur; } @@ -863,7 +863,7 @@ xfs_rmapbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (xfs_ag_contains_log(mp, pag->pag_agno)) + if (xfs_ag_contains_log(mp, pag_agno(pag))) agblocks -= mp->m_sb.sb_logblocks; /* Reserve 1% of the AG or enough for 1 block per record. 
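 */

/*
 * Arithmetic sketch of the reservation rule named just above, taken at
 * face value with made-up numbers: ask for whichever is larger, 1% of the
 * AG's usable blocks or one block per existing rmap record.
 */
#include <stdint.h>

static uint32_t rmapbt_reserve(uint32_t ag_blocks, uint32_t rmap_records)
{
	uint32_t one_percent = ag_blocks / 100;

	return one_percent > rmap_records ? one_percent : rmap_records;
}
/* e.g. a 1,000,000-block AG holding 4,000 records reserves 10,000 blocks */

/*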
*/ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 27a4472402ba..5057536e586c 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -20,28 +20,87 @@ #include "xfs_error.h" #include "xfs_rtbitmap.h" #include "xfs_health.h" +#include "xfs_sb.h" +#include "xfs_errortag.h" +#include "xfs_log.h" +#include "xfs_buf_item.h" +#include "xfs_extent_busy.h" /* * Realtime allocator bitmap functions shared with userspace. */ -/* - * Real time buffers need verifiers to avoid runtime warnings during IO. - * We don't have anything to verify, however, so these are just dummy - * operations. - */ +static xfs_failaddr_t +xfs_rtbuf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + if (!xfs_verify_magic(bp, hdr->rt_magic)) + return __this_address; + if (!xfs_has_rtgroups(mp)) + return __this_address; + if (!xfs_has_crc(mp)) + return __this_address; + if (!uuid_equal(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (hdr->rt_blkno != cpu_to_be64(xfs_buf_daddr(bp))) + return __this_address; + return NULL; +} + static void xfs_rtbuf_verify_read( - struct xfs_buf *bp) + struct xfs_buf *bp) { + struct xfs_mount *mp = bp->b_mount; + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + xfs_failaddr_t fa; + + if (!xfs_has_rtgroups(mp)) + return; + + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr->rt_lsn))) { + fa = __this_address; + goto fail; + } + + if (!xfs_buf_verify_cksum(bp, XFS_RTBUF_CRC_OFF)) { + fa = __this_address; + goto fail; + } + + fa = xfs_rtbuf_verify(bp); + if (fa) + goto fail; + return; +fail: + xfs_verifier_error(bp, -EFSCORRUPTED, fa); } static void xfs_rtbuf_verify_write( struct xfs_buf *bp) { - return; + struct xfs_mount *mp = bp->b_mount; + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + struct xfs_buf_log_item *bip = bp->b_log_item; + xfs_failaddr_t fa; + + if (!xfs_has_rtgroups(mp)) + return; + + fa = xfs_rtbuf_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + + if (bip) + hdr->rt_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_RTBUF_CRC_OFF); } const struct xfs_buf_ops xfs_rtbuf_ops = { @@ -50,6 +109,22 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { .verify_write = xfs_rtbuf_verify_write, }; +const struct xfs_buf_ops xfs_rtbitmap_buf_ops = { + .name = "xfs_rtbitmap", + .magic = { 0, cpu_to_be32(XFS_RTBITMAP_MAGIC) }, + .verify_read = xfs_rtbuf_verify_read, + .verify_write = xfs_rtbuf_verify_write, + .verify_struct = xfs_rtbuf_verify, +}; + +const struct xfs_buf_ops xfs_rtsummary_buf_ops = { + .name = "xfs_rtsummary", + .magic = { 0, cpu_to_be32(XFS_RTSUMMARY_MAGIC) }, + .verify_read = xfs_rtbuf_verify_read, + .verify_write = xfs_rtbuf_verify_write, + .verify_struct = xfs_rtbuf_verify, +}; + /* Release cached rt bitmap and summary buffers. 
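 */

/*
 * Plain-C sketch of the verifier ordering above (all helpers are stubs):
 * reads check the recorded log sequence number and the on-disk checksum
 * before the structural fields, while writes re-verify the structure,
 * stamp the buffer's LSN, and recompute the checksum last so it covers the
 * freshly stamped value.
 */
#include <stdbool.h>
#include <stdint.h>

struct rtbuf { uint64_t lsn; uint32_t crc; };

static bool lsn_ok(const struct rtbuf *b)       { return true; }
static bool cksum_ok(const struct rtbuf *b)     { return true; }
static bool structure_ok(const struct rtbuf *b) { return true; }
static uint32_t compute_cksum(const struct rtbuf *b) { return 0; }

static bool verify_read(const struct rtbuf *b)
{
	return lsn_ok(b) && cksum_ok(b) && structure_ok(b);
}

static bool verify_write(struct rtbuf *b, uint64_t current_lsn)
{
	if (!structure_ok(b))
		return false;
	b->lsn = current_lsn;		/* stamp before checksumming */
	b->crc = compute_cksum(b);	/* checksum covers the stamped LSN */
	return true;
}

/*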
*/ void xfs_rtbuf_cache_relse( @@ -75,28 +150,31 @@ static int xfs_rtbuf_get( struct xfs_rtalloc_args *args, xfs_fileoff_t block, /* block number in bitmap or summary */ - int issum) /* is summary not bitmap */ + enum xfs_rtg_inodes type) { + struct xfs_inode *ip = args->rtg->rtg_inodes[type]; struct xfs_mount *mp = args->mp; struct xfs_buf **cbpp; /* cached block buffer */ xfs_fileoff_t *coffp; /* cached block number */ struct xfs_buf *bp; /* block buffer, result */ - struct xfs_inode *ip; /* bitmap or summary inode */ struct xfs_bmbt_irec map; - enum xfs_blft type; + enum xfs_blft buf_type; int nmap = 1; int error; - if (issum) { + switch (type) { + case XFS_RTGI_SUMMARY: cbpp = &args->sumbp; coffp = &args->sumoff; - ip = mp->m_rsumip; - type = XFS_BLFT_RTSUMMARY_BUF; - } else { + buf_type = XFS_BLFT_RTSUMMARY_BUF; + break; + case XFS_RTGI_BITMAP: cbpp = &args->rbmbp; coffp = &args->rbmoff; - ip = mp->m_rbmip; - type = XFS_BLFT_RTBITMAP_BUF; + buf_type = XFS_BLFT_RTBITMAP_BUF; + break; + default: + return -EINVAL; } /* @@ -119,22 +197,32 @@ xfs_rtbuf_get( return error; if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) { - xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY : - XFS_SICK_RT_BITMAP); + xfs_rtginode_mark_sick(args->rtg, type); return -EFSCORRUPTED; } ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp, &xfs_rtbuf_ops); + mp->m_bsize, 0, &bp, + xfs_rtblock_ops(mp, type)); if (xfs_metadata_is_sick(error)) - xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY : - XFS_SICK_RT_BITMAP); + xfs_rtginode_mark_sick(args->rtg, type); if (error) return error; - xfs_trans_buf_set_type(args->tp, bp, type); + if (xfs_has_rtgroups(mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + if (hdr->rt_owner != cpu_to_be64(ip->i_ino)) { + xfs_buf_mark_corrupt(bp); + xfs_trans_brelse(args->tp, bp); + xfs_rtginode_mark_sick(args->rtg, type); + return -EFSCORRUPTED; + } + } + + xfs_trans_buf_set_type(args->tp, bp, buf_type); *cbpp = bp; *coffp = block; return 0; @@ -148,11 +236,11 @@ xfs_rtbitmap_read_buf( struct xfs_mount *mp = args->mp; if (XFS_IS_CORRUPT(mp, block >= mp->m_sb.sb_rbmblocks)) { - xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP); + xfs_rtginode_mark_sick(args->rtg, XFS_RTGI_BITMAP); return -EFSCORRUPTED; } - return xfs_rtbuf_get(args, block, 0); + return xfs_rtbuf_get(args, block, XFS_RTGI_BITMAP); } int @@ -163,10 +251,10 @@ xfs_rtsummary_read_buf( struct xfs_mount *mp = args->mp; if (XFS_IS_CORRUPT(mp, block >= mp->m_rsumblocks)) { - xfs_rt_mark_sick(args->mp, XFS_SICK_RT_SUMMARY); + xfs_rtginode_mark_sick(args->rtg, XFS_RTGI_SUMMARY); return -EFSCORRUPTED; } - return xfs_rtbuf_get(args, block, 1); + return xfs_rtbuf_get(args, block, XFS_RTGI_SUMMARY); } /* @@ -503,6 +591,7 @@ xfs_rtmodify_summary( { struct xfs_mount *mp = args->mp; xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); + uint8_t *rsum_cache = args->rtg->rtg_rsum_cache; unsigned int infoword; xfs_suminfo_t val; int error; @@ -514,11 +603,11 @@ xfs_rtmodify_summary( infoword = xfs_rtsumoffs_to_infoword(mp, so); val = xfs_suminfo_add(args, infoword, delta); - if (mp->m_rsum_cache) { - if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log; - if (val != 0 && log >= mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log + 1; + if (rsum_cache) { + if (val == 0 && log + 1 == rsum_cache[bbno]) + rsum_cache[bbno] = log; + if (val != 0 && log >= rsum_cache[bbno]) + rsum_cache[bbno] 
= log + 1; } xfs_trans_log_rtsummary(args, infoword); @@ -737,7 +826,7 @@ xfs_rtfree_range( /* * Find the next allocated block (end of allocated extent). */ - error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, + error = xfs_rtfind_forw(args, end, args->rtg->rtg_extents - 1, &postblock); if (error) return error; @@ -961,19 +1050,25 @@ xfs_rtcheck_alloc_range( int xfs_rtfree_extent( struct xfs_trans *tp, /* transaction pointer */ + struct xfs_rtgroup *rtg, xfs_rtxnum_t start, /* starting rtext number to free */ xfs_rtxlen_t len) /* length of extent freed */ { struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *rbmip = rtg_bitmap(rtg); struct xfs_rtalloc_args args = { .mp = mp, .tp = tp, + .rtg = rtg, }; int error; struct timespec64 atime; - ASSERT(mp->m_rbmip->i_itemp != NULL); - xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL); + ASSERT(rbmip->i_itemp != NULL); + xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT)) + return -EIO; error = xfs_rtcheck_alloc_range(&args, start, len); if (error) @@ -990,19 +1085,21 @@ xfs_rtfree_extent( * Mark more blocks free in the superblock. */ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); + /* * If we've now freed all the blocks, reset the file sequence - * number to 0. + * number to 0 for pre-RTG file systems. */ - if (tp->t_frextents_delta + mp->m_sb.sb_frextents == + if (!xfs_has_rtgroups(mp) && + tp->t_frextents_delta + mp->m_sb.sb_frextents == mp->m_sb.sb_rextents) { - if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) - mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; + if (!(rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) + rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; - atime = inode_get_atime(VFS_I(mp->m_rbmip)); + atime = inode_get_atime(VFS_I(rbmip)); atime.tv_sec = 0; - inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime); - xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + inode_set_atime_to_ts(VFS_I(rbmip), atime); + xfs_trans_log_inode(tp, rbmip, XFS_ILOG_CORE); } error = 0; out: @@ -1018,15 +1115,18 @@ out: int xfs_rtfree_blocks( struct xfs_trans *tp, + struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno, xfs_filblks_t rtlen) { struct xfs_mount *mp = tp->t_mountp; xfs_extlen_t mod; + int error; + ASSERT(!xfs_has_zoned(mp)); ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); - mod = xfs_rtb_to_rtxoff(mp, rtlen); + mod = xfs_blen_to_rtxoff(mp, rtlen); if (mod) { ASSERT(mod == 0); return -EIO; @@ -1038,21 +1138,31 @@ xfs_rtfree_blocks( return -EIO; } - return xfs_rtfree_extent(tp, xfs_rtb_to_rtx(mp, rtbno), - xfs_rtb_to_rtx(mp, rtlen)); + error = xfs_rtfree_extent(tp, rtg, xfs_rtb_to_rtx(mp, rtbno), + xfs_extlen_to_rtxlen(mp, rtlen)); + if (error) + return error; + + if (xfs_has_rtgroups(mp)) + xfs_extent_busy_insert(tp, rtg_group(rtg), + xfs_rtb_to_rgbno(mp, rtbno), rtlen, 0); + + return 0; } /* Find all the free records within a given range. */ int xfs_rtalloc_query_range( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtxnum_t start, xfs_rtxnum_t end, xfs_rtalloc_query_range_fn fn, void *priv) { + struct xfs_mount *mp = rtg_mount(rtg); struct xfs_rtalloc_args args = { + .rtg = rtg, .mp = mp, .tp = tp, }; @@ -1060,10 +1170,13 @@ xfs_rtalloc_query_range( if (start > end) return -EINVAL; - if (start == end || start >= mp->m_sb.sb_rextents) + if (start == end || start >= rtg->rtg_extents) return 0; - end = min(end, mp->m_sb.sb_rextents - 1); + end = min(end, rtg->rtg_extents - 1); + + if (xfs_has_zoned(mp)) + return -EINVAL; /* Iterate the bitmap, looking for discrepancies. 
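 */

/*
 * Standalone restatement of the rsum_cache rule above: cache[bbno] holds an
 * upper bound, one past the highest summary level known to contain free
 * extents for that bitmap block. It only steps down when the topmost
 * tracked level drains to zero, and jumps up whenever a higher level gains
 * a free extent.
 */
#include <stdint.h>

static void update_rsum_cache(uint8_t *cache, unsigned int bbno,
			      unsigned int log, uint32_t new_count)
{
	if (new_count == 0 && log + 1 == cache[bbno])
		cache[bbno] = log;	/* topmost tracked level emptied */
	if (new_count != 0 && log >= cache[bbno])
		cache[bbno] = log + 1;	/* a higher level now has free space */
}

/*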
*/ while (start <= end) { @@ -1086,7 +1199,7 @@ xfs_rtalloc_query_range( rec.ar_startext = start; rec.ar_extcount = rtend - start + 1; - error = fn(mp, tp, &rec, priv); + error = fn(rtg, tp, &rec, priv); if (error) break; } @@ -1101,26 +1214,27 @@ xfs_rtalloc_query_range( /* Find all the free records. */ int xfs_rtalloc_query_all( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv) { - return xfs_rtalloc_query_range(mp, tp, 0, mp->m_sb.sb_rextents - 1, fn, + return xfs_rtalloc_query_range(rtg, tp, 0, rtg->rtg_extents - 1, fn, priv); } /* Is the given extent all free? */ int xfs_rtalloc_extent_is_free( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtxnum_t start, xfs_rtxlen_t len, bool *is_free) { struct xfs_rtalloc_args args = { - .mp = mp, + .mp = rtg_mount(rtg), + .rtg = rtg, .tp = tp, }; xfs_rtxnum_t end; @@ -1136,88 +1250,78 @@ xfs_rtalloc_extent_is_free( return 0; } +/* Compute the number of rt extents tracked by a single bitmap block. */ +xfs_rtxnum_t +xfs_rtbitmap_rtx_per_rbmblock( + struct xfs_mount *mp) +{ + unsigned int rbmblock_bytes = mp->m_sb.sb_blocksize; + + if (xfs_has_rtgroups(mp)) + rbmblock_bytes -= sizeof(struct xfs_rtbuf_blkinfo); + + return rbmblock_bytes * NBBY; +} + /* * Compute the number of rtbitmap blocks needed to track the given number of rt * extents. */ xfs_filblks_t -xfs_rtbitmap_blockcount( +xfs_rtbitmap_blockcount_len( struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { - return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize); + if (xfs_has_zoned(mp)) + return 0; + return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp)); } -/* Compute the number of rtsummary blocks needed to track the given rt space. */ -xfs_filblks_t -xfs_rtsummary_blockcount( - struct xfs_mount *mp, - unsigned int rsumlevels, - xfs_extlen_t rbmblocks) +/* How many rt extents does each rtbitmap file track? */ +static inline xfs_rtbxlen_t +xfs_rtbitmap_bitcount( + struct xfs_mount *mp) { - unsigned long long rsumwords; + if (!mp->m_sb.sb_rextents) + return 0; - rsumwords = (unsigned long long)rsumlevels * rbmblocks; - return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG); -} + /* rtgroup size can be nonzero even if rextents is zero */ + if (xfs_has_rtgroups(mp)) + return mp->m_sb.sb_rgextents; -/* Lock both realtime free space metadata inodes for a freespace update. */ -void -xfs_rtbitmap_lock( - struct xfs_mount *mp) -{ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + return mp->m_sb.sb_rextents; } /* - * Join both realtime free space metadata inodes to the transaction. The - * ILOCKs will be released on transaction commit. + * Compute the number of rtbitmap blocks used for a given file system. */ -void -xfs_rtbitmap_trans_join( - struct xfs_trans *tp) -{ - xfs_trans_ijoin(tp, tp->t_mountp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tp->t_mountp->m_rsumip, XFS_ILOCK_EXCL); -} - -/* Unlock both realtime free space metadata inodes after a freespace update. */ -void -xfs_rtbitmap_unlock( +xfs_filblks_t +xfs_rtbitmap_blockcount( struct xfs_mount *mp) { - xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + return xfs_rtbitmap_blockcount_len(mp, xfs_rtbitmap_bitcount(mp)); } /* - * Lock the realtime free space metadata inodes for a freespace scan. Callers - * must walk metadata blocks in order of increasing file offset. 
+ * Compute the geometry of the rtsummary file needed to track the given rt + * space. */ -void -xfs_rtbitmap_lock_shared( +xfs_filblks_t +xfs_rtsummary_blockcount( struct xfs_mount *mp, - unsigned int rbmlock_flags) + unsigned int *rsumlevels) { - if (rbmlock_flags & XFS_RBMLOCK_BITMAP) - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - - if (rbmlock_flags & XFS_RBMLOCK_SUMMARY) - xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM); -} + xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp); + unsigned long long rsumwords; -/* Unlock the realtime free space metadata inodes after a freespace scan. */ -void -xfs_rtbitmap_unlock_shared( - struct xfs_mount *mp, - unsigned int rbmlock_flags) -{ - if (rbmlock_flags & XFS_RBMLOCK_SUMMARY) - xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM); + if (xfs_has_zoned(mp)) { + *rsumlevels = 0; + return 0; + } - if (rbmlock_flags & XFS_RBMLOCK_BITMAP) - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + *rsumlevels = xfs_compute_rextslog(rextents) + 1; + rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels); + return howmany_64(rsumwords, mp->m_blockwsize); } static int @@ -1260,21 +1364,26 @@ out_trans_cancel: /* Get a buffer for the block. */ static int xfs_rtfile_initialize_block( - struct xfs_inode *ip, + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, xfs_fsblock_t fsbno, void *data) { - struct xfs_mount *mp = ip->i_mount; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_inode *ip = rtg->rtg_inodes[type]; struct xfs_trans *tp; struct xfs_buf *bp; + void *bufdata; const size_t copylen = mp->m_blockwsize << XFS_WORDLOG; enum xfs_blft buf_type; int error; - if (ip == mp->m_rsumip) + if (type == XFS_RTGI_BITMAP) + buf_type = XFS_BLFT_RTBITMAP_BUF; + else if (type == XFS_RTGI_SUMMARY) buf_type = XFS_BLFT_RTSUMMARY_BUF; else - buf_type = XFS_BLFT_RTBITMAP_BUF; + return -EINVAL; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero, 0, 0, 0, &tp); if (error) @@ -1288,13 +1397,30 @@ xfs_rtfile_initialize_block( xfs_trans_cancel(tp); return error; } + bufdata = bp->b_addr; xfs_trans_buf_set_type(tp, bp, buf_type); - bp->b_ops = &xfs_rtbuf_ops; + bp->b_ops = xfs_rtblock_ops(mp, type); + + if (xfs_has_rtgroups(mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + if (type == XFS_RTGI_BITMAP) + hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC); + else + hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC); + hdr->rt_owner = cpu_to_be64(ip->i_ino); + hdr->rt_blkno = cpu_to_be64(XFS_FSB_TO_DADDR(mp, fsbno)); + hdr->rt_lsn = 0; + uuid_copy(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid); + + bufdata += sizeof(*hdr); + } + if (data) - memcpy(bp->b_addr, data, copylen); + memcpy(bufdata, data, copylen); else - memset(bp->b_addr, 0, copylen); + memset(bufdata, 0, copylen); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); return xfs_trans_commit(tp); } @@ -1306,12 +1432,13 @@ xfs_rtfile_initialize_block( */ int xfs_rtfile_initialize_blocks( - struct xfs_inode *ip, /* inode (bitmap/summary) */ + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, xfs_fileoff_t offset_fsb, /* offset to start from */ xfs_fileoff_t end_fsb, /* offset to allocate to */ void *data) /* data to fill the blocks */ { - struct xfs_mount *mp = ip->i_mount; + struct xfs_mount *mp = rtg_mount(rtg); const size_t copylen = mp->m_blockwsize << XFS_WORDLOG; while (offset_fsb < end_fsb) { @@ -1319,8 +1446,8 @@ xfs_rtfile_initialize_blocks( xfs_filblks_t i; int error; - error = xfs_rtfile_alloc_blocks(ip, offset_fsb, - end_fsb - 
offset_fsb, &map); + error = xfs_rtfile_alloc_blocks(rtg->rtg_inodes[type], + offset_fsb, end_fsb - offset_fsb, &map); if (error) return error; @@ -1330,7 +1457,7 @@ xfs_rtfile_initialize_blocks( * Do this one block per transaction, to keep it simple. */ for (i = 0; i < map.br_blockcount; i++) { - error = xfs_rtfile_initialize_block(ip, + error = xfs_rtfile_initialize_block(rtg, type, map.br_startblock + i, data); if (error) return error; @@ -1343,3 +1470,35 @@ xfs_rtfile_initialize_blocks( return 0; } + +int +xfs_rtbitmap_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + ip->i_disk_size = mp->m_sb.sb_rbmblocks * mp->m_sb.sb_blocksize; + if (init && !xfs_has_rtgroups(mp)) { + ip->i_diflags |= XFS_DIFLAG_NEWRTBM; + inode_set_atime(VFS_I(ip), 0, 0); + } + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} + +int +xfs_rtsummary_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + ip->i_disk_size = mp->m_rsumblocks * mp->m_sb.sb_blocksize; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 140513d1d6bc..22e5d9cd95f4 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -6,7 +6,10 @@ #ifndef __XFS_RTBITMAP_H__ #define __XFS_RTBITMAP_H__ +#include "xfs_rtgroup.h" + struct xfs_rtalloc_args { + struct xfs_rtgroup *rtg; struct xfs_mount *mp; struct xfs_trans *tp; @@ -19,13 +22,37 @@ struct xfs_rtalloc_args { static inline xfs_rtblock_t xfs_rtx_to_rtb( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, xfs_rtxnum_t rtx) { + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rtblock_t start = xfs_group_start_fsb(rtg_group(rtg)); + + if (mp->m_rtxblklog >= 0) + return start + (rtx << mp->m_rtxblklog); + return start + (rtx * mp->m_sb.sb_rextsize); +} + +/* Convert an rgbno into an rt extent number. */ +static inline xfs_rtxnum_t +xfs_rgbno_to_rtx( + struct xfs_mount *mp, + xfs_rgblock_t rgbno) +{ + if (likely(mp->m_rtxblklog >= 0)) + return rgbno >> mp->m_rtxblklog; + return rgbno / mp->m_sb.sb_rextsize; +} + +static inline uint64_t +xfs_rtbxlen_to_blen( + struct xfs_mount *mp, + xfs_rtbxlen_t rtbxlen) +{ if (mp->m_rtxblklog >= 0) - return rtx << mp->m_rtxblklog; + return rtbxlen << mp->m_rtxblklog; - return rtx * mp->m_sb.sb_rextsize; + return rtbxlen * mp->m_sb.sb_rextsize; } static inline xfs_extlen_t @@ -62,66 +89,90 @@ xfs_extlen_to_rtxlen( return len / mp->m_sb.sb_rextsize; } +/* Convert an rt block count into an rt extent count. */ +static inline xfs_rtbxlen_t +xfs_blen_to_rtbxlen( + struct xfs_mount *mp, + uint64_t blen) +{ + if (likely(mp->m_rtxblklog >= 0)) + return blen >> mp->m_rtxblklog; + + return div_u64(blen, mp->m_sb.sb_rextsize); +} + +/* Return the offset of a file block length within an rt extent. */ +static inline xfs_extlen_t +xfs_blen_to_rtxoff( + struct xfs_mount *mp, + xfs_filblks_t blen) +{ + if (likely(mp->m_rtxblklog >= 0)) + return blen & mp->m_rtxblkmask; + + return do_div(blen, mp->m_sb.sb_rextsize); +} + +/* Round this block count up to the nearest rt extent size. */ +static inline xfs_filblks_t +xfs_blen_roundup_rtx( + struct xfs_mount *mp, + xfs_filblks_t blen) +{ + return roundup_64(blen, mp->m_sb.sb_rextsize); +} + /* Convert an rt block number into an rt extent number. 
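 */

/*
 * Worked sketch of the m_rtxblklog fast path running through all of these
 * converters, using stand-in types: when the rt extent size is a power of
 * two its log2 is precomputed, so conversions become shifts and masks;
 * otherwise the helpers fall back to 64-bit division and remainder.
 */
#include <stdint.h>

struct rtconv {
	int	 rtxblklog;	/* log2(rextsize), or -1 if not a power of 2 */
	uint32_t rextsize;	/* rt extent size in fs blocks */
	uint64_t rtxblkmask;	/* rextsize - 1 when it is a power of 2 */
};

static uint64_t blocks_to_extents(const struct rtconv *c, uint64_t blen)
{
	if (c->rtxblklog >= 0)
		return blen >> c->rtxblklog;	/* power-of-2 fast path */
	return blen / c->rextsize;
}

static uint64_t blocks_to_extent_offset(const struct rtconv *c, uint64_t blen)
{
	if (c->rtxblklog >= 0)
		return blen & c->rtxblkmask;
	return blen % c->rextsize;
}

/*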
*/ static inline xfs_rtxnum_t xfs_rtb_to_rtx( struct xfs_mount *mp, xfs_rtblock_t rtbno) { + /* open-coded 64-bit masking operation */ + rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask; if (likely(mp->m_rtxblklog >= 0)) return rtbno >> mp->m_rtxblklog; - return div_u64(rtbno, mp->m_sb.sb_rextsize); } +/* Return the offset of a rtgroup block number within an rt extent. */ +static inline xfs_extlen_t +xfs_rgbno_to_rtxoff( + struct xfs_mount *mp, + xfs_rgblock_t rgbno) +{ + return rgbno % mp->m_sb.sb_rextsize; +} + /* Return the offset of an rt block number within an rt extent. */ static inline xfs_extlen_t xfs_rtb_to_rtxoff( struct xfs_mount *mp, xfs_rtblock_t rtbno) { + /* open-coded 64-bit masking operation */ + rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask; if (likely(mp->m_rtxblklog >= 0)) return rtbno & mp->m_rtxblkmask; - return do_div(rtbno, mp->m_sb.sb_rextsize); } -/* - * Convert an rt block number into an rt extent number, rounding up to the next - * rt extent if the rt block is not aligned to an rt extent boundary. - */ -static inline xfs_rtxnum_t -xfs_rtb_to_rtxup( - struct xfs_mount *mp, - xfs_rtblock_t rtbno) -{ - if (likely(mp->m_rtxblklog >= 0)) { - if (rtbno & mp->m_rtxblkmask) - return (rtbno >> mp->m_rtxblklog) + 1; - return rtbno >> mp->m_rtxblklog; - } - - if (do_div(rtbno, mp->m_sb.sb_rextsize)) - rtbno++; - return rtbno; -} - -/* Round this rtblock up to the nearest rt extent size. */ +/* Round this file block offset up to the nearest rt extent size. */ static inline xfs_rtblock_t -xfs_rtb_roundup_rtx( +xfs_fileoff_roundup_rtx( struct xfs_mount *mp, - xfs_rtblock_t rtbno) + xfs_fileoff_t off) { - return roundup_64(rtbno, mp->m_sb.sb_rextsize); + return roundup_64(off, mp->m_sb.sb_rextsize); } -/* Round this rtblock down to the nearest rt extent size. */ +/* Round this file block offset down to the nearest rt extent size. */ static inline xfs_rtblock_t -xfs_rtb_rounddown_rtx( +xfs_fileoff_rounddown_rtx( struct xfs_mount *mp, - xfs_rtblock_t rtbno) + xfs_fileoff_t off) { - return rounddown_64(rtbno, mp->m_sb.sb_rextsize); + return rounddown_64(off, mp->m_sb.sb_rextsize); } /* Convert an rt extent number to a file block offset in the rt bitmap file. 
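 */

/*
 * Sketch of the rtgroup encoding implied by the masking above, with an
 * invented field width: a global rt block number carries the group index in
 * its high bits, so converting to an extent number first masks the value
 * down to a group-relative offset.
 */
#include <stdint.h>

#define GROUP_SHIFT	20U			/* illustrative bits per group */
#define GROUP_BLKMASK	((1ULL << GROUP_SHIFT) - 1)

static uint64_t rtb_to_rtx(uint64_t rtbno, unsigned int rtxblklog)
{
	rtbno &= GROUP_BLKMASK;		/* drop the group index bits */
	return rtbno >> rtxblklog;	/* power-of-2 rextsize assumed here */
}

static uint32_t rtb_to_group(uint64_t rtbno)
{
	return (uint32_t)(rtbno >> GROUP_SHIFT);
}

/*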
*/ @@ -130,6 +181,9 @@ xfs_rtx_to_rbmblock( struct xfs_mount *mp, xfs_rtxnum_t rtx) { + if (xfs_has_rtgroups(mp)) + return div_u64(rtx, mp->m_rtx_per_rbmblock); + return rtx >> mp->m_blkbit_log; } @@ -139,6 +193,13 @@ xfs_rtx_to_rbmword( struct xfs_mount *mp, xfs_rtxnum_t rtx) { + if (xfs_has_rtgroups(mp)) { + unsigned int mod; + + div_u64_rem(rtx >> XFS_NBWORDLOG, mp->m_blockwsize, &mod); + return mod; + } + return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1); } @@ -148,6 +209,9 @@ xfs_rbmblock_to_rtx( struct xfs_mount *mp, xfs_fileoff_t rbmoff) { + if (xfs_has_rtgroups(mp)) + return rbmoff * mp->m_rtx_per_rbmblock; + return rbmoff << mp->m_blkbit_log; } @@ -157,7 +221,14 @@ xfs_rbmblock_wordptr( struct xfs_rtalloc_args *args, unsigned int index) { - union xfs_rtword_raw *words = args->rbmbp->b_addr; + struct xfs_mount *mp = args->mp; + union xfs_rtword_raw *words; + struct xfs_rtbuf_blkinfo *hdr = args->rbmbp->b_addr; + + if (xfs_has_rtgroups(mp)) + words = (union xfs_rtword_raw *)(hdr + 1); + else + words = args->rbmbp->b_addr; return words + index; } @@ -170,6 +241,8 @@ xfs_rtbitmap_getword( { union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); + if (xfs_has_rtgroups(args->mp)) + return be32_to_cpu(word->rtg); return word->old; } @@ -182,7 +255,10 @@ xfs_rtbitmap_setword( { union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); - word->old = value; + if (xfs_has_rtgroups(args->mp)) + word->rtg = cpu_to_be32(value); + else + word->old = value; } /* @@ -207,6 +283,9 @@ xfs_rtsumoffs_to_block( struct xfs_mount *mp, xfs_rtsumoff_t rsumoff) { + if (xfs_has_rtgroups(mp)) + return rsumoff / mp->m_blockwsize; + return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t)); } @@ -221,6 +300,9 @@ xfs_rtsumoffs_to_infoword( { unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG; + if (xfs_has_rtgroups(mp)) + return rsumoff % mp->m_blockwsize; + return rsumoff & mask; } @@ -230,7 +312,13 @@ xfs_rsumblock_infoptr( struct xfs_rtalloc_args *args, unsigned int index) { - union xfs_suminfo_raw *info = args->sumbp->b_addr; + union xfs_suminfo_raw *info; + struct xfs_rtbuf_blkinfo *hdr = args->sumbp->b_addr; + + if (xfs_has_rtgroups(args->mp)) + info = (union xfs_suminfo_raw *)(hdr + 1); + else + info = args->sumbp->b_addr; return info + index; } @@ -243,6 +331,8 @@ xfs_suminfo_get( { union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + if (xfs_has_rtgroups(args->mp)) + return be32_to_cpu(info->rtg); return info->old; } @@ -255,10 +345,28 @@ xfs_suminfo_add( { union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + if (xfs_has_rtgroups(args->mp)) { + be32_add_cpu(&info->rtg, delta); + return be32_to_cpu(info->rtg); + } + info->old += delta; return info->old; } +static inline const struct xfs_buf_ops * +xfs_rtblock_ops( + struct xfs_mount *mp, + enum xfs_rtg_inodes type) +{ + if (xfs_has_rtgroups(mp)) { + if (type == XFS_RTGI_SUMMARY) + return &xfs_rtsummary_buf_ops; + return &xfs_rtbitmap_buf_ops; + } + return &xfs_rtbuf_ops; +} + /* * Functions for walking free space rtextents in the realtime bitmap. 
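+ * Iteration is per-rtgroup: xfs_rtalloc_query_range() walks the free extents + * recorded in one group's bitmap and invokes the given callback on each + * record found.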
*/ @@ -268,7 +376,7 @@ struct xfs_rtalloc_rec { }; typedef int (*xfs_rtalloc_query_range_fn)( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv); @@ -291,53 +399,43 @@ int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, xfs_fileoff_t bbno, int delta); int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len); -int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, +int xfs_rtalloc_query_range(struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtxnum_t start, xfs_rtxnum_t end, xfs_rtalloc_query_range_fn fn, void *priv); -int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtalloc_query_range_fn fn, - void *priv); -int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtxnum_t start, xfs_rtxlen_t len, - bool *is_free); -/* - * Free an extent in the realtime subvolume. Length is expressed in - * realtime extents, as is the block number. - */ -int /* error */ -xfs_rtfree_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtxnum_t start, /* starting rtext number to free */ - xfs_rtxlen_t len); /* length of extent freed */ - +int xfs_rtalloc_query_all(struct xfs_rtgroup *rtg, struct xfs_trans *tp, + xfs_rtalloc_query_range_fn fn, void *priv); +int xfs_rtalloc_extent_is_free(struct xfs_rtgroup *rtg, struct xfs_trans *tp, + xfs_rtxnum_t start, xfs_rtxlen_t len, bool *is_free); +int xfs_rtfree_extent(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_rtxnum_t start, xfs_rtxlen_t len); /* Same as above, but in units of rt blocks. */ -int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, - xfs_filblks_t rtlen); +int xfs_rtfree_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_fsblock_t rtbno, xfs_filblks_t rtlen); -xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t - rtextents); +xfs_rtxnum_t xfs_rtbitmap_rtx_per_rbmblock(struct xfs_mount *mp); +xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp); +xfs_filblks_t xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, + xfs_rtbxlen_t rtextents); xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp, - unsigned int rsumlevels, xfs_extlen_t rbmblocks); - -int xfs_rtfile_initialize_blocks(struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb, void *data); + unsigned int *rsumlevels); -void xfs_rtbitmap_lock(struct xfs_mount *mp); -void xfs_rtbitmap_unlock(struct xfs_mount *mp); -void xfs_rtbitmap_trans_join(struct xfs_trans *tp); +int xfs_rtfile_initialize_blocks(struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb, void *data); +int xfs_rtbitmap_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); +int xfs_rtsummary_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); -/* Lock the rt bitmap inode in shared mode */ -#define XFS_RBMLOCK_BITMAP (1U << 0) -/* Lock the rt summary inode in shared mode */ -#define XFS_RBMLOCK_SUMMARY (1U << 1) - -void xfs_rtbitmap_lock_shared(struct xfs_mount *mp, - unsigned int rbmlock_flags); -void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp, - unsigned int rbmlock_flags); #else /* CONFIG_XFS_RT */ # define xfs_rtfree_extent(t,b,l) (-ENOSYS) -# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS) + +static inline int xfs_rtfree_blocks(struct xfs_trans *tp, + struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno, + xfs_filblks_t rtlen) +{ + return -ENOSYS; 
+} # define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS) # define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS) # define xfs_rtbitmap_read_buf(a,b) (-ENOSYS) @@ -345,17 +443,11 @@ void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp, # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) static inline xfs_filblks_t -xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) +xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { /* shut up gcc */ return 0; } -# define xfs_rtsummary_blockcount(mp, l, b) (0) -# define xfs_rtbitmap_lock(mp) do { } while (0) -# define xfs_rtbitmap_trans_join(tp) do { } while (0) -# define xfs_rtbitmap_unlock(mp) do { } while (0) -# define xfs_rtbitmap_lock_shared(mp, lf) do { } while (0) -# define xfs_rtbitmap_unlock_shared(mp, lf) do { } while (0) #endif /* CONFIG_XFS_RT */ #endif /* __XFS_RTBITMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c new file mode 100644 index 000000000000..9186c58e83d5 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -0,0 +1,750 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" +#include "xfs_ag_resv.h" +#include "xfs_health.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_defer.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_buf_item.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" +#include "xfs_metafile.h" +#include "xfs_metadir.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" + +/* Find the first usable fsblock in this rtgroup. */ +static inline uint32_t +xfs_rtgroup_min_block( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + if (xfs_has_rtsb(mp) && rgno == 0) + return mp->m_sb.sb_rextsize; + + return 0; +} + +/* Precompute this group's geometry */ +void +xfs_rtgroup_calc_geometry( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents) +{ + rtg->rtg_extents = __xfs_rtgroup_extents(mp, rgno, rgcount, rextents); + rtg_group(rtg)->xg_block_count = rtg->rtg_extents * mp->m_sb.sb_rextsize; + rtg_group(rtg)->xg_min_gbno = xfs_rtgroup_min_block(mp, rgno); +} + +int +xfs_rtgroup_alloc( + struct xfs_mount *mp, + xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents) +{ + struct xfs_rtgroup *rtg; + int error; + + rtg = kzalloc(sizeof(struct xfs_rtgroup), GFP_KERNEL); + if (!rtg) + return -ENOMEM; + + xfs_rtgroup_calc_geometry(mp, rtg, rgno, rgcount, rextents); + + error = xfs_group_insert(mp, rtg_group(rtg), rgno, XG_TYPE_RTG); + if (error) + goto out_free_rtg; + return 0; + +out_free_rtg: + kfree(rtg); + return error; +} + +void +xfs_rtgroup_free( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + xfs_group_free(mp, rgno, XG_TYPE_RTG, NULL); +} + +/* Free a range of incore rtgroup objects. 
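+ * The range is half-open: groups in [first_rgno, end_rgno) are freed, and + * end_rgno itself is left alone.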
*/ +void +xfs_free_rtgroups( + struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno) +{ + xfs_rgnumber_t rgno; + + for (rgno = first_rgno; rgno < end_rgno; rgno++) + xfs_rtgroup_free(mp, rgno); +} + +/* Initialize some range of incore rtgroup objects. */ +int +xfs_initialize_rtgroups( + struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno, + xfs_rtbxlen_t rextents) +{ + xfs_rgnumber_t index; + int error; + + if (first_rgno >= end_rgno) + return 0; + + for (index = first_rgno; index < end_rgno; index++) { + error = xfs_rtgroup_alloc(mp, index, end_rgno, rextents); + if (error) + goto out_unwind_new_rtgs; + } + + return 0; + +out_unwind_new_rtgs: + xfs_free_rtgroups(mp, first_rgno, index); + return error; +} + +/* Compute the number of rt extents in this realtime group. */ +xfs_rtxnum_t +__xfs_rtgroup_extents( + struct xfs_mount *mp, + xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents) +{ + ASSERT(rgno < rgcount); + if (rgno == rgcount - 1) + return rextents - ((xfs_rtxnum_t)rgno * mp->m_sb.sb_rgextents); + + ASSERT(xfs_has_rtgroups(mp)); + return mp->m_sb.sb_rgextents; +} + +xfs_rtxnum_t +xfs_rtgroup_extents( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + return __xfs_rtgroup_extents(mp, rgno, mp->m_sb.sb_rgcount, + mp->m_sb.sb_rextents); +} + +/* + * Update the rt extent count of the previous tail rtgroup if it changed during + * recovery (i.e. recovery of a growfs). + */ +int +xfs_update_last_rtgroup_size( + struct xfs_mount *mp, + xfs_rgnumber_t prev_rgcount) +{ + struct xfs_rtgroup *rtg; + + ASSERT(prev_rgcount > 0); + + rtg = xfs_rtgroup_grab(mp, prev_rgcount - 1); + if (!rtg) + return -EFSCORRUPTED; + rtg->rtg_extents = __xfs_rtgroup_extents(mp, prev_rgcount - 1, + mp->m_sb.sb_rgcount, mp->m_sb.sb_rextents); + rtg_group(rtg)->xg_block_count = rtg->rtg_extents * mp->m_sb.sb_rextsize; + xfs_rtgroup_rele(rtg); + return 0; +} + +/* Lock metadata inodes associated with this rt group. */ +void +xfs_rtgroup_lock( + struct xfs_rtgroup *rtg, + unsigned int rtglock_flags) +{ + ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); + ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || + !(rtglock_flags & XFS_RTGLOCK_BITMAP)); + + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + /* + * Lock both realtime free space metadata inodes for a + * freespace update. + */ + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } + } + + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) + xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_EXCL); + + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_ilock(rtg_refcount(rtg), XFS_ILOCK_EXCL); +} + +/* Unlock metadata inodes associated with this rt group. 
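+ * Inodes are unlocked in the reverse of the xfs_rtgroup_lock() order: + * refcount, then rmap, then the summary/bitmap pair.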
*/ +void +xfs_rtgroup_unlock( + struct xfs_rtgroup *rtg, + unsigned int rtglock_flags) +{ + ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); + ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || + !(rtglock_flags & XFS_RTGLOCK_BITMAP)); + + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_iunlock(rtg_refcount(rtg), XFS_ILOCK_EXCL); + + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) + xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL); + + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } + } +} + +/* + * Join realtime group metadata inodes to the transaction. The ILOCKs will be + * released on transaction commit. + */ +void +xfs_rtgroup_trans_join( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg, + unsigned int rtglock_flags) +{ + ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); + ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)); + + if (!xfs_has_zoned(rtg_mount(rtg)) && + (rtglock_flags & XFS_RTGLOCK_BITMAP)) { + xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL); + } + + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) + xfs_trans_ijoin(tp, rtg_rmap(rtg), XFS_ILOCK_EXCL); + + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_trans_ijoin(tp, rtg_refcount(rtg), XFS_ILOCK_EXCL); +} + +/* Retrieve rt group geometry. */ +int +xfs_rtgroup_get_geometry( + struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo) +{ + /* Fill out form. */ + memset(rgeo, 0, sizeof(*rgeo)); + rgeo->rg_number = rtg_rgno(rtg); + rgeo->rg_length = rtg_blocks(rtg); + xfs_rtgroup_geom_health(rtg, rgeo); + return 0; +} + +#ifdef CONFIG_PROVE_LOCKING +static struct lock_class_key xfs_rtginode_lock_class; + +static int +xfs_rtginode_ilock_cmp_fn( + const struct lockdep_map *m1, + const struct lockdep_map *m2) +{ + const struct xfs_inode *ip1 = + container_of(m1, struct xfs_inode, i_lock.dep_map); + const struct xfs_inode *ip2 = + container_of(m2, struct xfs_inode, i_lock.dep_map); + + if (ip1->i_projid < ip2->i_projid) + return -1; + if (ip1->i_projid > ip2->i_projid) + return 1; + return 0; +} + +static inline void +xfs_rtginode_ilock_print_fn( + const struct lockdep_map *m) +{ + const struct xfs_inode *ip = + container_of(m, struct xfs_inode, i_lock.dep_map); + + printk(KERN_CONT " rgno=%u metatype=%s", ip->i_projid, + xfs_metafile_type_str(ip->i_metatype)); +} + +/* + * Most of the time each of the RTG inode locks are only taken one at a time. + * But when committing deferred ops, more than one of a kind can be taken. + * However, deferred rt ops will be committed in rgno order so there is no + * potential for deadlocks. The code here is needed to tell lockdep about this + * order. 
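+ * (For example, the rmap inodes of rtgroups 0 and 1 may be ILOCKed at the + * same time, but only in increasing rgno order, which the cmp_fn above + * expresses by comparing i_projid.)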
+ */ +static inline void +xfs_rtginode_lockdep_setup( + struct xfs_inode *ip, + xfs_rgnumber_t rgno, + enum xfs_rtg_inodes type) +{ + lockdep_set_class_and_subclass(&ip->i_lock, &xfs_rtginode_lock_class, + type); + lock_set_cmp_fn(&ip->i_lock, xfs_rtginode_ilock_cmp_fn, + xfs_rtginode_ilock_print_fn); +} +#else +#define xfs_rtginode_lockdep_setup(ip, rgno, type) do { } while (0) +#endif /* CONFIG_PROVE_LOCKING */ + +struct xfs_rtginode_ops { + const char *name; /* short name */ + + enum xfs_metafile_type metafile_type; + + unsigned int sick; /* rtgroup sickness flag */ + + unsigned int fmt_mask; /* all valid data fork formats */ + + /* Does the fs have this feature? */ + bool (*enabled)(const struct xfs_mount *mp); + + /* Create this rtgroup metadata inode and initialize it. */ + int (*create)(struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init); +}; + +static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { + [XFS_RTGI_BITMAP] = { + .name = "bitmap", + .metafile_type = XFS_METAFILE_RTBITMAP, + .sick = XFS_SICK_RG_BITMAP, + .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | + (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, + .create = xfs_rtbitmap_create, + }, + [XFS_RTGI_SUMMARY] = { + .name = "summary", + .metafile_type = XFS_METAFILE_RTSUMMARY, + .sick = XFS_SICK_RG_SUMMARY, + .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | + (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, + .create = xfs_rtsummary_create, + }, + [XFS_RTGI_RMAP] = { + .name = "rmap", + .metafile_type = XFS_METAFILE_RTRMAP, + .sick = XFS_SICK_RG_RMAPBT, + .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, + /* + * growfs must create the rtrmap inodes before adding a + * realtime volume to the filesystem, so we cannot use the + * rtrmapbt predicate here. + */ + .enabled = xfs_has_rmapbt, + .create = xfs_rtrmapbt_create, + }, + [XFS_RTGI_REFCOUNT] = { + .name = "refcount", + .metafile_type = XFS_METAFILE_RTREFCOUNT, + .sick = XFS_SICK_RG_REFCNTBT, + .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, + /* same comment about growfs and rmap inodes applies here */ + .enabled = xfs_has_reflink, + .create = xfs_rtrefcountbt_create, + }, +}; + +/* Return the shortname of this rtgroup inode. */ +const char * +xfs_rtginode_name( + enum xfs_rtg_inodes type) +{ + return xfs_rtginode_ops[type].name; +} + +/* Return the metafile type of this rtgroup inode. */ +enum xfs_metafile_type +xfs_rtginode_metafile_type( + enum xfs_rtg_inodes type) +{ + return xfs_rtginode_ops[type].metafile_type; +} + +/* Should this rtgroup inode be present? */ +bool +xfs_rtginode_enabled( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type) +{ + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + + if (!ops->enabled) + return true; + return ops->enabled(rtg_mount(rtg)); +} + +/* Mark an rtgroup inode sick */ +void +xfs_rtginode_mark_sick( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type) +{ + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + + xfs_group_mark_sick(rtg_group(rtg), ops->sick); +} + +/* Load an existing rtgroup inode into the rtgroup structure. 
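+ * Returns 0 without loading anything if the given inode type is not enabled + * on this filesystem.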
*/ +int +xfs_rtginode_load( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip; + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + int error; + + if (!xfs_rtginode_enabled(rtg, type)) + return 0; + + if (!xfs_has_rtgroups(mp)) { + xfs_ino_t ino; + + switch (type) { + case XFS_RTGI_BITMAP: + ino = mp->m_sb.sb_rbmino; + break; + case XFS_RTGI_SUMMARY: + ino = mp->m_sb.sb_rsumino; + break; + default: + /* None of the other types exist on !rtgroups */ + return 0; + } + + error = xfs_trans_metafile_iget(tp, ino, ops->metafile_type, + &ip); + } else { + const char *path; + + if (!mp->m_rtdirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + path = xfs_rtginode_path(rtg_rgno(rtg), type); + if (!path) + return -ENOMEM; + error = xfs_metadir_load(tp, mp->m_rtdirip, path, + ops->metafile_type, &ip); + kfree(path); + } + + if (error) { + if (xfs_metadata_is_sick(error)) + xfs_rtginode_mark_sick(rtg, type); + return error; + } + + if (XFS_IS_CORRUPT(mp, !((1U << ip->i_df.if_format) & ops->fmt_mask))) { + xfs_irele(ip); + xfs_rtginode_mark_sick(rtg, type); + return -EFSCORRUPTED; + } + + if (XFS_IS_CORRUPT(mp, ip->i_projid != rtg_rgno(rtg))) { + xfs_irele(ip); + xfs_rtginode_mark_sick(rtg, type); + return -EFSCORRUPTED; + } + + xfs_rtginode_lockdep_setup(ip, rtg_rgno(rtg), type); + rtg->rtg_inodes[type] = ip; + return 0; +} + +/* Release an rtgroup metadata inode. */ +void +xfs_rtginode_irele( + struct xfs_inode **ipp) +{ + if (*ipp) + xfs_irele(*ipp); + *ipp = NULL; +} + +/* Add a metadata inode for a realtime rmap btree. */ +int +xfs_rtginode_create( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, + bool init) +{ + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_metadir_update upd = { + .dp = mp->m_rtdirip, + .metafile_type = ops->metafile_type, + }; + int error; + + if (!xfs_rtginode_enabled(rtg, type)) + return 0; + + if (!mp->m_rtdirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + upd.path = xfs_rtginode_path(rtg_rgno(rtg), type); + if (!upd.path) + return -ENOMEM; + + error = xfs_metadir_start_create(&upd); + if (error) + goto out_path; + + error = xfs_metadir_create(&upd, S_IFREG); + if (error) + goto out_cancel; + + xfs_rtginode_lockdep_setup(upd.ip, rtg_rgno(rtg), type); + + upd.ip->i_projid = rtg_rgno(rtg); + error = ops->create(rtg, upd.ip, upd.tp, init); + if (error) + goto out_cancel; + + error = xfs_metadir_commit(&upd); + if (error) + goto out_path; + + kfree(upd.path); + xfs_finish_inode_setup(upd.ip); + rtg->rtg_inodes[type] = upd.ip; + return 0; + +out_cancel: + xfs_metadir_cancel(&upd, error); + /* Have to finish setting up the inode to ensure it's deleted. */ + if (upd.ip) { + xfs_finish_inode_setup(upd.ip); + xfs_irele(upd.ip); + } +out_path: + kfree(upd.path); + return error; +} + +/* Create the parent directory for all rtgroup inodes and load it. */ +int +xfs_rtginode_mkdir_parent( + struct xfs_mount *mp) +{ + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_mkdir(mp->m_metadirip, "rtgroups", &mp->m_rtdirip); +} + +/* Load the parent directory of all rtgroup inodes. 
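+ * The parent is the "rtgroups" subdirectory of the metadata directory tree.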
*/ +int +xfs_rtginode_load_parent( + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_load(tp, mp->m_metadirip, "rtgroups", + XFS_METAFILE_DIR, &mp->m_rtdirip); +} + +/* Check superblock fields for a read or a write. */ +static xfs_failaddr_t +xfs_rtsb_verify_common( + struct xfs_buf *bp) +{ + struct xfs_rtsb *rsb = bp->b_addr; + + if (!xfs_verify_magic(bp, rsb->rsb_magicnum)) + return __this_address; + if (rsb->rsb_pad) + return __this_address; + + /* Everything to the end of the fs block must be zero */ + if (memchr_inv(rsb + 1, 0, BBTOB(bp->b_length) - sizeof(*rsb))) + return __this_address; + + return NULL; +} + +/* Check superblock fields for a read or revalidation. */ +static inline xfs_failaddr_t +xfs_rtsb_verify_all( + struct xfs_buf *bp) +{ + struct xfs_rtsb *rsb = bp->b_addr; + struct xfs_mount *mp = bp->b_mount; + xfs_failaddr_t fa; + + fa = xfs_rtsb_verify_common(bp); + if (fa) + return fa; + + if (memcmp(&rsb->rsb_fname, &mp->m_sb.sb_fname, XFSLABEL_MAX)) + return __this_address; + if (!uuid_equal(&rsb->rsb_uuid, &mp->m_sb.sb_uuid)) + return __this_address; + if (!uuid_equal(&rsb->rsb_meta_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + + return NULL; +} + +static void +xfs_rtsb_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (!xfs_buf_verify_cksum(bp, XFS_RTSB_CRC_OFF)) { + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + return; + } + + fa = xfs_rtsb_verify_all(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +static void +xfs_rtsb_write_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_rtsb_verify_common(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + + xfs_buf_update_cksum(bp, XFS_RTSB_CRC_OFF); +} + +const struct xfs_buf_ops xfs_rtsb_buf_ops = { + .name = "xfs_rtsb", + .magic = { 0, cpu_to_be32(XFS_RTSB_MAGIC) }, + .verify_read = xfs_rtsb_read_verify, + .verify_write = xfs_rtsb_write_verify, + .verify_struct = xfs_rtsb_verify_all, +}; + +/* Update a realtime superblock from the primary fs super */ +void +xfs_update_rtsb( + struct xfs_buf *rtsb_bp, + const struct xfs_buf *sb_bp) +{ + const struct xfs_dsb *dsb = sb_bp->b_addr; + struct xfs_rtsb *rsb = rtsb_bp->b_addr; + const uuid_t *meta_uuid; + + rsb->rsb_magicnum = cpu_to_be32(XFS_RTSB_MAGIC); + + rsb->rsb_pad = 0; + memcpy(&rsb->rsb_fname, &dsb->sb_fname, XFSLABEL_MAX); + + memcpy(&rsb->rsb_uuid, &dsb->sb_uuid, sizeof(rsb->rsb_uuid)); + + /* + * The metadata uuid is the fs uuid if the metauuid feature is not + * enabled. + */ + if (dsb->sb_features_incompat & + cpu_to_be32(XFS_SB_FEAT_INCOMPAT_META_UUID)) + meta_uuid = &dsb->sb_meta_uuid; + else + meta_uuid = &dsb->sb_uuid; + memcpy(&rsb->rsb_meta_uuid, meta_uuid, sizeof(rsb->rsb_meta_uuid)); +} + +/* + * Update the realtime superblock from a filesystem superblock and log it to + * the given transaction. + */ +struct xfs_buf * +xfs_log_rtsb( + struct xfs_trans *tp, + const struct xfs_buf *sb_bp) +{ + struct xfs_buf *rtsb_bp; + + if (!xfs_has_rtsb(tp->t_mountp)) + return NULL; + + rtsb_bp = xfs_trans_getrtsb(tp); + if (!rtsb_bp) { + /* + * It's possible for the rtgroups feature to be enabled but + * there is no incore rt superblock buffer if the rt geometry + * was specified at mkfs time but the rt section has not yet + * been attached. In this case, rblocks must be zero. 
+ */ + ASSERT(tp->t_mountp->m_sb.sb_rblocks == 0); + return NULL; + } + + xfs_update_rtsb(rtsb_bp, sb_bp); + xfs_trans_ordered_buf(tp, rtsb_bp); + return rtsb_bp; +} diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h new file mode 100644 index 000000000000..d36a6ae0abe5 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -0,0 +1,368 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __LIBXFS_RTGROUP_H +#define __LIBXFS_RTGROUP_H 1 + +#include "xfs_group.h" + +struct xfs_mount; +struct xfs_trans; + +enum xfs_rtg_inodes { + XFS_RTGI_BITMAP, /* allocation bitmap */ + XFS_RTGI_SUMMARY, /* allocation summary */ + XFS_RTGI_RMAP, /* rmap btree inode */ + XFS_RTGI_REFCOUNT, /* refcount btree inode */ + + XFS_RTGI_MAX, +}; + +#ifdef MAX_LOCKDEP_SUBCLASSES +static_assert(XFS_RTGI_MAX <= MAX_LOCKDEP_SUBCLASSES); +#endif + +/* + * Realtime group incore structure, similar to the per-AG structure. + */ +struct xfs_rtgroup { + struct xfs_group rtg_group; + + /* per-rtgroup metadata inodes */ + struct xfs_inode *rtg_inodes[XFS_RTGI_MAX]; + + /* Number of rt extents in this group */ + xfs_rtxnum_t rtg_extents; + + /* + * For bitmap based RT devices this points to a cache of rt summary + * level per bitmap block with the invariant that rtg_rsum_cache[bbno] + * > the maximum i for which rsum[i][bbno] != 0, or 0 if + * rsum[i][bbno] == 0 for all i. + * Reads and writes are serialized by the rsumip inode lock. + * + * For zoned RT devices this points to the open zone structure for + * a group that is open for writers, or is NULL. + */ + union { + uint8_t *rtg_rsum_cache; + struct xfs_open_zone *rtg_open_zone; + }; +}; + +/* + * For zoned RT devices this is set on groups that have no written blocks + * and can be picked by the allocator for opening. + */ +#define XFS_RTG_FREE XA_MARK_0 + +/* + * For zoned RT devices this is set on groups that are fully written and that + * have unused blocks. Used by garbage collection to pick targets. 
+ */ +#define XFS_RTG_RECLAIMABLE XA_MARK_1 + +static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) +{ + return container_of(xg, struct xfs_rtgroup, rtg_group); +} + +static inline struct xfs_group *rtg_group(struct xfs_rtgroup *rtg) +{ + return &rtg->rtg_group; +} + +static inline struct xfs_mount *rtg_mount(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_mount; +} + +static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_gno; +} + +static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_block_count; +} + +static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_BITMAP]; +} + +static inline struct xfs_inode *rtg_summary(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_SUMMARY]; +} + +static inline struct xfs_inode *rtg_rmap(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_RMAP]; +} + +static inline struct xfs_inode *rtg_refcount(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_REFCOUNT]; +} + +/* Passive rtgroup references */ +static inline struct xfs_rtgroup * +xfs_rtgroup_get( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + return to_rtg(xfs_group_get(mp, rgno, XG_TYPE_RTG)); +} + +static inline struct xfs_rtgroup * +xfs_rtgroup_hold( + struct xfs_rtgroup *rtg) +{ + return to_rtg(xfs_group_hold(rtg_group(rtg))); +} + +static inline void +xfs_rtgroup_put( + struct xfs_rtgroup *rtg) +{ + xfs_group_put(rtg_group(rtg)); +} + +/* Active rtgroup references */ +static inline struct xfs_rtgroup * +xfs_rtgroup_grab( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + return to_rtg(xfs_group_grab(mp, rgno, XG_TYPE_RTG)); +} + +static inline void +xfs_rtgroup_rele( + struct xfs_rtgroup *rtg) +{ + xfs_group_rele(rtg_group(rtg)); +} + +static inline struct xfs_rtgroup * +xfs_rtgroup_next_range( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg, + xfs_rgnumber_t start_rgno, + xfs_rgnumber_t end_rgno) +{ + return to_rtg(xfs_group_next_range(mp, rtg ? rtg_group(rtg) : NULL, + start_rgno, end_rgno, XG_TYPE_RTG)); +} + +static inline struct xfs_rtgroup * +xfs_rtgroup_next( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg) +{ + return xfs_rtgroup_next_range(mp, rtg, 0, mp->m_sb.sb_rgcount - 1); +} + +static inline bool +xfs_verify_rgbno( + struct xfs_rtgroup *rtg, + xfs_rgblock_t rgbno) +{ + ASSERT(xfs_has_rtgroups(rtg_mount(rtg))); + + return xfs_verify_gbno(rtg_group(rtg), rgbno); +} + +/* + * Check that [@rgbno,@len] is a valid extent range in @rtg. + * + * Must only be used for RTG-enabled file systems. + */ +static inline bool +xfs_verify_rgbext( + struct xfs_rtgroup *rtg, + xfs_rgblock_t rgbno, + xfs_extlen_t len) +{ + ASSERT(xfs_has_rtgroups(rtg_mount(rtg))); + + return xfs_verify_gbext(rtg_group(rtg), rgbno, len); +} + +static inline xfs_rtblock_t +xfs_rgbno_to_rtb( + struct xfs_rtgroup *rtg, + xfs_rgblock_t rgbno) +{ + return xfs_gbno_to_fsb(rtg_group(rtg), rgbno); +} + +static inline xfs_rgnumber_t +xfs_rtb_to_rgno( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return xfs_fsb_to_gno(mp, rtbno, XG_TYPE_RTG); +} + +static inline xfs_rgblock_t +xfs_rtb_to_rgbno( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return xfs_fsb_to_gbno(mp, rtbno, XG_TYPE_RTG); +} + +/* Is rtbno the start of a RT group? 
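+ * True if all of the group-offset bits of rtbno are zero.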
*/ +static inline bool +xfs_rtbno_is_group_start( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return (rtbno & mp->m_groups[XG_TYPE_RTG].blkmask) == 0; +} + +/* Convert an rtgroups rt extent number into an rgbno. */ +static inline xfs_rgblock_t +xfs_rtx_to_rgbno( + struct xfs_rtgroup *rtg, + xfs_rtxnum_t rtx) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (likely(mp->m_rtxblklog >= 0)) + return rtx << mp->m_rtxblklog; + return rtx * mp->m_sb.sb_rextsize; +} + +static inline xfs_daddr_t +xfs_rtb_to_daddr( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); + + rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask); + } + + return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno); +} + +static inline xfs_rtblock_t +xfs_daddr_to_rtb( + struct xfs_mount *mp, + xfs_daddr_t daddr) +{ + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + xfs_rfsblock_t bno; + + bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb; + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { + xfs_rgnumber_t rgno; + uint32_t rgbno; + + rgno = div_u64_rem(bno, g->blocks, &rgbno); + return ((xfs_rtblock_t)rgno << g->blklog) + rgbno; + } + + return bno; +} + +#ifdef CONFIG_XFS_RT +int xfs_rtgroup_alloc(struct xfs_mount *mp, xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, xfs_rtbxlen_t rextents); +void xfs_rtgroup_free(struct xfs_mount *mp, xfs_rgnumber_t rgno); + +void xfs_free_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno); +int xfs_initialize_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno, xfs_rtbxlen_t rextents); + +xfs_rtxnum_t __xfs_rtgroup_extents(struct xfs_mount *mp, xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, xfs_rtbxlen_t rextents); +xfs_rtxnum_t xfs_rtgroup_extents(struct xfs_mount *mp, xfs_rgnumber_t rgno); +void xfs_rtgroup_calc_geometry(struct xfs_mount *mp, struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgno, xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents); + +int xfs_update_last_rtgroup_size(struct xfs_mount *mp, + xfs_rgnumber_t prev_rgcount); + +/* Lock the rt bitmap inode in exclusive mode */ +#define XFS_RTGLOCK_BITMAP (1U << 0) +/* Lock the rt bitmap inode in shared mode */ +#define XFS_RTGLOCK_BITMAP_SHARED (1U << 1) +/* Lock the rt rmap inode in exclusive mode */ +#define XFS_RTGLOCK_RMAP (1U << 2) +/* Lock the rt refcount inode in exclusive mode */ +#define XFS_RTGLOCK_REFCOUNT (1U << 3) + +#define XFS_RTGLOCK_ALL_FLAGS (XFS_RTGLOCK_BITMAP | \ + XFS_RTGLOCK_BITMAP_SHARED | \ + XFS_RTGLOCK_RMAP | \ + XFS_RTGLOCK_REFCOUNT) + +void xfs_rtgroup_lock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); +void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); +void xfs_rtgroup_trans_join(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + unsigned int rtglock_flags); + +int xfs_rtgroup_get_geometry(struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo); + +int xfs_rtginode_mkdir_parent(struct xfs_mount *mp); +int xfs_rtginode_load_parent(struct xfs_trans *tp); + +const char *xfs_rtginode_name(enum xfs_rtg_inodes type); +enum xfs_metafile_type xfs_rtginode_metafile_type(enum xfs_rtg_inodes type); +bool xfs_rtginode_enabled(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type); +void xfs_rtginode_mark_sick(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type); +int xfs_rtginode_load(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type, + struct xfs_trans *tp); 
+int xfs_rtginode_create(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type, + bool init); +void xfs_rtginode_irele(struct xfs_inode **ipp); + +static inline const char *xfs_rtginode_path(xfs_rgnumber_t rgno, + enum xfs_rtg_inodes type) +{ + return kasprintf(GFP_KERNEL, "%u.%s", rgno, xfs_rtginode_name(type)); +} + +void xfs_update_rtsb(struct xfs_buf *rtsb_bp, + const struct xfs_buf *sb_bp); +struct xfs_buf *xfs_log_rtsb(struct xfs_trans *tp, + const struct xfs_buf *sb_bp); +#else +static inline void xfs_free_rtgroups(struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, xfs_rgnumber_t end_rgno) +{ +} + +static inline int xfs_initialize_rtgroups(struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, xfs_rgnumber_t end_rgno, + xfs_rtbxlen_t rextents) +{ + return 0; +} + +# define xfs_rtgroup_extents(mp, rgno) (0) +# define xfs_update_last_rtgroup_size(mp, rgno) (0) +# define xfs_rtgroup_lock(rtg, gf) ((void)0) +# define xfs_rtgroup_unlock(rtg, gf) ((void)0) +# define xfs_rtgroup_trans_join(tp, rtg, gf) ((void)0) +# define xfs_update_rtsb(bp, sb_bp) ((void)0) +# define xfs_log_rtsb(tp, sb_bp) (NULL) +# define xfs_rtgroup_get_geometry(rtg, rgeo) (-EOPNOTSUPP) +#endif /* CONFIG_XFS_RT */ + +#endif /* __LIBXFS_RTGROUP_H */ diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c new file mode 100644 index 000000000000..3db5e7a4a945 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c @@ -0,0 +1,757 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_refcount.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" +#include "xfs_metafile.h" +#include "xfs_health.h" + +static struct kmem_cache *xfs_rtrefcountbt_cur_cache; + +/* + * Realtime Reference Count btree. + * + * This is a btree used to track the reference counts of extents in the + * realtime device. See the comments in xfs_refcount_btree.c for more + * information. + * + * This tree is basically the same as the regular refcount btree except that + * it's rooted in an inode. 
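+ * + * For example, a record of {rc_startblock = 24, rc_blockcount = 8, + * rc_refcount = 2} means that each of rtgroup blocks 24-31 is mapped by + * exactly two file extents.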
+ */ + +static struct xfs_btree_cur * +xfs_rtrefcountbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_rtrefcountbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); +} + +STATIC int +xfs_rtrefcountbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0) / 2; + } + + return cur->bc_mp->m_rtrefc_mnr[level != 0]; +} + +STATIC int +xfs_rtrefcountbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0); + } + + return cur->bc_mp->m_rtrefc_mxr[level != 0]; +} + +/* + * Calculate number of records in a realtime refcount btree inode root. + */ +unsigned int +xfs_rtrefcountbt_droot_maxrecs( + unsigned int blocklen, + bool leaf) +{ + blocklen -= sizeof(struct xfs_rtrefcount_root); + + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (2 * sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_rtrefcountbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork so that + * we can resize the in-memory buffer to match it. After a resize to the + * maximum size this function returns the same value as + * xfs_rtrefcountbt_get_maxrecs for the root node, too. + */ +STATIC int +xfs_rtrefcountbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_rtrefc_mxr[level != 0]; + return xfs_rtrefcountbt_droot_maxrecs(cur->bc_ino.forksize, level == 0); +} + +STATIC void +xfs_rtrefcountbt_init_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + key->refc.rc_startblock = rec->refc.rc_startblock; +} + +STATIC void +xfs_rtrefcountbt_init_high_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->refc.rc_startblock); + x += be32_to_cpu(rec->refc.rc_blockcount) - 1; + key->refc.rc_startblock = cpu_to_be32(x); +} + +STATIC void +xfs_rtrefcountbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec->refc.rc_startblock = cpu_to_be32(start); + rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); + rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); +} + +STATIC void +xfs_rtrefcountbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +STATIC int64_t +xfs_rtrefcountbt_key_diff( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) +{ + const struct xfs_refcount_key *kp = &key->refc; + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + return (int64_t)be32_to_cpu(kp->rc_startblock) - start; +} + +STATIC int64_t +xfs_rtrefcountbt_diff_two_keys( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->refc.rc_startblock); + + return 
(int64_t)be32_to_cpu(k1->refc.rc_startblock) - + be32_to_cpu(k2->refc.rc_startblock); +} + +static xfs_failaddr_t +xfs_rtrefcountbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + int level; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (!xfs_has_reflink(mp)) + return __this_address; + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + level = be16_to_cpu(block->bb_level); + if (level > mp->m_rtrefc_maxlevels) + return __this_address; + + return xfs_btree_fsblock_verify(bp, mp->m_rtrefc_mxr[level != 0]); +} + +static void +xfs_rtrefcountbt_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (!xfs_btree_fsblock_verify_crc(bp)) + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_rtrefcountbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } + + if (bp->b_error) + trace_xfs_btree_corrupt(bp, _RET_IP_); +} + +static void +xfs_rtrefcountbt_write_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_rtrefcountbt_verify(bp); + if (fa) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + xfs_btree_fsblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = { + .name = "xfs_rtrefcountbt", + .magic = { 0, cpu_to_be32(XFS_RTREFC_CRC_MAGIC) }, + .verify_read = xfs_rtrefcountbt_read_verify, + .verify_write = xfs_rtrefcountbt_write_verify, + .verify_struct = xfs_rtrefcountbt_verify, +}; + +STATIC int +xfs_rtrefcountbt_keys_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->refc.rc_startblock) < + be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC int +xfs_rtrefcountbt_recs_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->refc.rc_startblock) + + be32_to_cpu(r1->refc.rc_blockcount) <= + be32_to_cpu(r2->refc.rc_startblock); +} + +STATIC enum xbtree_key_contig +xfs_rtrefcountbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->refc.rc_startblock); + + return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock), + be32_to_cpu(key2->refc.rc_startblock)); +} + +static inline void +xfs_rtrefcountbt_move_ptrs( + struct xfs_mount *mp, + struct xfs_btree_block *broot, + short old_size, + size_t new_size, + unsigned int numrecs) +{ + void *dptr; + void *sptr; + + sptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, old_size); + dptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, new_size); + memmove(dptr, sptr, numrecs * sizeof(xfs_rtrefcount_ptr_t)); +} + +static struct xfs_btree_block * +xfs_rtrefcountbt_broot_realloc( + struct xfs_btree_cur *cur, + unsigned int new_numrecs) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + struct xfs_btree_block *broot; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; + const unsigned int level = cur->bc_nlevels - 1; + + new_size = xfs_rtrefcount_broot_space_calc(mp, level, new_numrecs); + + /* Handle the nop case quietly. 
+ */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; + + /* + * If there wasn't any memory allocated before, just allocate + * it now and get out. + */ + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); + + /* + * If there is already an existing if_broot, then we need to + * realloc it and possibly move the node block pointers because + * those are not butted up against the btree block header. + */ + old_numrecs = xfs_rtrefcountbt_maxrecs(mp, old_size, level); + broot = xfs_broot_realloc(ifp, new_size); + if (level > 0) + xfs_rtrefcountbt_move_ptrs(mp, broot, old_size, + new_size, old_numrecs); + goto out_broot; + } + + /* + * We're reducing numrecs. If we're going all the way to zero, just + * free the block. + */ + ASSERT(ifp->if_broot != NULL && old_size > 0); + if (new_size == 0) + return xfs_broot_realloc(ifp, 0); + + /* + * Shrink the btree root by possibly moving the rtrefcountbt pointers, + * since they are not butted up against the btree block header. Then + * reallocate broot. + */ + if (level > 0) + xfs_rtrefcountbt_move_ptrs(mp, ifp->if_broot, old_size, + new_size, new_numrecs); + broot = xfs_broot_realloc(ifp, new_size); + +out_broot: + ASSERT(xfs_rtrefcount_droot_space(broot) <= + xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork)); + return broot; +} + +const struct xfs_btree_ops xfs_rtrefcountbt_ops = { + .name = "rtrefcount", + .type = XFS_BTREE_TYPE_INODE, + .geom_flags = XFS_BTGEO_IROOT_RECORDS, + + .rec_len = sizeof(struct xfs_refcount_rec), + .key_len = sizeof(struct xfs_refcount_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_REFC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rtrefcbt_2), + .sick_mask = XFS_SICK_RG_REFCNTBT, + + .dup_cursor = xfs_rtrefcountbt_dup_cursor, + .alloc_block = xfs_btree_alloc_metafile_block, + .free_block = xfs_btree_free_metafile_block, + .get_minrecs = xfs_rtrefcountbt_get_minrecs, + .get_maxrecs = xfs_rtrefcountbt_get_maxrecs, + .get_dmaxrecs = xfs_rtrefcountbt_get_dmaxrecs, + .init_key_from_rec = xfs_rtrefcountbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur, + .key_diff = xfs_rtrefcountbt_key_diff, + .buf_ops = &xfs_rtrefcountbt_buf_ops, + .diff_two_keys = xfs_rtrefcountbt_diff_two_keys, + .keys_inorder = xfs_rtrefcountbt_keys_inorder, + .recs_inorder = xfs_rtrefcountbt_recs_inorder, + .keys_contiguous = xfs_rtrefcountbt_keys_contiguous, + .broot_realloc = xfs_rtrefcountbt_broot_realloc, +}; + +/* Allocate a new rt refcount btree cursor. */ +struct xfs_btree_cur * +xfs_rtrefcountbt_init_cursor( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg) +{ + struct xfs_inode *ip = rtg_refcount(rtg); + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_btree_cur *cur; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrefcountbt_ops, + mp->m_rtrefc_maxlevels, xfs_rtrefcountbt_cur_cache); + + cur->bc_ino.ip = ip; + cur->bc_refc.nr_ops = 0; + cur->bc_refc.shape_changes = 0; + cur->bc_group = xfs_group_hold(rtg_group(rtg)); + cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK); + cur->bc_ino.whichfork = XFS_DATA_FORK; + return cur; +} + +/* + * Install a new rt reference count btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. 
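+ * This is only reached via btree staging (note the XFS_BTREE_STAGING + * assertion below), e.g. when online repair commits a rebuilt tree.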
+ */ +void +xfs_rtrefcountbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. + */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno; + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK); +} + +/* Calculate number of records in a realtime refcount btree block. */ +static inline unsigned int +xfs_rtrefcountbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Calculate number of records in a realtime refcount btree block. + */ +unsigned int +xfs_rtrefcountbt_maxrecs( + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) +{ + blocklen -= XFS_RTREFCOUNT_BLOCK_LEN; + return xfs_rtrefcountbt_block_maxrecs(blocklen, leaf); +} + +/* Compute the max possible height for realtime refcount btrees. */ +unsigned int +xfs_rtrefcountbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = xfs_rtrefcountbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rtrefcountbt_block_maxrecs(blocklen, false) / 2; + + /* We need at most one record for every block in an rt group. */ + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS); +} + +int __init +xfs_rtrefcountbt_init_cur_cache(void) +{ + xfs_rtrefcountbt_cur_cache = kmem_cache_create("xfs_rtrefcountbt_cur", + xfs_btree_cur_sizeof( + xfs_rtrefcountbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_rtrefcountbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_rtrefcountbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_rtrefcountbt_cur_cache); + xfs_rtrefcountbt_cur_cache = NULL; +} + +/* Compute the maximum height of a realtime refcount btree. */ +void +xfs_rtrefcountbt_compute_maxlevels( + struct xfs_mount *mp) +{ + unsigned int d_maxlevels, r_maxlevels; + + if (!xfs_has_rtreflink(mp)) { + mp->m_rtrefc_maxlevels = 0; + return; + } + + /* + * The realtime refcountbt lives on the data device, which means that + * its maximum height is constrained by the size of the data device and + * the height required to store one refcount record for each rtextent + * in an rt group. + */ + d_maxlevels = xfs_btree_space_to_height(mp->m_rtrefc_mnr, + mp->m_sb.sb_dblocks); + r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrefc_mnr, + mp->m_sb.sb_rgextents); + + /* Add one level to handle the inode root level. */ + mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1; +} + +/* Calculate the rtrefcount btree size for some records. */ +unsigned long long +xfs_rtrefcountbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_rtrefc_mnr, len); +} + +/* + * Calculate the maximum refcount btree size. 
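+ * The result is a worst-case block count for a tree indexing @rtblocks + * records.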
+ */ +static unsigned long long +xfs_rtrefcountbt_max_size( + struct xfs_mount *mp, + xfs_rtblock_t rtblocks) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_rtrefc_mxr[0] == 0) + return 0; + + return xfs_rtrefcountbt_calc_size(mp, rtblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + * We need enough space to hold one record for every rt extent in the rtgroup. + */ +xfs_filblks_t +xfs_rtrefcountbt_calc_reserves( + struct xfs_mount *mp) +{ + if (!xfs_has_rtreflink(mp)) + return 0; + + return xfs_rtrefcountbt_max_size(mp, mp->m_sb.sb_rgextents); +} + +/* + * Convert on-disk form of btree root to in-memory form. + */ +STATIC void +xfs_rtrefcountbt_from_disk( + struct xfs_inode *ip, + struct xfs_rtrefcount_root *dblock, + int dblocklen, + struct xfs_btree_block *rblock) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_refcount_key *fkp; + __be64 *fpp; + struct xfs_refcount_key *tkp; + __be64 *tpp; + struct xfs_refcount_rec *frp; + struct xfs_refcount_rec *trp; + unsigned int numrecs; + unsigned int maxrecs; + unsigned int rblocklen; + + rblocklen = xfs_rtrefcount_broot_space(mp, dblock); + + xfs_btree_init_block(mp, rblock, &xfs_rtrefcountbt_ops, 0, 0, + ip->i_ino); + + rblock->bb_level = dblock->bb_level; + rblock->bb_numrecs = dblock->bb_numrecs; + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrefcount_droot_key_addr(dblock, 1); + tkp = xfs_rtrefcount_key_addr(rblock, 1); + fpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); + tpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); + numrecs = be16_to_cpu(dblock->bb_numrecs); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrefcount_droot_rec_addr(dblock, 1); + trp = xfs_rtrefcount_rec_addr(rblock, 1); + numrecs = be16_to_cpu(dblock->bb_numrecs); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Load a realtime reference count btree root in from disk. */ +int +xfs_iformat_rtrefcount( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + struct xfs_btree_block *broot; + unsigned int numrecs; + unsigned int level; + int dsize; + + /* + * growfs must create the rtrefcount inodes before adding a realtime + * volume to the filesystem, so we cannot use the rtrefcount predicate + * here. + */ + if (!xfs_has_reflink(ip->i_mount)) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); + numrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > mp->m_rtrefc_maxlevels || + xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), + xfs_rtrefcount_broot_space_calc(mp, level, numrecs)); + if (broot) + xfs_rtrefcountbt_from_disk(ip, dfp, dsize, broot); + return 0; +} + +/* + * Convert in-memory form of btree root to on-disk form. 
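+ * The key and pointer arrays are copied separately because the pointer + * array begins at an offset determined by each layout's own maxrecs value.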
+ */ +void +xfs_rtrefcountbt_to_disk( + struct xfs_mount *mp, + struct xfs_btree_block *rblock, + int rblocklen, + struct xfs_rtrefcount_root *dblock, + int dblocklen) +{ + struct xfs_refcount_key *fkp; + __be64 *fpp; + struct xfs_refcount_key *tkp; + __be64 *tpp; + struct xfs_refcount_rec *frp; + struct xfs_refcount_rec *trp; + unsigned int maxrecs; + unsigned int numrecs; + + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTREFC_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); + + dblock->bb_level = rblock->bb_level; + dblock->bb_numrecs = rblock->bb_numrecs; + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrefcount_key_addr(rblock, 1); + tkp = xfs_rtrefcount_droot_key_addr(dblock, 1); + fpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); + tpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); + numrecs = be16_to_cpu(rblock->bb_numrecs); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrefcount_rec_addr(rblock, 1); + trp = xfs_rtrefcount_droot_rec_addr(dblock, 1); + numrecs = be16_to_cpu(rblock->bb_numrecs); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Flush a realtime reference count btree root out to disk. */ +void +xfs_iflush_rtrefcount( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + ASSERT(ifp->if_broot != NULL); + ASSERT(ifp->if_broot_bytes > 0); + ASSERT(xfs_rtrefcount_droot_space(ifp->if_broot) <= + xfs_inode_fork_size(ip, XFS_DATA_FORK)); + xfs_rtrefcountbt_to_disk(ip->i_mount, ifp->if_broot, + ifp->if_broot_bytes, dfp, + XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); +} + +/* + * Create a realtime refcount btree inode. + */ +int +xfs_rtrefcountbt_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *broot; + + ifp->if_format = XFS_DINODE_FMT_META_BTREE; + ASSERT(ifp->if_broot_bytes == 0); + ASSERT(ifp->if_bytes == 0); + + /* Initialize the empty incore btree root. */ + broot = xfs_broot_realloc(ifp, + xfs_rtrefcount_broot_space_calc(mp, 0, 0)); + if (broot) + xfs_btree_init_block(mp, broot, &xfs_rtrefcountbt_ops, 0, 0, + ip->i_ino); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h new file mode 100644 index 000000000000..a99b7a8aec86 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_RTREFCOUNT_BTREE_H__ +#define __XFS_RTREFCOUNT_BTREE_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; +struct xbtree_ifakeroot; +struct xfs_rtgroup; + +/* refcounts only exist on crc enabled filesystems */ +#define XFS_RTREFCOUNT_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN + +struct xfs_btree_cur *xfs_rtrefcountbt_init_cursor(struct xfs_trans *tp, + struct xfs_rtgroup *rtg); +struct xfs_btree_cur *xfs_rtrefcountbt_stage_cursor(struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake); +void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp); +unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp, + unsigned int blocklen, bool leaf); +void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp); +unsigned int xfs_rtrefcountbt_droot_maxrecs(unsigned int blocklen, bool leaf); + +/* + * Addresses of records, keys, and pointers within an incore rtrefcountbt block. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +static inline struct xfs_refcount_rec * +xfs_rtrefcount_rec_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_refcount_rec *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_refcount_rec)); +} + +static inline struct xfs_refcount_key * +xfs_rtrefcount_key_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_refcount_key *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_refcount_key)); +} + +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_ptr_addr( + struct xfs_btree_block *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrefcount_ptr_t *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + maxrecs * sizeof(struct xfs_refcount_key) + + (index - 1) * sizeof(xfs_rtrefcount_ptr_t)); +} + +unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void); +int __init xfs_rtrefcountbt_init_cur_cache(void); +void xfs_rtrefcountbt_destroy_cur_cache(void); + +xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp); +unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp, + unsigned long long len); + +/* Addresses of key, pointers, and records within an ondisk rtrefcount block. */ + +static inline struct xfs_refcount_rec * +xfs_rtrefcount_droot_rec_addr( + struct xfs_rtrefcount_root *block, + unsigned int index) +{ + return (struct xfs_refcount_rec *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_refcount_rec)); +} + +static inline struct xfs_refcount_key * +xfs_rtrefcount_droot_key_addr( + struct xfs_rtrefcount_root *block, + unsigned int index) +{ + return (struct xfs_refcount_key *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_refcount_key)); +} + +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_droot_ptr_addr( + struct xfs_rtrefcount_root *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrefcount_ptr_t *) + ((char *)(block + 1) + + maxrecs * sizeof(struct xfs_refcount_key) + + (index - 1) * sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Address of pointers within the incore btree root. + * + * These are to be used when we know the size of the block and + * we don't have a cursor. 
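The address helpers above are plain pointer arithmetic over a fixed layout: entries packed directly behind the block header, indexed from 1. An illustrative standalone version, assuming a 72-byte long-format CRC block header and the same 12/4/8-byte record/key/pointer sizes:

#include <stddef.h>

/* Assumed sizes: 72-byte long-format CRC block header plus the refcount
 * record/key/pointer sizes noted above. */
enum { HDR = 72, REC = 12, KEY = 4, PTR = 8 };

/* Byte offset of leaf record @index; indices are 1-based, as in the
 * helpers above. */
size_t rec_offset(unsigned int index)
{
	return HDR + (index - 1) * REC;
}

/* In node blocks all keys precede the first pointer, so the pointer area
 * floats with maxrecs -- which is why xfs_rtrefcount_ptr_addr() must be
 * told the maxrecs for the block. */
size_t ptr_offset(unsigned int index, unsigned int maxrecs)
{
	return HDR + maxrecs * KEY + (index - 1) * PTR;
}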
+ */ +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_broot_ptr_addr( + struct xfs_mount *mp, + struct xfs_btree_block *bb, + unsigned int index, + unsigned int block_size) +{ + return xfs_rtrefcount_ptr_addr(bb, index, + xfs_rtrefcountbt_maxrecs(mp, block_size, false)); +} + +/* + * Compute the space required for the incore btree root containing the given + * number of records. + */ +static inline size_t +xfs_rtrefcount_broot_space_calc( + struct xfs_mount *mp, + unsigned int level, + unsigned int nrecs) +{ + size_t sz = XFS_RTREFCOUNT_BLOCK_LEN; + + if (level > 0) + return sz + nrecs * (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); + return sz + nrecs * sizeof(struct xfs_refcount_rec); +} + +/* + * Compute the space required for the incore btree root given the ondisk + * btree root block. + */ +static inline size_t +xfs_rtrefcount_broot_space(struct xfs_mount *mp, struct xfs_rtrefcount_root *bb) +{ + return xfs_rtrefcount_broot_space_calc(mp, be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +/* Compute the space required for the ondisk root block. */ +static inline size_t +xfs_rtrefcount_droot_space_calc( + unsigned int level, + unsigned int nrecs) +{ + size_t sz = sizeof(struct xfs_rtrefcount_root); + + if (level > 0) + return sz + nrecs * (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); + return sz + nrecs * sizeof(struct xfs_refcount_rec); +} + +/* + * Compute the space required for the ondisk root block given an incore root + * block. + */ +static inline size_t +xfs_rtrefcount_droot_space(struct xfs_btree_block *bb) +{ + return xfs_rtrefcount_droot_space_calc(be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +int xfs_iformat_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip); +void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp, + struct xfs_btree_block *rblock, int rblocklen, + struct xfs_rtrefcount_root *dblock, int dblocklen); +void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip); + +int xfs_rtrefcountbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); + +#endif /* __XFS_RTREFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c new file mode 100644 index 000000000000..9bdc2cbfc113 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -0,0 +1,1054 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_metafile.h" +#include "xfs_rmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_rtgroup.h" +#include "xfs_bmap.h" +#include "xfs_health.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" + +static struct kmem_cache *xfs_rtrmapbt_cur_cache; + +/* + * Realtime Reverse Map btree. + * + * This is a btree used to track the owner(s) of a given extent in the realtime + * device. See the comments in xfs_rmap_btree.c for more information. 
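For a sense of the fanout the rtrmap btree gets, here is the leaf and node maxrecs arithmetic on an assumed 4096-byte block; the 72-byte header and 24/20/8-byte rmap record/key/pointer sizes are assumptions about the on-disk format:

#include <stdio.h>

/* Assumed on-disk sizes: 72-byte long CRC header, 24-byte rmap records,
 * 20-byte rmap keys (stored in pairs for overlapping btrees), 8-byte
 * block pointers. */
enum { HDR = 72, REC = 24, KEY = 20, PTR = 8 };

int main(void)
{
	unsigned int payload = 4096 - HDR;

	/* Same arithmetic as xfs_rtrmapbt_block_maxrecs() on a 4k block. */
	printf("leaf maxrecs: %u\n", payload / REC);		 /* 167 */
	printf("node maxrecs: %u\n", payload / (2 * KEY + PTR)); /* 83 */
	return 0;
}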
+ * + * This tree is basically the same as the regular rmap btree except that it + * is rooted in an inode and does not live in free space. + */ + +static struct xfs_btree_cur * +xfs_rtrmapbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_rtrmapbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); +} + +STATIC int +xfs_rtrmapbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0) / 2; + } + + return cur->bc_mp->m_rtrmap_mnr[level != 0]; +} + +STATIC int +xfs_rtrmapbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0); + } + + return cur->bc_mp->m_rtrmap_mxr[level != 0]; +} + +/* Calculate number of records in the ondisk realtime rmap btree inode root. */ +unsigned int +xfs_rtrmapbt_droot_maxrecs( + unsigned int blocklen, + bool leaf) +{ + blocklen -= sizeof(struct xfs_rtrmap_root); + + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / (2 * sizeof(struct xfs_rmap_key) + + sizeof(xfs_rtrmap_ptr_t)); +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_rtrmapbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork + * so that we can resize the in-memory buffer to match it. After a + * resize to the maximum size this function returns the same value + * as xfs_rtrmapbt_get_maxrecs for the root node, too. + */ +STATIC int +xfs_rtrmapbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_rtrmap_mxr[level != 0]; + return xfs_rtrmapbt_droot_maxrecs(cur->bc_ino.forksize, level == 0); +} + +/* + * Convert the ondisk record's offset field into the ondisk key's offset field. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. 
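A standalone illustration of that masking; the flag's bit position (61) is an assumption about the rmap on-disk format:

#include <stdint.h>

/* Bit position assumed from the rmap on-disk format. */
#define OFF_UNWRITTEN	(1ULL << 61)

/* Two mappings that differ only in written state must compare equal, so
 * the flag is stripped from both ondisk keys and incore comparisons. */
uint64_t key_offset(uint64_t rm_offset)
{
	return rm_offset & ~OFF_UNWRITTEN;
}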
+ */ +static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec) +{ + return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN); +} + +STATIC void +xfs_rtrmapbt_init_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + key->rmap.rm_startblock = rec->rmap.rm_startblock; + key->rmap.rm_owner = rec->rmap.rm_owner; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); +} + +STATIC void +xfs_rtrmapbt_init_high_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + uint64_t off; + int adj; + + adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; + + key->rmap.rm_startblock = rec->rmap.rm_startblock; + be32_add_cpu(&key->rmap.rm_startblock, adj); + key->rmap.rm_owner = rec->rmap.rm_owner; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); + if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || + XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) + return; + off = be64_to_cpu(key->rmap.rm_offset); + off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK); + key->rmap.rm_offset = cpu_to_be64(off); +} + +STATIC void +xfs_rtrmapbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock); + rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount); + rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner); + rec->rmap.rm_offset = cpu_to_be64( + xfs_rmap_irec_offset_pack(&cur->bc_rec.r)); +} + +STATIC void +xfs_rtrmapbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +/* + * Mask the appropriate parts of the ondisk key field for a key comparison. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. 
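A worked example of the high-key arithmetic above, with the 53-bit offset field width assumed from the rmap on-disk format:

#include <stdint.h>
#include <stdio.h>

/* Assumed: the file offset occupies the low 53 bits of rm_offset; flag
 * bits live above it. */
#define OFF_MASK	((uint64_t)0x1FFFFFFFFFFFFFULL)

int main(void)
{
	uint64_t start = 100, len = 8, off = 50;	/* sample record */
	uint64_t adj = len - 1;

	/* Same arithmetic as xfs_rtrmapbt_init_high_key_from_rec(): the
	 * high key names the last block of the mapping in both spaces,
	 * carrying the flag bits through untouched. */
	uint64_t hi_start = start + adj;
	uint64_t hi_off = ((off + adj) & OFF_MASK) | (off & ~OFF_MASK);

	printf("high key: start %llu offset %llu\n",	/* 107, 57 */
	       (unsigned long long)hi_start, (unsigned long long)hi_off);
	return 0;
}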
+ */ +static inline uint64_t offset_keymask(uint64_t offset) +{ + return offset & ~XFS_RMAP_OFF_UNWRITTEN; +} + +STATIC int64_t +xfs_rtrmapbt_key_diff( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) +{ + struct xfs_rmap_irec *rec = &cur->bc_rec.r; + const struct xfs_rmap_key *kp = &key->rmap; + __u64 x, y; + int64_t d; + + d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; + if (d) + return d; + + x = be64_to_cpu(kp->rm_owner); + y = rec->rm_owner; + if (x > y) + return 1; + else if (y > x) + return -1; + + x = offset_keymask(be64_to_cpu(kp->rm_offset)); + y = offset_keymask(xfs_rmap_irec_offset_pack(rec)); + if (x > y) + return 1; + else if (y > x) + return -1; + return 0; +} + +STATIC int64_t +xfs_rtrmapbt_diff_two_keys( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) +{ + const struct xfs_rmap_key *kp1 = &k1->rmap; + const struct xfs_rmap_key *kp2 = &k2->rmap; + int64_t d; + __u64 x, y; + + /* Doesn't make sense to mask off the physical space part */ + ASSERT(!mask || mask->rmap.rm_startblock); + + d = (int64_t)be32_to_cpu(kp1->rm_startblock) - + be32_to_cpu(kp2->rm_startblock); + if (d) + return d; + + if (!mask || mask->rmap.rm_owner) { + x = be64_to_cpu(kp1->rm_owner); + y = be64_to_cpu(kp2->rm_owner); + if (x > y) + return 1; + else if (y > x) + return -1; + } + + if (!mask || mask->rmap.rm_offset) { + /* Doesn't make sense to allow offset but not owner */ + ASSERT(!mask || mask->rmap.rm_owner); + + x = offset_keymask(be64_to_cpu(kp1->rm_offset)); + y = offset_keymask(be64_to_cpu(kp2->rm_offset)); + if (x > y) + return 1; + else if (y > x) + return -1; + } + + return 0; +} + +static xfs_failaddr_t +xfs_rtrmapbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + int level; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (!xfs_has_rmapbt(mp)) + return __this_address; + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + level = be16_to_cpu(block->bb_level); + if (level > mp->m_rtrmap_maxlevels) + return __this_address; + + return xfs_btree_fsblock_verify(bp, mp->m_rtrmap_mxr[level != 0]); +} + +static void +xfs_rtrmapbt_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (!xfs_btree_fsblock_verify_crc(bp)) + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_rtrmapbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } + + if (bp->b_error) + trace_xfs_btree_corrupt(bp, _RET_IP_); +} + +static void +xfs_rtrmapbt_write_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_rtrmapbt_verify(bp); + if (fa) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + xfs_btree_fsblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = { + .name = "xfs_rtrmapbt", + .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) }, + .verify_read = xfs_rtrmapbt_read_verify, + .verify_write = xfs_rtrmapbt_write_verify, + .verify_struct = xfs_rtrmapbt_verify, +}; + +STATIC int +xfs_rtrmapbt_keys_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) +{ + uint32_t x; + uint32_t y; + uint64_t a; + uint64_t b; + + x = be32_to_cpu(k1->rmap.rm_startblock); + y = be32_to_cpu(k2->rmap.rm_startblock); + if (x < y) + return 1; + else if (x > y) + return 0; + a 
= be64_to_cpu(k1->rmap.rm_owner); + b = be64_to_cpu(k2->rmap.rm_owner); + if (a < b) + return 1; + else if (a > b) + return 0; + a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset)); + if (a <= b) + return 1; + return 0; +} + +STATIC int +xfs_rtrmapbt_recs_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) +{ + uint32_t x; + uint32_t y; + uint64_t a; + uint64_t b; + + x = be32_to_cpu(r1->rmap.rm_startblock); + y = be32_to_cpu(r2->rmap.rm_startblock); + if (x < y) + return 1; + else if (x > y) + return 0; + a = be64_to_cpu(r1->rmap.rm_owner); + b = be64_to_cpu(r2->rmap.rm_owner); + if (a < b) + return 1; + else if (a > b) + return 0; + a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset)); + if (a <= b) + return 1; + return 0; +} + +STATIC enum xbtree_key_contig +xfs_rtrmapbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->rmap.rm_startblock); + + /* + * We only support checking contiguity of the physical space component. + * If any callers ever need more specificity than that, they'll have to + * implement it here. + */ + ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset)); + + return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock), + be32_to_cpu(key2->rmap.rm_startblock)); +} + +static inline void +xfs_rtrmapbt_move_ptrs( + struct xfs_mount *mp, + struct xfs_btree_block *broot, + short old_size, + size_t new_size, + unsigned int numrecs) +{ + void *dptr; + void *sptr; + + sptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, old_size); + dptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, new_size); + memmove(dptr, sptr, numrecs * sizeof(xfs_rtrmap_ptr_t)); +} + +static struct xfs_btree_block * +xfs_rtrmapbt_broot_realloc( + struct xfs_btree_cur *cur, + unsigned int new_numrecs) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + struct xfs_btree_block *broot; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; + const unsigned int level = cur->bc_nlevels - 1; + + new_size = xfs_rtrmap_broot_space_calc(mp, level, new_numrecs); + + /* Handle the nop case quietly. */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; + + /* + * If there wasn't any memory allocated before, just allocate + * it now and get out. + */ + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); + + /* + * If there is already an existing if_broot, then we need to + * realloc it and possibly move the node block pointers because + * those are not butted up against the btree block header. + */ + old_numrecs = xfs_rtrmapbt_maxrecs(mp, old_size, level == 0); + broot = xfs_broot_realloc(ifp, new_size); + if (level > 0) + xfs_rtrmapbt_move_ptrs(mp, broot, old_size, new_size, + old_numrecs); + goto out_broot; + } + + /* + * We're reducing numrecs. If we're going all the way to zero, just + * free the block. + */ + ASSERT(ifp->if_broot != NULL && old_size > 0); + if (new_size == 0) + return xfs_broot_realloc(ifp, 0); + + /* + * Shrink the btree root by possibly moving the rtrmapbt pointers, + * since they are not butted up against the btree block header. Then + * reallocate broot. 
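The reason the pointers move at all: in a node block the pointer area begins after maxrecs key pairs, and maxrecs is derived from the root buffer size. An illustrative computation of where that area lands for a given if_broot size (sizes assumed as before):

#include <stddef.h>

/* Assumed rtrmap sizes, as above. */
enum { HDR = 72, KEY = 20, PTR = 8 };

/* The node-block pointer area starts after maxrecs key pairs, so
 * resizing if_broot moves it; this is the relocation that
 * xfs_rtrmapbt_move_ptrs() performs around the realloc. */
size_t ptr_area_offset(size_t broot_bytes)
{
	unsigned int maxrecs = (broot_bytes - HDR) / (2 * KEY + PTR);

	return HDR + maxrecs * 2 * KEY;
}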
+ */ + if (level > 0) + xfs_rtrmapbt_move_ptrs(mp, ifp->if_broot, old_size, new_size, + new_numrecs); + broot = xfs_broot_realloc(ifp, new_size); + +out_broot: + ASSERT(xfs_rtrmap_droot_space(broot) <= + xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork)); + return broot; +} + +const struct xfs_btree_ops xfs_rtrmapbt_ops = { + .name = "rtrmap", + .type = XFS_BTREE_TYPE_INODE, + .geom_flags = XFS_BTGEO_OVERLAPPING | + XFS_BTGEO_IROOT_RECORDS, + + .rec_len = sizeof(struct xfs_rmap_rec), + /* Overlapping btree; 2 keys per pointer. */ + .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_2), + .sick_mask = XFS_SICK_RG_RMAPBT, + + .dup_cursor = xfs_rtrmapbt_dup_cursor, + .alloc_block = xfs_btree_alloc_metafile_block, + .free_block = xfs_btree_free_metafile_block, + .get_minrecs = xfs_rtrmapbt_get_minrecs, + .get_maxrecs = xfs_rtrmapbt_get_maxrecs, + .get_dmaxrecs = xfs_rtrmapbt_get_dmaxrecs, + .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_rtrmapbt_init_ptr_from_cur, + .key_diff = xfs_rtrmapbt_key_diff, + .buf_ops = &xfs_rtrmapbt_buf_ops, + .diff_two_keys = xfs_rtrmapbt_diff_two_keys, + .keys_inorder = xfs_rtrmapbt_keys_inorder, + .recs_inorder = xfs_rtrmapbt_recs_inorder, + .keys_contiguous = xfs_rtrmapbt_keys_contiguous, + .broot_realloc = xfs_rtrmapbt_broot_realloc, +}; + +/* Allocate a new rt rmap btree cursor. */ +struct xfs_btree_cur * +xfs_rtrmapbt_init_cursor( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg) +{ + struct xfs_inode *ip = rtg_rmap(rtg); + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_btree_cur *cur; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_ops, + mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache); + + cur->bc_ino.ip = ip; + cur->bc_group = xfs_group_hold(rtg_group(rtg)); + cur->bc_ino.whichfork = XFS_DATA_FORK; + cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK); + + return cur; +} + +#ifdef CONFIG_XFS_BTREE_IN_MEM +/* + * Validate an in-memory realtime rmap btree block. Callers are allowed to + * generate an in-memory btree even if the ondisk feature is not enabled. 
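For orientation, a hypothetical walk of one rtgroup's rmap btree using the cursor constructor above; rmap_walk_fn is a made-up callback, error handling is trimmed, and this is a sketch against kernel APIs rather than a compilable unit:

/* Hypothetical query of an rtgroup's rmap btree. */
int count_rtg_rmaps(struct xfs_trans *tp, struct xfs_rtgroup *rtg)
{
	struct xfs_btree_cur	*cur;
	int			error;

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);	/* takes the ILOCK */
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_all(cur, rmap_walk_fn, NULL);
	xfs_btree_del_cursor(cur, error);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	return error;
}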
+ */ +static xfs_failaddr_t +xfs_rtrmapbt_mem_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + unsigned int level; + unsigned int maxrecs; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + + level = be16_to_cpu(block->bb_level); + if (xfs_has_rmapbt(mp)) { + if (level >= mp->m_rtrmap_maxlevels) + return __this_address; + } else { + if (level >= xfs_rtrmapbt_maxlevels_ondisk()) + return __this_address; + } + + maxrecs = xfs_rtrmapbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0); + return xfs_btree_memblock_verify(bp, maxrecs); +} + +static void +xfs_rtrmapbt_mem_rw_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa = xfs_rtrmapbt_mem_verify(bp); + + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +/* skip crc checks on in-memory btrees to save time */ +static const struct xfs_buf_ops xfs_rtrmapbt_mem_buf_ops = { + .name = "xfs_rtrmapbt_mem", + .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) }, + .verify_read = xfs_rtrmapbt_mem_rw_verify, + .verify_write = xfs_rtrmapbt_mem_rw_verify, + .verify_struct = xfs_rtrmapbt_mem_verify, +}; + +const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = { + .type = XFS_BTREE_TYPE_MEM, + .geom_flags = XFS_BTGEO_OVERLAPPING, + + .rec_len = sizeof(struct xfs_rmap_rec), + /* Overlapping btree; 2 keys per pointer. */ + .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_mem_2), + + .dup_cursor = xfbtree_dup_cursor, + .set_root = xfbtree_set_root, + .alloc_block = xfbtree_alloc_block, + .free_block = xfbtree_free_block, + .get_minrecs = xfbtree_get_minrecs, + .get_maxrecs = xfbtree_get_maxrecs, + .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfbtree_init_ptr_from_cur, + .key_diff = xfs_rtrmapbt_key_diff, + .buf_ops = &xfs_rtrmapbt_mem_buf_ops, + .diff_two_keys = xfs_rtrmapbt_diff_two_keys, + .keys_inorder = xfs_rtrmapbt_keys_inorder, + .recs_inorder = xfs_rtrmapbt_recs_inorder, + .keys_contiguous = xfs_rtrmapbt_keys_contiguous, +}; + +/* Create a cursor for an in-memory btree. */ +struct xfs_btree_cur * +xfs_rtrmapbt_mem_cursor( + struct xfs_rtgroup *rtg, + struct xfs_trans *tp, + struct xfbtree *xfbt) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_btree_cur *cur; + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_mem_ops, + mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache); + cur->bc_mem.xfbtree = xfbt; + cur->bc_nlevels = xfbt->nlevels; + cur->bc_group = xfs_group_hold(rtg_group(rtg)); + return cur; +} + +/* Create an in-memory realtime rmap btree. */ +int +xfs_rtrmapbt_mem_init( + struct xfs_mount *mp, + struct xfbtree *xfbt, + struct xfs_buftarg *btp, + xfs_rgnumber_t rgno) +{ + xfbt->owner = rgno; + return xfbtree_init(mp, xfbt, btp, &xfs_rtrmapbt_mem_ops); +} +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + +/* + * Install a new rt reverse mapping btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. 
+ */ +void +xfs_rtrmapbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. + */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno; + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK); +} + +/* Calculate number of records in a rt reverse mapping btree block. */ +static inline unsigned int +xfs_rtrmapbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / + (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t)); +} + +/* + * Calculate number of records in an rt reverse mapping btree block. + */ +unsigned int +xfs_rtrmapbt_maxrecs( + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) +{ + blocklen -= XFS_RTRMAP_BLOCK_LEN; + return xfs_rtrmapbt_block_maxrecs(blocklen, leaf); +} + +/* Compute the max possible height for realtime reverse mapping btrees. */ +unsigned int +xfs_rtrmapbt_maxlevels_ondisk(void) +{ + unsigned long long max_dblocks; + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2; + + /* + * Compute the asymptotic maxlevels for an rtrmapbt on any rtreflink fs. + * + * On a reflink filesystem, each block in an rtgroup can have up to + * 2^32 (per the refcount record format) owners, which means that + * theoretically we could face up to 2^64 rmap records. However, we're + * likely to run out of blocks in the data device long before that + * happens, which means that we must compute the max height based on + * what the btree will look like if it consumes almost all the blocks + * in the data device due to maximal sharing factor. + */ + max_dblocks = -1U; /* max ag count */ + max_dblocks *= XFS_MAX_CRC_AG_BLOCKS; + return xfs_btree_space_to_height(minrecs, max_dblocks); +} + +int __init +xfs_rtrmapbt_init_cur_cache(void) +{ + xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur", + xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_rtrmapbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_rtrmapbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_rtrmapbt_cur_cache); + xfs_rtrmapbt_cur_cache = NULL; +} + +/* Compute the maximum height of an rt reverse mapping btree. */ +void +xfs_rtrmapbt_compute_maxlevels( + struct xfs_mount *mp) +{ + unsigned int d_maxlevels, r_maxlevels; + + if (!xfs_has_rtrmapbt(mp)) { + mp->m_rtrmap_maxlevels = 0; + return; + } + + /* + * The realtime rmapbt lives on the data device, which means that its + * maximum height is constrained by the size of the data device and + * the height required to store one rmap record for each block in an + * rt group. 
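A standalone paraphrase of the height computation used by xfs_rtrmapbt_maxlevels_ondisk(), modelled on xfs_btree_space_to_height(); treat it as a sketch, not the kernel's exact implementation:

/* Smallest height whose minimally loaded (half-full) interior nodes can
 * address @blocks btree blocks; the root counts toward the total. */
unsigned int space_to_height(unsigned int node_minrecs,
			     unsigned long long blocks)
{
	unsigned long long node_blocks = node_minrecs;
	unsigned long long blocks_left = blocks - 1;
	unsigned int height = 2;

	if (blocks < 1)
		return 0;

	while (node_blocks < blocks_left) {
		blocks_left -= node_blocks;
		node_blocks *= node_minrecs;
		height++;
	}
	return height;
}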
+ * + * On a reflink filesystem, each rt block can have up to 2^32 (per the + * refcount record format) owners, which means that theoretically we + * could face up to 2^64 rmap records. This makes the computation of + * maxlevels based on record count meaningless, so we only consider the + * size of the data device. + */ + d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr, + mp->m_sb.sb_dblocks); + if (xfs_has_rtreflink(mp)) { + mp->m_rtrmap_maxlevels = d_maxlevels + 1; + return; + } + + r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr, + mp->m_groups[XG_TYPE_RTG].blocks); + + /* Add one level to handle the inode root level. */ + mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1; +} + +/* Calculate the rtrmap btree size for some records. */ +unsigned long long +xfs_rtrmapbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_rtrmap_mnr, len); +} + +/* + * Calculate the maximum rmap btree size. + */ +static unsigned long long +xfs_rtrmapbt_max_size( + struct xfs_mount *mp, + xfs_rtblock_t rtblocks) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_rtrmap_mxr[0] == 0) + return 0; + + return xfs_rtrmapbt_calc_size(mp, rtblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +xfs_filblks_t +xfs_rtrmapbt_calc_reserves( + struct xfs_mount *mp) +{ + uint32_t blocks = mp->m_groups[XG_TYPE_RTG].blocks; + + if (!xfs_has_rtrmapbt(mp)) + return 0; + + /* Reserve 1% of the rtgroup or enough for 1 block per record. */ + return max_t(xfs_filblks_t, blocks / 100, + xfs_rtrmapbt_max_size(mp, blocks)); +} + +/* Convert on-disk form of btree root to in-memory form. */ +STATIC void +xfs_rtrmapbt_from_disk( + struct xfs_inode *ip, + struct xfs_rtrmap_root *dblock, + unsigned int dblocklen, + struct xfs_btree_block *rblock) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_rmap_key *fkp; + __be64 *fpp; + struct xfs_rmap_key *tkp; + __be64 *tpp; + struct xfs_rmap_rec *frp; + struct xfs_rmap_rec *trp; + unsigned int rblocklen = xfs_rtrmap_broot_space(mp, dblock); + unsigned int numrecs; + unsigned int maxrecs; + + xfs_btree_init_block(mp, rblock, &xfs_rtrmapbt_ops, 0, 0, ip->i_ino); + + rblock->bb_level = dblock->bb_level; + rblock->bb_numrecs = dblock->bb_numrecs; + numrecs = be16_to_cpu(dblock->bb_numrecs); + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrmap_droot_key_addr(dblock, 1); + tkp = xfs_rtrmap_key_addr(rblock, 1); + fpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs); + tpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrmap_droot_rec_addr(dblock, 1); + trp = xfs_rtrmap_rec_addr(rblock, 1); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Load a realtime reverse mapping btree root in from disk. */ +int +xfs_iformat_rtrmap( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + struct xfs_btree_block *broot; + unsigned int numrecs; + unsigned int level; + int dsize; + + /* + * growfs must create the rtrmap inodes before adding a realtime volume + * to the filesystem, so we cannot use the rtrmapbt predicate here. 
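The size calculator above delegates to the generic btree geometry code; an illustrative standalone version of that computation, modelled on xfs_btree_calc_size():

/* Worst-case block count for @len records when every leaf and node holds
 * only minrecs entries.  A single record needs no blocks at all because
 * it fits in the inode root. */
unsigned long long btree_calc_size(unsigned int leaf_minrecs,
				   unsigned int node_minrecs,
				   unsigned long long len)
{
	unsigned long long rval = 0;
	unsigned int per_block = leaf_minrecs;

	while (len > 1) {
		len = (len + per_block - 1) / per_block;
		rval += len;
		per_block = node_minrecs;
	}
	return rval;
}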
+ */ + if (!xfs_has_rmapbt(ip->i_mount)) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); + numrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > mp->m_rtrmap_maxlevels || + xfs_rtrmap_droot_space_calc(level, numrecs) > dsize) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), + xfs_rtrmap_broot_space_calc(mp, level, numrecs)); + if (broot) + xfs_rtrmapbt_from_disk(ip, dfp, dsize, broot); + return 0; +} + +/* Convert in-memory form of btree root to on-disk form. */ +void +xfs_rtrmapbt_to_disk( + struct xfs_mount *mp, + struct xfs_btree_block *rblock, + unsigned int rblocklen, + struct xfs_rtrmap_root *dblock, + unsigned int dblocklen) +{ + struct xfs_rmap_key *fkp; + __be64 *fpp; + struct xfs_rmap_key *tkp; + __be64 *tpp; + struct xfs_rmap_rec *frp; + struct xfs_rmap_rec *trp; + unsigned int numrecs; + unsigned int maxrecs; + + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTRMAP_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); + + dblock->bb_level = rblock->bb_level; + dblock->bb_numrecs = rblock->bb_numrecs; + numrecs = be16_to_cpu(rblock->bb_numrecs); + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrmap_key_addr(rblock, 1); + tkp = xfs_rtrmap_droot_key_addr(dblock, 1); + fpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen); + tpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrmap_rec_addr(rblock, 1); + trp = xfs_rtrmap_droot_rec_addr(dblock, 1); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Flush a realtime reverse mapping btree root out to disk. */ +void +xfs_iflush_rtrmap( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + ASSERT(ifp->if_broot != NULL); + ASSERT(ifp->if_broot_bytes > 0); + ASSERT(xfs_rtrmap_droot_space(ifp->if_broot) <= + xfs_inode_fork_size(ip, XFS_DATA_FORK)); + xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes, + dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); +} + +/* + * Create a realtime rmap btree inode. + */ +int +xfs_rtrmapbt_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *broot; + + ifp->if_format = XFS_DINODE_FMT_META_BTREE; + ASSERT(ifp->if_broot_bytes == 0); + ASSERT(ifp->if_bytes == 0); + + /* Initialize the empty incore btree root. */ + broot = xfs_broot_realloc(ifp, xfs_rtrmap_broot_space_calc(mp, 0, 0)); + if (broot) + xfs_btree_init_block(mp, broot, &xfs_rtrmapbt_ops, 0, 0, + ip->i_ino); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT); + + return 0; +} + +/* + * Initialize an rmap for a realtime superblock using the potentially updated + * rt geometry in the provided @mp. 
+ */ +int +xfs_rtrmapbt_init_rtsb( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg, + struct xfs_trans *tp) +{ + struct xfs_rmap_irec rmap = { + .rm_blockcount = mp->m_sb.sb_rextsize, + .rm_owner = XFS_RMAP_OWN_FS, + }; + struct xfs_btree_cur *cur; + int error; + + ASSERT(xfs_has_rtsb(mp)); + ASSERT(rtg_rgno(rtg) == 0); + + cur = xfs_rtrmapbt_init_cursor(tp, rtg); + error = xfs_rmap_map_raw(cur, &rmap); + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Return the highest rgbno currently tracked by the rmap for this rtg. + */ +xfs_rgblock_t +xfs_rtrmap_highest_rgbno( + struct xfs_rtgroup *rtg) +{ + struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot; + union xfs_btree_key key = {}; + struct xfs_btree_cur *cur; + + if (block->bb_numrecs == 0) + return NULLRGBLOCK; + cur = xfs_rtrmapbt_init_cursor(NULL, rtg); + xfs_btree_get_keys(cur, block, &key); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock); +} diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h new file mode 100644 index 000000000000..e328fd62a149 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h @@ -0,0 +1,212 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_RTRMAP_BTREE_H__ +#define __XFS_RTRMAP_BTREE_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; +struct xbtree_ifakeroot; +struct xfs_rtgroup; +struct xfbtree; + +/* rmaps only exist on crc enabled filesystems */ +#define XFS_RTRMAP_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN + +struct xfs_btree_cur *xfs_rtrmapbt_init_cursor(struct xfs_trans *tp, + struct xfs_rtgroup *rtg); +struct xfs_btree_cur *xfs_rtrmapbt_stage_cursor(struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake); +void xfs_rtrmapbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp); +unsigned int xfs_rtrmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); +void xfs_rtrmapbt_compute_maxlevels(struct xfs_mount *mp); +unsigned int xfs_rtrmapbt_droot_maxrecs(unsigned int blocklen, bool leaf); + +/* + * Addresses of records, keys, and pointers within an incore rtrmapbt block. 
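The incore layout encoded by the helpers that follow keeps the low and high keys of this overlapping btree as interleaved pairs. A standalone sketch of the resulting offsets, with the 72-byte header and 20-byte key size assumed from the on-disk format:

#include <stddef.h>

enum { HDR = 72, KEY = 20 };	/* assumed: long CRC header, xfs_rmap_key */

/* Node blocks store key pairs interleaved: lo(1) hi(1) lo(2) hi(2) ...
 * The droot variants later in the header do the same behind the smaller
 * ondisk root header. */
size_t low_key_offset(unsigned int index)
{
	return HDR + (index - 1) * 2 * KEY;
}

size_t high_key_offset(unsigned int index)
{
	return HDR + KEY + (index - 1) * 2 * KEY;
}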
+ * + * (note that some of these may appear unused, but they are used in userspace) + */ +static inline struct xfs_rmap_rec * +xfs_rtrmap_rec_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_rmap_rec *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_rmap_rec)); +} + +static inline struct xfs_rmap_key * +xfs_rtrmap_key_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_rmap_key *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); +} + +static inline struct xfs_rmap_key * +xfs_rtrmap_high_key_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_rmap_key *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + sizeof(struct xfs_rmap_key) + + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); +} + +static inline xfs_rtrmap_ptr_t * +xfs_rtrmap_ptr_addr( + struct xfs_btree_block *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrmap_ptr_t *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + maxrecs * 2 * sizeof(struct xfs_rmap_key) + + (index - 1) * sizeof(xfs_rtrmap_ptr_t)); +} + +unsigned int xfs_rtrmapbt_maxlevels_ondisk(void); + +int __init xfs_rtrmapbt_init_cur_cache(void); +void xfs_rtrmapbt_destroy_cur_cache(void); + +xfs_filblks_t xfs_rtrmapbt_calc_reserves(struct xfs_mount *mp); + +/* Addresses of key, pointers, and records within an ondisk rtrmapbt block. */ + +static inline struct xfs_rmap_rec * +xfs_rtrmap_droot_rec_addr( + struct xfs_rtrmap_root *block, + unsigned int index) +{ + return (struct xfs_rmap_rec *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_rmap_rec)); +} + +static inline struct xfs_rmap_key * +xfs_rtrmap_droot_key_addr( + struct xfs_rtrmap_root *block, + unsigned int index) +{ + return (struct xfs_rmap_key *) + ((char *)(block + 1) + + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); +} + +static inline xfs_rtrmap_ptr_t * +xfs_rtrmap_droot_ptr_addr( + struct xfs_rtrmap_root *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrmap_ptr_t *) + ((char *)(block + 1) + + maxrecs * 2 * sizeof(struct xfs_rmap_key) + + (index - 1) * sizeof(xfs_rtrmap_ptr_t)); +} + +/* + * Address of pointers within the incore btree root. + * + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +static inline xfs_rtrmap_ptr_t * +xfs_rtrmap_broot_ptr_addr( + struct xfs_mount *mp, + struct xfs_btree_block *bb, + unsigned int index, + unsigned int block_size) +{ + return xfs_rtrmap_ptr_addr(bb, index, + xfs_rtrmapbt_maxrecs(mp, block_size, false)); +} + +/* + * Compute the space required for the incore btree root containing the given + * number of records. + */ +static inline size_t +xfs_rtrmap_broot_space_calc( + struct xfs_mount *mp, + unsigned int level, + unsigned int nrecs) +{ + size_t sz = XFS_RTRMAP_BLOCK_LEN; + + if (level > 0) + return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) + + sizeof(xfs_rtrmap_ptr_t)); + return sz + nrecs * sizeof(struct xfs_rmap_rec); +} + +/* + * Compute the space required for the incore btree root given the ondisk + * btree root block. + */ +static inline size_t +xfs_rtrmap_broot_space(struct xfs_mount *mp, struct xfs_rtrmap_root *bb) +{ + return xfs_rtrmap_broot_space_calc(mp, be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +/* Compute the space required for the ondisk root block. 
*/ +static inline size_t +xfs_rtrmap_droot_space_calc( + unsigned int level, + unsigned int nrecs) +{ + size_t sz = sizeof(struct xfs_rtrmap_root); + + if (level > 0) + return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) + + sizeof(xfs_rtrmap_ptr_t)); + return sz + nrecs * sizeof(struct xfs_rmap_rec); +} + +/* + * Compute the space required for the ondisk root block given an incore root + * block. + */ +static inline size_t +xfs_rtrmap_droot_space(struct xfs_btree_block *bb) +{ + return xfs_rtrmap_droot_space_calc(be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +int xfs_iformat_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip); +void xfs_rtrmapbt_to_disk(struct xfs_mount *mp, struct xfs_btree_block *rblock, + unsigned int rblocklen, struct xfs_rtrmap_root *dblock, + unsigned int dblocklen); +void xfs_iflush_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip); + +int xfs_rtrmapbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); +int xfs_rtrmapbt_init_rtsb(struct xfs_mount *mp, struct xfs_rtgroup *rtg, + struct xfs_trans *tp); + +unsigned long long xfs_rtrmapbt_calc_size(struct xfs_mount *mp, + unsigned long long len); + +struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg, + struct xfs_trans *tp, struct xfbtree *xfbtree); +int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, + struct xfs_buftarg *btp, xfs_rgnumber_t rgno); + +xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg); + +#endif /* __XFS_RTRMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index d95409f3cba6..711e180f9ebb 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -27,6 +27,9 @@ #include "xfs_ag.h" #include "xfs_rtbitmap.h" #include "xfs_exchrange.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -180,6 +183,10 @@ xfs_sb_version_to_features( features |= XFS_FEAT_EXCHANGE_RANGE; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT) features |= XFS_FEAT_PARENT; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) + features |= XFS_FEAT_METADIR; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) + features |= XFS_FEAT_ZONED; return features; } @@ -232,11 +239,40 @@ xfs_validate_sb_read( return 0; } +/* Return the number of extents covered by a single rt bitmap file */ +static xfs_rtbxlen_t +xfs_extents_per_rbm( + struct xfs_sb *sbp) +{ + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) + return sbp->sb_rgextents; + return sbp->sb_rextents; +} + +/* + * Return the payload size of a single rt bitmap block (without the metadata + * header if any). 
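A worked example of how the payload helper above combines with xfs_expected_rbmblocks() below; the 48-byte size of struct xfs_rtbuf_blkinfo is an assumption, not stated in this hunk:

#include <stdio.h>

int main(void)
{
	/* Assumed geometry: 4096-byte blocks; struct xfs_rtbuf_blkinfo is
	 * taken to be 48 bytes here. */
	unsigned int blocksize = 4096, hdr = 48;
	unsigned long long rgextents = 1000000;

	/* Same arithmetic as xfs_expected_rbmblocks() on a metadir
	 * filesystem: one bit per rt extent, less the per-block header. */
	unsigned long long bits_per_block = 8ULL * (blocksize - hdr);
	unsigned long long rbmblocks =
		(rgextents + bits_per_block - 1) / bits_per_block;

	printf("%llu bitmap blocks per rtgroup\n", rbmblocks);	/* 31 */
	return 0;
}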
+ */ +static inline unsigned int +xfs_rtbmblock_size( + struct xfs_sb *sbp) +{ + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) + return sbp->sb_blocksize - sizeof(struct xfs_rtbuf_blkinfo); + return sbp->sb_blocksize; +} + static uint64_t -xfs_sb_calc_rbmblocks( +xfs_expected_rbmblocks( struct xfs_sb *sbp) { - return howmany_64(sbp->sb_rextents, NBBY * sbp->sb_blocksize); + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) + return 0; + return howmany_64(xfs_extents_per_rbm(sbp), + NBBY * xfs_rtbmblock_size(sbp)); } /* Validate the realtime geometry */ @@ -244,9 +280,15 @@ bool xfs_validate_rt_geometry( struct xfs_sb *sbp) { - if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || - sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) - return false; + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) { + if (sbp->sb_rextsize != 1) + return false; + } else { + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) + return false; + } if (sbp->sb_rblocks == 0) { if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || @@ -258,7 +300,7 @@ xfs_validate_rt_geometry( if (sbp->sb_rextents == 0 || sbp->sb_rextents != div_u64(sbp->sb_rblocks, sbp->sb_rextsize) || sbp->sb_rextslog != xfs_compute_rextslog(sbp->sb_rextents) || - sbp->sb_rbmblocks != xfs_sb_calc_rbmblocks(sbp)) + sbp->sb_rbmblocks != xfs_expected_rbmblocks(sbp)) return false; return true; @@ -297,13 +339,6 @@ xfs_validate_sb_write( * the kernel cannot support since we checked for unsupported bits in * the read verifier, which means that memory is corrupt. */ - if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { - xfs_warn(mp, -"Corruption detected in superblock compatible features (0x%x)!", - (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN)); - return -EFSCORRUPTED; - } - if (!xfs_is_readonly(mp) && xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, @@ -339,6 +374,106 @@ xfs_validate_sb_write( return 0; } +int +xfs_compute_rgblklog( + xfs_rtxlen_t rgextents, + xfs_rgblock_t rextsize) +{ + uint64_t rgblocks = (uint64_t)rgextents * rextsize; + + return xfs_highbit64(rgblocks - 1) + 1; +} + +static int +xfs_validate_sb_rtgroups( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + uint64_t groups; + int rgblklog; + + if (sbp->sb_rextsize == 0) { + xfs_warn(mp, +"Realtime extent size must not be zero."); + return -EINVAL; + } + + if (sbp->sb_rgextents > XFS_MAX_RGBLOCKS / sbp->sb_rextsize) { + xfs_warn(mp, +"Realtime group size (%u) must be less than %u rt extents.", + sbp->sb_rgextents, + XFS_MAX_RGBLOCKS / sbp->sb_rextsize); + return -EINVAL; + } + + if (sbp->sb_rgextents < XFS_MIN_RGEXTENTS) { + xfs_warn(mp, +"Realtime group size (%u) must be at least %u rt extents.", + sbp->sb_rgextents, XFS_MIN_RGEXTENTS); + return -EINVAL; + } + + if (sbp->sb_rgcount > XFS_MAX_RGNUMBER) { + xfs_warn(mp, +"Realtime groups (%u) must be less than %u.", + sbp->sb_rgcount, XFS_MAX_RGNUMBER); + return -EINVAL; + } + + groups = howmany_64(sbp->sb_rextents, sbp->sb_rgextents); + if (groups != sbp->sb_rgcount) { + xfs_warn(mp, +"Realtime groups (%u) do not cover the entire rt section; need (%llu) groups.", + sbp->sb_rgcount, groups); + return -EINVAL; + } + + /* Exchange-range is required for fsr to work on realtime files */ + if (!(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE)) { + xfs_warn(mp, +"Realtime 
groups feature requires exchange-range support."); + return -EINVAL; + } + + rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, sbp->sb_rextsize); + if (sbp->sb_rgblklog != rgblklog) { + xfs_warn(mp, +"Realtime group log (%d) does not match expected value (%d).", + sbp->sb_rgblklog, rgblklog); + return -EINVAL; + } + + return 0; +} + +static int +xfs_validate_sb_zoned( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + if (sbp->sb_frextents != 0) { + xfs_warn(mp, +"sb_frextents must be zero for zoned file systems."); + return -EINVAL; + } + + if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) { + xfs_warn(mp, +"sb_rtstart (%lld) overlaps sb_dblocks (%lld).", + sbp->sb_rtstart, sbp->sb_dblocks); + return -EINVAL; + } + + if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) { + xfs_warn(mp, +"sb_rtreserved (%lld) larger than sb_rblocks (%lld).", + sbp->sb_rtreserved, sbp->sb_rblocks); + return -EINVAL; + } + + return 0; +} + /* Check the validity of the SB. */ STATIC int xfs_validate_sb_common( @@ -350,6 +485,7 @@ xfs_validate_sb_common( uint32_t agcount = 0; uint32_t rem; bool has_dalign; + int error; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, @@ -398,6 +534,38 @@ xfs_validate_sb_common( sbp->sb_inoalignmt, align); return -EINVAL; } + + if (sbp->sb_spino_align && + (sbp->sb_spino_align > sbp->sb_inoalignmt || + (sbp->sb_inoalignmt % sbp->sb_spino_align) != 0)) { + xfs_warn(mp, +"Sparse inode alignment (%u) is invalid, must be integer factor of (%u).", + sbp->sb_spino_align, + sbp->sb_inoalignmt); + return -EINVAL; + } + } else if (sbp->sb_spino_align) { + xfs_warn(mp, + "Sparse inode alignment (%u) should be zero.", + sbp->sb_spino_align); + return -EINVAL; + } + + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { + if (memchr_inv(sbp->sb_pad, 0, sizeof(sbp->sb_pad))) { + xfs_warn(mp, +"Metadir superblock padding fields must be zero."); + return -EINVAL; + } + + error = xfs_validate_sb_rtgroups(mp, sbp); + if (error) + return error; + } + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + error = xfs_validate_sb_zoned(mp, sbp); + if (error) + return error; } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { @@ -566,6 +734,14 @@ xfs_validate_sb_common( void xfs_sb_quota_from_disk(struct xfs_sb *sbp) { + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { + sbp->sb_uquotino = NULLFSINO; + sbp->sb_gquotino = NULLFSINO; + sbp->sb_pquotino = NULLFSINO; + return; + } + /* * older mkfs doesn't initialize quota inodes to NULLFSINO. This * leads to in-core values having two different values for a quota @@ -689,6 +865,28 @@ __xfs_sb_from_disk( /* Convert on-disk flags to in-memory flags? 
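A standalone equivalent of xfs_compute_rgblklog() above, showing why the rounding matters; this is an illustrative reimplementation, not the kernel's xfs_highbit64() form:

#include <stdio.h>

/* Round the group size up to a power of two so rtgroup block numbers can
 * be split with shift and mask even when the group is not power-of-two
 * sized. */
int compute_rgblklog(unsigned long long rgextents, unsigned int rextsize)
{
	unsigned long long rgblocks = rgextents * rextsize;
	int log = 0;

	while ((1ULL << log) < rgblocks)	/* ceil(log2(rgblocks)) */
		log++;
	return log;
}

int main(void)
{
	/* 1M extents of 4 blocks each -> 4M blocks -> 22 bits */
	printf("rgblklog = %d\n", compute_rgblklog(1048576, 4));
	return 0;
}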
*/ if (convert_xquota) xfs_sb_quota_from_disk(to); + + if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { + to->sb_metadirino = be64_to_cpu(from->sb_metadirino); + to->sb_rgblklog = from->sb_rgblklog; + memcpy(to->sb_pad, from->sb_pad, sizeof(to->sb_pad)); + to->sb_rgcount = be32_to_cpu(from->sb_rgcount); + to->sb_rgextents = be32_to_cpu(from->sb_rgextents); + to->sb_rbmino = NULLFSINO; + to->sb_rsumino = NULLFSINO; + } else { + to->sb_metadirino = NULLFSINO; + to->sb_rgcount = 1; + to->sb_rgextents = 0; + } + + if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = be64_to_cpu(from->sb_rtstart); + to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved); + } else { + to->sb_rtstart = 0; + to->sb_rtreserved = 0; + } } void @@ -706,6 +904,15 @@ xfs_sb_quota_to_disk( { uint16_t qflags = from->sb_qflags; + if (xfs_sb_is_v5(from) && + (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { + to->sb_qflags = cpu_to_be16(from->sb_qflags); + to->sb_uquotino = cpu_to_be64(0); + to->sb_gquotino = cpu_to_be64(0); + to->sb_pquotino = cpu_to_be64(0); + return; + } + to->sb_uquotino = cpu_to_be64(from->sb_uquotino); /* @@ -836,6 +1043,21 @@ xfs_sb_to_disk( to->sb_lsn = cpu_to_be64(from->sb_lsn); if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); + + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { + to->sb_metadirino = cpu_to_be64(from->sb_metadirino); + to->sb_rgblklog = from->sb_rgblklog; + memset(to->sb_pad, 0, sizeof(to->sb_pad)); + to->sb_rgcount = cpu_to_be32(from->sb_rgcount); + to->sb_rgextents = cpu_to_be32(from->sb_rgextents); + to->sb_rbmino = cpu_to_be64(0); + to->sb_rsumino = cpu_to_be64(0); + } + + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = cpu_to_be64(from->sb_rtstart); + to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved); + } } /* @@ -965,13 +1187,47 @@ const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .verify_write = xfs_sb_write_verify, }; +/* Compute cached rt geometry from the incore sb. */ void -xfs_mount_sb_set_rextsize( +xfs_sb_mount_rextsize( struct xfs_mount *mp, struct xfs_sb *sbp) { + struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; + mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); + + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { + rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize; + rgs->blklog = mp->m_sb.sb_rgblklog; + rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog); + rgs->start_fsb = mp->m_sb.sb_rtstart; + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)) + rgs->has_daddr_gaps = true; + } else { + rgs->blocks = 0; + rgs->blklog = 0; + rgs->blkmask = (uint64_t)-1; + } +} + +/* Update incore sb rt extent size, then recompute the cached rt geometry. 
*/ +void +xfs_mount_sb_set_rextsize( + struct xfs_mount *mp, + struct xfs_sb *sbp, + xfs_agblock_t rextsize) +{ + sbp->sb_rextsize = rextsize; + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) + sbp->sb_rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, + rextsize); + + xfs_sb_mount_rextsize(mp, sbp); } /* @@ -988,6 +1244,8 @@ xfs_sb_mount_common( struct xfs_mount *mp, struct xfs_sb *sbp) { + struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG]; + mp->m_agfrotor = 0; atomic_set(&mp->m_agirotor, 0); mp->m_maxagi = mp->m_sb.sb_agcount; @@ -996,9 +1254,14 @@ xfs_sb_mount_common( mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; mp->m_blockmask = sbp->sb_blocksize - 1; - mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; - mp->m_blockwmask = mp->m_blockwsize - 1; - xfs_mount_sb_set_rextsize(mp, sbp); + mp->m_blockwsize = xfs_rtbmblock_size(sbp) >> XFS_WORDLOG; + mp->m_rtx_per_rbmblock = mp->m_blockwsize << XFS_NBWORDLOG; + + ags->blocks = mp->m_sb.sb_agblocks; + ags->blklog = mp->m_sb.sb_agblklog; + ags->blkmask = xfs_mask32lo(mp->m_sb.sb_agblklog); + + xfs_sb_mount_rextsize(mp, sbp); mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, false); @@ -1015,11 +1278,23 @@ xfs_sb_mount_common( mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; + mp->m_rtrmap_mxr[0] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_rtrmap_mxr[1] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, false); + mp->m_rtrmap_mnr[0] = mp->m_rtrmap_mxr[0] / 2; + mp->m_rtrmap_mnr[1] = mp->m_rtrmap_mxr[1] / 2; + mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; + mp->m_rtrefc_mxr[0] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, + true); + mp->m_rtrefc_mxr[1] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, + false); + mp->m_rtrefc_mnr[0] = mp->m_rtrefc_mxr[0] / 2; + mp->m_rtrefc_mnr[1] = mp->m_rtrefc_mxr[1] / 2; + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp); @@ -1045,19 +1320,24 @@ xfs_log_sb( * reservations that have been taken out percpu counters. If we have an * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. - * - * Do not update sb_frextents here because it is not part of the lazy - * sb counters, despite having a percpu counter. It is always kept - * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas() - * and hence we don't need have to update it here. */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount); mp->m_sb.sb_ifree = min_t(uint64_t, percpu_counter_sum_positive(&mp->m_ifree), mp->m_sb.sb_icount); - mp->m_sb.sb_fdblocks = - percpu_counter_sum_positive(&mp->m_fdblocks); + mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS); + } + + /* + * sb_frextents was added to the lazy sb counters when the rt groups + * feature was introduced. This counter can go negative due to the way + * we handle nearly-lockless reservations, so we must use the _positive + * variant here to avoid writing out nonsense frextents. 
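A sketch of what the cached blklog/blkmask pairs set up above buy: decomposing a global block number into a (group, block-in-group) pair with one shift and one mask. The types and names here are illustrative:

#include <stdint.h>

struct geo {
	uint8_t		blklog;		/* rounded-up log2 of group size */
	uint64_t	blkmask;	/* low blklog bits set */
};

/* Group number portion of a global block number. */
uint32_t bno_to_gno(const struct geo *g, uint64_t bno)
{
	return bno >> g->blklog;
}

/* Block offset within the group. */
uint32_t bno_to_gbno(const struct geo *g, uint64_t bno)
{
	return bno & g->blkmask;
}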
+ */ + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) { + mp->m_sb.sb_frextents = + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS); } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); @@ -1109,18 +1389,17 @@ int xfs_update_secondary_sbs( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno = 1; + struct xfs_perag *pag = NULL; int saved_error = 0; int error = 0; LIST_HEAD (buffer_list); /* update secondary superblocks. */ - for_each_perag_from(mp, agno, pag) { + while ((pag = xfs_perag_next_from(mp, pag, 1))) { struct xfs_buf *bp; error = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR), + XFS_AG_DADDR(mp, pag_agno(pag), XFS_SB_DADDR), XFS_FSS_TO_BB(mp, 1), &bp); /* * If we get an error reading or writing alternate superblocks, @@ -1132,7 +1411,7 @@ xfs_update_secondary_sbs( if (error) { xfs_warn(mp, "error allocating secondary superblock for ag %d", - pag->pag_agno); + pag_agno(pag)); if (!saved_error) saved_error = error; continue; @@ -1146,26 +1425,22 @@ xfs_update_secondary_sbs( xfs_buf_relse(bp); /* don't hold too many buffers at once */ - if (agno % 16) + if (pag_agno(pag) % 16) continue; error = xfs_buf_delwri_submit(&buffer_list); if (error) { xfs_warn(mp, "write error %d updating a secondary superblock near ag %d", - error, pag->pag_agno); + error, pag_agno(pag)); if (!saved_error) saved_error = error; continue; } } error = xfs_buf_delwri_submit(&buffer_list); - if (error) { - xfs_warn(mp, - "write error %d updating a secondary superblock near ag %d", - error, agno); - } - + if (error) + xfs_warn(mp, "error %d writing secondary superblocks", error); return saved_error ? saved_error : error; } @@ -1175,10 +1450,12 @@ xfs_update_secondary_sbs( */ int xfs_sync_sb_buf( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool update_rtsb) { struct xfs_trans *tp; struct xfs_buf *bp; + struct xfs_buf *rtsb_bp = NULL; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp); @@ -1188,6 +1465,11 @@ xfs_sync_sb_buf( bp = xfs_trans_getsb(tp); xfs_log_sb(tp); xfs_trans_bhold(tp, bp); + if (update_rtsb) { + rtsb_bp = xfs_log_rtsb(tp, bp); + if (rtsb_bp) + xfs_trans_bhold(tp, rtsb_bp); + } xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) @@ -1196,7 +1478,11 @@ xfs_sync_sb_buf( * write out the sb buffer to get the changes to disk */ error = xfs_bwrite(bp); + if (!error && rtsb_bp) + error = xfs_bwrite(rtsb_bp); out: + if (rtsb_bp) + xfs_buf_relse(rtsb_bp); xfs_buf_relse(bp); return error; } @@ -1283,6 +1569,10 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; if (xfs_has_exchange_range(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE; + if (xfs_has_metadir(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR; + if (xfs_has_zoned(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); @@ -1298,6 +1588,15 @@ xfs_fs_geometry( return; geo->version = XFS_FSOP_GEOM_VERSION_V5; + + if (xfs_has_rtgroups(mp)) { + geo->rgcount = sbp->sb_rgcount; + geo->rgextents = sbp->sb_rgextents; + } + if (xfs_has_zoned(mp)) { + geo->rtstart = sbp->sb_rtstart; + geo->rtreserved = sbp->sb_rtreserved; + } } /* Read a secondary superblock. 
*/ diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 885c83755991..34d0dd374e9b 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -15,10 +15,11 @@ struct xfs_perag; extern void xfs_log_sb(struct xfs_trans *tp); extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); -extern int xfs_sync_sb_buf(struct xfs_mount *mp); +extern int xfs_sync_sb_buf(struct xfs_mount *mp, bool update_rtsb); extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); +void xfs_sb_mount_rextsize(struct xfs_mount *mp, struct xfs_sb *sbp); void xfs_mount_sb_set_rextsize(struct xfs_mount *mp, - struct xfs_sb *sbp); + struct xfs_sb *sbp, xfs_agblock_t rextsize); extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); @@ -43,5 +44,6 @@ bool xfs_validate_stripe_geometry(struct xfs_mount *mp, bool xfs_validate_rt_geometry(struct xfs_sb *sbp); uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); +int xfs_compute_rgblklog(xfs_rtxlen_t rgextents, xfs_rgblock_t rextsize); #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 33b84a3a83ff..b1e0d9bc1f7d 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -38,7 +38,12 @@ extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; +extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops; +extern const struct xfs_buf_ops xfs_rtsummary_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; +extern const struct xfs_buf_ops xfs_rtsb_buf_ops; +extern const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops; +extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; @@ -52,6 +57,9 @@ extern const struct xfs_btree_ops xfs_bmbt_ops; extern const struct xfs_btree_ops xfs_refcountbt_ops; extern const struct xfs_btree_ops xfs_rmapbt_ops; extern const struct xfs_btree_ops xfs_rmapbt_mem_ops; +extern const struct xfs_btree_ops xfs_rtrmapbt_ops; +extern const struct xfs_btree_ops xfs_rtrmapbt_mem_ops; +extern const struct xfs_btree_ops xfs_rtrefcountbt_ops; static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops) { @@ -93,10 +101,26 @@ static inline bool xfs_btree_is_mem_rmap(const struct xfs_btree_ops *ops) { return ops == &xfs_rmapbt_mem_ops; } + +static inline bool xfs_btree_is_mem_rtrmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rtrmapbt_mem_ops; +} #else # define xfs_btree_is_mem_rmap(...) (false) +# define xfs_btree_is_mem_rtrmap(...) 
(false) #endif +static inline bool xfs_btree_is_rtrmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rtrmapbt_ops; +} + +static inline bool xfs_btree_is_rtrefcount(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rtrefcountbt_ops; +} + /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); @@ -157,6 +181,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_SB_RBLOCKS 0x00000800 #define XFS_TRANS_SB_REXTENTS 0x00001000 #define XFS_TRANS_SB_REXTSLOG 0x00002000 +#define XFS_TRANS_SB_RGCOUNT 0x00004000 /* * Here we centralize the specification of XFS meta-data buffer reference count diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index f228127a88ff..fb47a76ead18 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -92,8 +92,10 @@ xfs_symlink_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_dsymlink_hdr *dsl = bp->b_addr; + /* no verification of non-crc buffers */ if (!xfs_has_crc(mp)) - return __this_address; + return NULL; + if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 3c40f37e82c7..c962ad64b0c1 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -62,12 +62,12 @@ xfs_trans_ichgtime( ASSERT(tp); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - tv = current_time(inode); + /* If the mtime changes, then ctime must also change */ + ASSERT(flags & XFS_ICHGTIME_CHG); + tv = inode_set_ctime_current(inode); if (flags & XFS_ICHGTIME_MOD) inode_set_mtime_to_ts(inode, tv); - if (flags & XFS_ICHGTIME_CHG) - inode_set_ctime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_ACCESS) inode_set_atime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 1a7f95bcf069..86a111d0f2fc 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -22,6 +22,12 @@ #include "xfs_rtbitmap.h" #include "xfs_attr_item.h" #include "xfs_log.h" +#include "xfs_defer.h" +#include "xfs_bmap_item.h" +#include "xfs_extfree_item.h" +#include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_trace.h" #define _ALLOC true #define _FREE false @@ -92,6 +98,14 @@ xfs_refcountbt_block_count( return num_ops * (2 * mp->m_refc_maxlevels - 1); } +static unsigned int +xfs_rtrefcountbt_block_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + return num_ops * (2 * mp->m_rtrefc_maxlevels - 1); +} + /* * Logging inodes is really tricksy. They are logged in memory format, * which means that what we write into the log doesn't directly translate into @@ -213,7 +227,9 @@ xfs_calc_inode_chunk_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating a realtime extent. We have to be able to log as many rtbitmap * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime - * extents, as well as the realtime summary block. + * extents, as well as the realtime summary block (t1). Realtime rmap btree + * operations happen in a second transaction, so factor in a couple of rtrmapbt + * splits (t2). 
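As a back-of-envelope check of the reservation computed just below, assume 4096-byte blocks, a realtime extent size of one block, and, purely for illustration, a 9-level rtrmap btree; the bitmap term then dominates:

#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096;
	unsigned int max_bmbt_extlen = (1u << 21) - 1;	/* 21-bit length field */
	unsigned int rtxlen = max_bmbt_extlen;		/* rtextsize == 1 block */
	/* one bit per rt extent, rounded up to whole bitmap blocks */
	unsigned int rtbmp_blocks = rtxlen / 8 / blocksize + 1;
	unsigned int rtrmap_maxlevels = 9;		/* illustrative only */
	unsigned int num_ops = 2;

	unsigned int t1 = (rtbmp_blocks + 1) * num_ops;
	unsigned int t2 = num_ops * (2 * rtrmap_maxlevels - 1);

	/* prints t1=130 t2=34, so the bitmap term wins here */
	printf("t1=%u t2=%u reserve=%u blocks\n", t1, t2, t1 > t2 ? t1 : t2);
	return 0;
}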
*/ static unsigned int xfs_rtalloc_block_count( @@ -222,10 +238,16 @@ { unsigned int rtbmp_blocks; xfs_rtxlen_t rtxlen; + unsigned int t1, t2 = 0; rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN); - rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen); - return (rtbmp_blocks + 1) * num_ops; + rtbmp_blocks = xfs_rtbitmap_blockcount_len(mp, rtxlen); + t1 = (rtbmp_blocks + 1) * num_ops; + + if (xfs_has_rmapbt(mp)) + t2 = num_ops * (2 * mp->m_rtrmap_maxlevels - 1); + + return max(t1, t2); } /* @@ -248,26 +270,64 @@ */ /* + * Finishing data device refcount updates (t1): + * the agfs of the ags containing the blocks: nr_ops * sector size + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_cui_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + if (!xfs_has_reflink(mp)) + return 0; + + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), + mp->m_sb.sb_blocksize); +} + +/* + * Realtime refcount updates (t2): + * the rt refcount inode + * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_rt_cui_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + if (!xfs_has_rtreflink(mp)) + return 0; + + return xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops), + mp->m_sb.sb_blocksize); +} + +/* * Compute the log reservation required to handle the refcount update * transaction. Refcount updates are always done via deferred log items. * - * This is calculated as: + * This is calculated as the max of: * Data device refcount updates (t1): * the agfs of the ags containing the blocks: nr_ops * sector size * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + * Realtime refcount updates (t2): + * the rt refcount inode + * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size */ static unsigned int xfs_calc_refcountbt_reservation( struct xfs_mount *mp, unsigned int nr_ops) { - unsigned int blksz = XFS_FSB_TO_B(mp, 1); + unsigned int t1, t2; - if (!xfs_has_reflink(mp)) - return 0; + t1 = xfs_calc_finish_cui_reservation(mp, nr_ops); + t2 = xfs_calc_finish_rt_cui_reservation(mp, nr_ops); - return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); + return max(t1, t2); } /* @@ -353,6 +413,96 @@ xfs_calc_write_reservation_minlogsize( } /* + * Finishing an EFI can free the blocks and bmap blocks (t2): + * the agf for each of the ags: nr * sector size + * the agfl for each of the ags: nr * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming nr extents: + * nr exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Or, if it's a realtime file (t3): + * the agf for each of the ags: 2 * sector size + * the agfl for each of the ags: 2 * sector size + * the super block to reflect the freed blocks: sector size + * the realtime bitmap: + * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 2 exts * 1 block + * worst case split in allocation btrees 
per extent assuming 2 extents: + * 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_rt_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_realtime(mp)) + return 0; + + return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, nr), + mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Finishing an RUI is the same as an EFI. We can split the rmap btree twice + * on each end of the record, and that can cause the AGFL to be refilled or + * emptied out. + */ +inline unsigned int +xfs_calc_finish_rui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_rmapbt(mp)) + return 0; + return xfs_calc_finish_efi_reservation(mp, nr); +} + +/* + * Finishing an RT RUI is the same as an RT EFI. We can split the rmap btree + * twice on each end of the record, and that can cause the AGFL to be refilled + * or emptied out. + */ +inline unsigned int +xfs_calc_finish_rt_rui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_rtrmapbt(mp)) + return 0; + return xfs_calc_finish_rt_efi_reservation(mp, nr); +} + +/* + * In finishing a BUI, we can modify: + * the inode being truncated: inode size + * dquots + * the inode's bmap btree: (max depth + 1) * block size + */ +inline unsigned int +xfs_calc_finish_bui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + mp->m_sb.sb_blocksize); +} + +/* * In truncating a file we free up to two extents at once. We can modify (t1): * the inode being truncated: inode size * the inode's bmap btree: (max depth + 1) * block size @@ -384,16 +534,8 @@ xfs_calc_itruncate_reservation( t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); - t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz); - - if (xfs_has_realtime(mp)) { - t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); - } else { - t3 = 0; - } + t2 = xfs_calc_finish_efi_reservation(mp, 4); + t3 = xfs_calc_finish_rt_efi_reservation(mp, 2); /* * In the early days of reflink, we included enough reservation to log @@ -474,9 +616,7 @@ xfs_calc_rename_reservation( xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); - t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), - XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 3); if (xfs_has_parent(mp)) { unsigned int rename_overhead, exchange_overhead; @@ -584,9 +724,7 @@ xfs_calc_link_reservation( overhead += xfs_calc_iunlink_remove_reservation(mp); t1 = xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); - t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), - XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 1); if (xfs_has_parent(mp)) { t3 = resp->tr_attrsetm.tr_logres; @@ -649,9 +787,7 @@ xfs_calc_remove_reservation( t1 = xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); - t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), - XFS_FSB_TO_B(mp, 1)); + t2 = 
xfs_calc_finish_efi_reservation(mp, 2); if (xfs_has_parent(mp)) { t3 = resp->tr_attrrm.tr_logres; @@ -1154,6 +1290,15 @@ xfs_calc_namespace_reservations( resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; } +STATIC void +xfs_calc_default_atomic_ioend_reservation( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* Pick a default that will scale reasonably for the log size. */ + resp->tr_atomic_ioend = resp->tr_itruncate; +} + void xfs_trans_resv_calc( struct xfs_mount *mp, @@ -1248,4 +1393,167 @@ xfs_trans_resv_calc( resp->tr_itruncate.tr_logcount += logcount_adj; resp->tr_write.tr_logcount += logcount_adj; resp->tr_qm_dqalloc.tr_logcount += logcount_adj; + + /* + * Now that we've finished computing the static reservations, we can + * compute the dynamic reservation for atomic writes. + */ + xfs_calc_default_atomic_ioend_reservation(mp, resp); +} + +/* + * Return the per-extent and fixed transaction reservation sizes needed to + * complete an atomic write. + */ +STATIC unsigned int +xfs_calc_atomic_write_ioend_geometry( + struct xfs_mount *mp, + unsigned int *step_size) +{ + const unsigned int efi = xfs_efi_log_space(1); + const unsigned int efd = xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1); + const unsigned int rud = xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1); + const unsigned int cud = xfs_cud_log_space(); + const unsigned int bui = xfs_bui_log_space(1); + const unsigned int bud = xfs_bud_log_space(); + + /* + * Maximum overhead to complete an atomic write ioend in software: + * remove data fork extent + remove cow fork extent + map extent into + * data fork. + * + * tx0: Creates a BUI and a CUI and that's all it needs. + * + * tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and + * enough space to relog the CUI (== CUI + CUD). + * + * tx2: Roll again to finish the RUI. Need space for the RUD and space + * to relog the CUI. + * + * tx3: Roll again, need space for the CUD and possibly a new EFI. + * + * tx4: Roll again, need space for an EFD. + * + * If the extent referenced by the pair of BUI/CUI items is not the one + * being currently processed, then we need to reserve space to relog + * both items. + */ + const unsigned int tx0 = bui + cui; + const unsigned int tx1 = bud + rui + cui + cud; + const unsigned int tx2 = rud + cui + cud; + const unsigned int tx3 = cud + efi; + const unsigned int tx4 = efd; + const unsigned int relog = bui + bud + cui + cud; + + const unsigned int per_intent = max(max3(tx0, tx1, tx2), + max3(tx3, tx4, relog)); + + /* Overhead to finish one step of each intent item type */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1); + const unsigned int f4 = xfs_calc_finish_bui_reservation(mp, 1); + + /* We only finish one item per transaction in a chain */ + *step_size = max(f4, max3(f1, f2, f3)); + + return per_intent; +} + +/* + * Compute the maximum size (in fsblocks) of atomic writes that we can complete + * given the existing log reservations. 
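In other words, one transaction's worth of reservation is spent as a fixed step cost plus a per-extent intent footprint, so the largest completable atomic write falls out of a simple division. A sketch with made-up byte counts (none of these numbers come from the patch):

#include <stdio.h>

int main(void)
{
	unsigned int tr_logres = 300000;	/* hypothetical tr_atomic_ioend */
	unsigned int per_intent = 400;		/* hypothetical intent-chain cost */
	unsigned int step_size = 120000;	/* hypothetical single-step cost */
	unsigned int max_blocks = 0;

	/* whatever remains after one completion step funds the intents */
	if (tr_logres >= step_size)
		max_blocks = (tr_logres - step_size) / per_intent;

	printf("max atomic write: %u fsblocks\n", max_blocks);
	return 0;
}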
+ */ +xfs_extlen_t +xfs_calc_max_atomic_write_fsblocks( + struct xfs_mount *mp) +{ + const struct xfs_trans_res *resv = &M_RES(mp)->tr_atomic_ioend; + unsigned int per_intent = 0; + unsigned int step_size = 0; + unsigned int ret = 0; + + if (resv->tr_logres > 0) { + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, + &step_size); + + if (resv->tr_logres >= step_size) + ret = (resv->tr_logres - step_size) / per_intent; + } + + trace_xfs_calc_max_atomic_write_fsblocks(mp, per_intent, step_size, + resv->tr_logres, ret); + + return ret; +} + +/* + * Compute the log blocks and transaction reservation needed to complete an + * atomic write of a given number of blocks. Worst case, each block requires + * separate handling. A return value of 0 means something went wrong. + */ +xfs_extlen_t +xfs_calc_atomic_write_log_geometry( + struct xfs_mount *mp, + xfs_extlen_t blockcount, + unsigned int *new_logres) +{ + struct xfs_trans_res *curr_res = &M_RES(mp)->tr_atomic_ioend; + uint old_logres = curr_res->tr_logres; + unsigned int per_intent, step_size; + unsigned int logres; + xfs_extlen_t min_logblocks; + + ASSERT(blockcount > 0); + + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size); + + /* Check for overflows */ + if (check_mul_overflow(blockcount, per_intent, &logres) || + check_add_overflow(logres, step_size, &logres)) + return 0; + + curr_res->tr_logres = logres; + min_logblocks = xfs_log_calc_minimum_size(mp); + curr_res->tr_logres = old_logres; + + trace_xfs_calc_max_atomic_write_log_geometry(mp, per_intent, step_size, + blockcount, min_logblocks, logres); + + *new_logres = logres; + return min_logblocks; +} + +/* + * Compute the transaction reservation needed to complete an out of place + * atomic write of a given number of blocks. + */ +int +xfs_calc_atomic_write_reservation( + struct xfs_mount *mp, + xfs_extlen_t blockcount) +{ + unsigned int new_logres; + xfs_extlen_t min_logblocks; + + /* + * If the caller doesn't ask for a specific atomic write size, then + * use the defaults. 
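The overflow guards below are the standard linux/overflow.h helpers, which wrap the compiler builtins, so the same bounds check can be modelled in plain C (userspace sketch, hypothetical numbers):

#include <stdbool.h>
#include <stdio.h>

static bool calc_logres(unsigned int blockcount, unsigned int per_intent,
			unsigned int step_size, unsigned int *logres)
{
	unsigned int tmp;

	/* blockcount * per_intent + step_size, refusing to wrap around */
	if (__builtin_mul_overflow(blockcount, per_intent, &tmp))
		return false;
	if (__builtin_add_overflow(tmp, step_size, logres))
		return false;
	return true;
}

int main(void)
{
	unsigned int logres;

	if (calc_logres(1024, 400, 120000, &logres))
		printf("logres=%u\n", logres);
	else
		printf("overflow, reject the request\n");
	return 0;
}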
+ */ + if (blockcount == 0) { + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + return 0; + } + + min_logblocks = xfs_calc_atomic_write_log_geometry(mp, blockcount, + &new_logres); + if (!min_logblocks || min_logblocks > mp->m_sb.sb_logblocks) + return -EINVAL; + + M_RES(mp)->tr_atomic_ioend.tr_logres = new_logres; + return 0; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 0554b9d775d2..336279e0fc61 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -48,6 +48,7 @@ struct xfs_trans_resv { struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ struct xfs_trans_res tr_sb; /* modify superblock */ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ + struct xfs_trans_res tr_atomic_ioend; /* untorn write completion */ }; /* shorthand way of accessing reservation structure */ @@ -98,8 +99,32 @@ struct xfs_trans_resv { void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops); +unsigned int xfs_calc_finish_bui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_rui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_rui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_cui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_cui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp, + xfs_extlen_t blockcount, unsigned int *new_logres); +int xfs_calc_atomic_write_reservation(struct xfs_mount *mp, + xfs_extlen_t blockcount); + #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 1155ff2d37e2..d89b570aafcc 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -14,6 +14,19 @@ #define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp) \ (((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0])) +/* Worst case number of realtime rmaps that can be held in a block. */ +#define XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) \ + (((mp)->m_rtrmap_mxr[0]) - ((mp)->m_rtrmap_mnr[0])) + +/* Adding one realtime rmap could split every level to the top of the tree. */ +#define XFS_RTRMAPADD_SPACE_RES(mp) ((mp)->m_rtrmap_maxlevels) + +/* Blocks we might need to add "b" realtime rmaps to a tree. */ +#define XFS_NRTRMAPADD_SPACE_RES(mp, b) \ + ((((b) + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * \ + XFS_RTRMAPADD_SPACE_RES(mp)) + /* Worst case number of rmaps that can be held in a block. 
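The XFS_NRTRMAPADD_SPACE_RES() macro above is a ceiling division followed by a multiply. A worked example with illustrative values (the real per-block record count and tree height depend on the mount geometry):

#include <stdio.h>

int main(void)
{
	unsigned int per_block = 60;	/* mxr[0] - mnr[0], illustrative */
	unsigned int maxlevels = 9;	/* m_rtrmap_maxlevels, illustrative */
	unsigned int b = 100;		/* rmaps being added */

	/* ceil(100 / 60) full-height splits = 2 * 9 = 18 blocks reserved */
	unsigned int res = ((b + per_block - 1) / per_block) * maxlevels;
	printf("space reservation: %u blocks\n", res);
	return 0;
}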
*/ #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index c299b16c9365..1faf04204c5d 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -12,6 +12,8 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" /* @@ -111,7 +113,7 @@ xfs_verify_ino( /* Is this an internal inode number? */ inline bool -xfs_internal_inum( +xfs_is_sb_inum( struct xfs_mount *mp, xfs_ino_t ino) { @@ -129,24 +131,42 @@ xfs_verify_dir_ino( struct xfs_mount *mp, xfs_ino_t ino) { - if (xfs_internal_inum(mp, ino)) + if (xfs_is_sb_inum(mp, ino)) return false; return xfs_verify_ino(mp, ino); } /* - * Verify that an realtime block number pointer doesn't point off the - * end of the realtime device. + * Verify that a realtime block number pointer neither points outside the + * allocatable areas of the rtgroup nor off the end of the realtime + * device. */ inline bool xfs_verify_rtbno( struct xfs_mount *mp, xfs_rtblock_t rtbno) { + if (xfs_has_rtgroups(mp)) { + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); + xfs_rtxnum_t rtx = xfs_rtb_to_rtx(mp, rtbno); + + if (rgno >= mp->m_sb.sb_rgcount) + return false; + if (rtx >= xfs_rtgroup_extents(mp, rgno)) + return false; + if (xfs_has_rtsb(mp) && rgno == 0 && rtx == 0) + return false; + return true; + } + return rtbno < mp->m_sb.sb_rblocks; } -/* Verify that a realtime device extent is fully contained inside the volume. */ +/* + * Verify that an allocated realtime device extent neither points outside + * allocatable areas of the rtgroup, across an rtgroup boundary, nor off the + * end of the realtime device. + */ bool xfs_verify_rtbext( struct xfs_mount *mp, @@ -159,7 +179,14 @@ xfs_verify_rtbext( if (!xfs_verify_rtbno(mp, rtbno)) return false; - return xfs_verify_rtbno(mp, rtbno + len - 1); + if (!xfs_verify_rtbno(mp, rtbno + len - 1)) + return false; + + if (xfs_has_rtgroups(mp) && + xfs_rtb_to_rgno(mp, rtbno) != xfs_rtb_to_rgno(mp, rtbno + len - 1)) + return false; + + return true; } /* Calculate the range of valid icount values. */ @@ -170,13 +197,12 @@ xfs_icount_range( unsigned long long *max) { unsigned long long nr_inos = 0; - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; /* root, rtbitmap, rtsum all live in the first chunk */ *min = XFS_INODES_PER_CHUNK; - for_each_perag(mp, agno, pag) + while ((pag = xfs_perag_next(mp, pag))) nr_inos += pag->agino_max - pag->agino_min + 1; *max = nr_inos; } diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index a8cd44d03ef6..f6f4f2d4b5db 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -9,10 +9,12 @@ typedef uint32_t prid_t; /* project ID */ typedef uint32_t xfs_agblock_t; /* blockno in alloc. 
group */ +typedef uint32_t xfs_rgblock_t; /* blockno in realtime group */ typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef uint32_t xfs_extlen_t; /* extent length in blocks */ typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */ typedef uint32_t xfs_agnumber_t; /* allocation group number */ +typedef uint32_t xfs_rgnumber_t; /* realtime group number */ typedef uint64_t xfs_extnum_t; /* # of extents in a file */ typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ typedef int64_t xfs_fsize_t; /* bytes in a file */ @@ -53,7 +55,9 @@ typedef void * xfs_failaddr_t; #define NULLFILEOFF ((xfs_fileoff_t)-1) #define NULLAGBLOCK ((xfs_agblock_t)-1) +#define NULLRGBLOCK ((xfs_rgblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) +#define NULLRGNUMBER ((xfs_rgnumber_t)-1) #define NULLCOMMITLSN ((xfs_lsn_t)-1) @@ -198,6 +202,13 @@ enum xfs_ag_resv_type { * altering fdblocks. If you think you need this you're wrong. */ XFS_AG_RESV_IGNORE, + + /* + * This allocation activity is being done on behalf of a metadata file. + * These files maintain their own permanent space reservations and are + * required to adjust fdblocks using the xfs_metafile_resv_* helpers. + */ + XFS_AG_RESV_METAFILE, }; /* Results of scanning a btree keyspace to check occupancy. */ @@ -212,6 +223,44 @@ enum xbtree_recpacking { XBTREE_RECPACKING_FULL, }; +enum xfs_group_type { + XG_TYPE_AG, + XG_TYPE_RTG, + XG_TYPE_MAX, +} __packed; + +#define XG_TYPE_STRINGS \ + { XG_TYPE_AG, "ag" }, \ + { XG_TYPE_RTG, "rtg" } + +enum xfs_free_counter { + /* + * Number of free blocks on the data device. + */ + XC_FREE_BLOCKS, + + /* + * Number of free RT extents on the RT device. + */ + XC_FREE_RTEXTENTS, + + /* + * Number of RT extents available for use. + * + * This counter only exists for zoned RT devices and indicates the number + * of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS + * also includes blocks that have been written previously and freed, but + * sit in an rtgroup that still needs a zone reset. + */ + XC_FREE_RTAVAILABLE, + XC_FREE_NR, +}; + +#define XFS_FREECOUNTER_STR \ + { XC_FREE_BLOCKS, "blocks" }, \ + { XC_FREE_RTEXTENTS, "rtextents" }, \ + { XC_FREE_RTAVAILABLE, "rtavailable" } + /* * Type verifier functions */ @@ -222,7 +271,7 @@ bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno, xfs_fsblock_t len); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); -bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); +bool xfs_is_sb_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); bool xfs_verify_rtbext(struct xfs_mount *mp, xfs_rtblock_t rtbno, diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c new file mode 100644 index 000000000000..b0791a71931c --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 
+ */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtgroup.h" +#include "xfs_zones.h" + +static bool +xfs_zone_validate_empty( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > 0) { + xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = 0; + return true; +} + +static bool +xfs_zone_validate_wp( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) { + xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.", + rtg_rgno(rtg), wp_fsb); + return false; + } + + *write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb); + if (*write_pointer >= rtg->rtg_extents) { + xfs_warn(mp, "zone %u has invalid write pointer (0x%x).", + rtg_rgno(rtg), *write_pointer); + return false; + } + + return true; +} + +static bool +xfs_zone_validate_full( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = rtg->rtg_extents; + return true; +} + +static bool +xfs_zone_validate_seq( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + return xfs_zone_validate_empty(zone, rtg, write_pointer); + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return xfs_zone_validate_wp(zone, rtg, write_pointer); + case BLK_ZONE_COND_FULL: + return xfs_zone_validate_full(zone, rtg, write_pointer); + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + default: + xfs_warn(mp, "zone %u has unknown zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +static bool +xfs_zone_validate_conv( + struct blk_zone *zone, + struct xfs_rtgroup *rtg) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_NOT_WP: + return true; + default: + xfs_warn(mp, +"conventional zone %u has unsupported zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +bool +xfs_zone_validate( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + uint32_t expected_size; + + /* + * Check that the zone capacity matches the rtgroup size stored in the + * superblock. Note that all zones including the last one must have a + * uniform capacity. 
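A small userspace model of the geometry being validated here: a zone owns len blocks of address space but only capacity of them are usable, and with device-address gaps the group stride is padded to a power of two so that group number and offset fall out of a shift and a mask (all numbers illustrative):

#include <stdio.h>

struct zone_geom {
	unsigned long long len;		/* address space, in fs blocks */
	unsigned long long capacity;	/* usable blocks */
};

int main(void)
{
	struct zone_geom z = { .len = 65536, .capacity = 49152 };
	unsigned int blklog = 16;			/* log2 of stride */
	unsigned long long rtbno = 3 * z.len + 100;	/* block in group 3 */
	unsigned long long mask = (1ull << blklog) - 1;

	/* group 3, offset 100, which sits inside the usable capacity */
	printf("group %llu offset %llu usable %s\n",
			rtbno >> blklog, rtbno & mask,
			(rtbno & mask) < z.capacity ? "yes" : "no (in the gap)");
	return 0;
}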
+ */ + if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) { + xfs_warn(mp, +"zone %u capacity (0x%llx) does not match RT group size (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity), + g->blocks); + return false; + } + + if (g->has_daddr_gaps) { + expected_size = 1 << g->blklog; + } else { + if (zone->len != zone->capacity) { + xfs_warn(mp, +"zone %u has capacity != size (0x%llx vs 0x%llx).", + rtg_rgno(rtg), + XFS_BB_TO_FSB(mp, zone->len), + XFS_BB_TO_FSB(mp, zone->capacity)); + return false; + } + expected_size = g->blocks; + } + + if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) { + xfs_warn(mp, +"zone %u length (0x%llx) does not match geometry (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len), + expected_size); + } + + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + return xfs_zone_validate_conv(zone, rtg); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + return xfs_zone_validate_seq(zone, rtg, write_pointer); + default: + xfs_warn(mp, "zone %u has unsupported type 0x%x.", + rtg_rgno(rtg), zone->type); + return false; + } +} diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h new file mode 100644 index 000000000000..c4f1367b2cca --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LIBXFS_ZONES_H +#define _LIBXFS_ZONES_H + +struct xfs_rtgroup; + +/* + * In order to guarantee forward progress for GC we need to reserve at least + * two zones: one that will be used for moving data into and one spare zone + * making sure that we have enough space to relocate a nearly-full zone. + * To allow for slightly sloppy accounting for when we need to reserve the + * second zone, we actually reserve three as that is easier than doing fully + * accurate bookkeeping. + */ +#define XFS_GC_ZONES 3U + +/* + * In addition we need two zones for user writes, one open zone for writing + * and one to still have available blocks without resetting the open zone + * when data in the open zone has been freed. + */ +#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1) +#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1) + +/* + * Always keep one zone out of the general open zone pool to allow for GC to + * happen while other writers are waiting for free space. + */ +#define XFS_OPEN_GC_ZONES 1U +#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U) + +bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer); + +#endif /* _LIBXFS_ZONES_H */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index f8e5b67128d2..303374df44bd 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -60,6 +60,34 @@ xchk_superblock_xref( } /* + * Calculate the ondisk superblock size in bytes given the feature set of the + * mounted filesystem (aka the primary sb). This is subtly different from + * the logic in xfs_repair, which computes the size of a secondary sb given the + * featureset listed in the secondary sb. 
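The trailing-bytes check that follows leans on offsetofend() to size the superblock by the newest field the feature set permits, then insists every byte past that is zero. A self-contained model of the pattern (the structure and field names here are invented for illustration):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define offsetofend(T, m) (offsetof(T, m) + sizeof(((T *)0)->m))

struct dsb_model {
	unsigned long long magic;
	unsigned long long newest_field;	/* last field this feature set uses */
	unsigned char pad[48];			/* must stay zeroed on disk */
};

/* Return 1 if all trailing bytes are zero, like a memchr_inv() == NULL. */
static int tail_is_zero(const void *buf, size_t used, size_t total)
{
	static const unsigned char zeroes[64];
	return memcmp((const char *)buf + used, zeroes, total - used) == 0;
}

int main(void)
{
	struct dsb_model sb = { .magic = 0x5846534221ULL };
	size_t used = offsetofend(struct dsb_model, newest_field);

	printf("ondisk size %zu, tail clean: %d\n", used,
			tail_is_zero(&sb, used, sizeof(sb)));
	return 0;
}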
+ */ +STATIC size_t +xchk_superblock_ondisk_size( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + return offsetofend(struct xfs_dsb, sb_rtreserved); + if (xfs_has_metadir(mp)) + return offsetofend(struct xfs_dsb, sb_pad); + if (xfs_has_metauuid(mp)) + return offsetofend(struct xfs_dsb, sb_meta_uuid); + if (xfs_has_crc(mp)) + return offsetofend(struct xfs_dsb, sb_lsn); + if (xfs_sb_version_hasmorebits(&mp->m_sb)) + return offsetofend(struct xfs_dsb, sb_bad_features2); + if (xfs_has_logv2(mp)) + return offsetofend(struct xfs_dsb, sb_logsunit); + if (xfs_has_sector(mp)) + return offsetofend(struct xfs_dsb, sb_logsectsize); + /* only support dirv2 or more recent */ + return offsetofend(struct xfs_dsb, sb_dirblklog); +} + +/* * Scrub the filesystem superblock. * * Note: We do /not/ attempt to check AG 0's superblock. Mount is @@ -75,6 +103,7 @@ xchk_superblock( struct xfs_buf *bp; struct xfs_dsb *sb; struct xfs_perag *pag; + size_t sblen; xfs_agnumber_t agno; uint32_t v2_ok; __be32 features_mask; @@ -144,11 +173,19 @@ xchk_superblock( if (sb->sb_rootino != cpu_to_be64(mp->m_sb.sb_rootino)) xchk_block_set_preen(sc, bp); - if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino)) - xchk_block_set_preen(sc, bp); + if (xfs_has_metadir(sc->mp)) { + if (sb->sb_rbmino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); - if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino)) - xchk_block_set_preen(sc, bp); + if (sb->sb_rsumino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); + } else { + if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino)) + xchk_block_set_preen(sc, bp); + } if (sb->sb_rextsize != cpu_to_be32(mp->m_sb.sb_rextsize)) xchk_block_set_corrupt(sc, bp); @@ -224,11 +261,19 @@ xchk_superblock( * sb_icount, sb_ifree, sb_fdblocks, sb_frexents */ - if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino)) - xchk_block_set_preen(sc, bp); + if (xfs_has_metadir(mp)) { + if (sb->sb_uquotino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); - if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino)) - xchk_block_set_preen(sc, bp); + if (sb->sb_gquotino != cpu_to_be64(0)) + xchk_block_set_preen(sc, bp); + } else { + if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino)) + xchk_block_set_preen(sc, bp); + } /* * Skip the quota flags since repair will force quotacheck. 
@@ -337,8 +382,13 @@ xchk_superblock( if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align)) xchk_block_set_corrupt(sc, bp); - if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino)) - xchk_block_set_preen(sc, bp); + if (xfs_has_metadir(mp)) { + if (sb->sb_pquotino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); + } else { + if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino)) + xchk_block_set_preen(sc, bp); + } /* Don't care about sb_lsn */ } @@ -349,9 +399,26 @@ xchk_superblock( xchk_block_set_corrupt(sc, bp); } + if (xfs_has_metadir(mp)) { + if (sb->sb_metadirino != cpu_to_be64(mp->m_sb.sb_metadirino)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_rgcount != cpu_to_be32(mp->m_sb.sb_rgcount)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rgextents != cpu_to_be32(mp->m_sb.sb_rgextents)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rgblklog != mp->m_sb.sb_rgblklog) + xchk_block_set_corrupt(sc, bp); + + if (memchr_inv(sb->sb_pad, 0, sizeof(sb->sb_pad))) + xchk_block_set_corrupt(sc, bp); + } + /* Everything else must be zero. */ - if (memchr_inv(sb + 1, 0, - BBTOB(bp->b_length) - sizeof(struct xfs_dsb))) + sblen = xchk_superblock_ondisk_size(mp); + if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen)) xchk_block_set_corrupt(sc, bp); xchk_superblock_xref(sc, bp); @@ -434,7 +501,7 @@ xchk_agf_xref_btreeblks( { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_mount *mp = sc->mp; - xfs_agblock_t blocks; + xfs_filblks_t blocks; xfs_agblock_t btreeblks; int error; @@ -483,7 +550,7 @@ xchk_agf_xref_refcblks( struct xfs_scrub *sc) { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; if (!sc->sa.refc_cur) @@ -552,7 +619,7 @@ xchk_agf( /* Check the AG length */ eoag = be32_to_cpu(agf->agf_length); - if (eoag != pag->block_count) + if (eoag != pag_group(pag)->xg_block_count) xchk_block_set_corrupt(sc, sc->sa.agf_bp); /* Check the AGF btree roots and levels */ @@ -816,7 +883,7 @@ xchk_agi_xref_fiblocks( struct xfs_scrub *sc) { struct xfs_agi *agi = sc->sa.agi_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error = 0; if (!xfs_has_inobtcounts(sc->mp)) @@ -932,7 +999,7 @@ xchk_agi( /* Check the AG length */ eoag = be32_to_cpu(agi->agi_length); - if (eoag != pag->block_count) + if (eoag != pag_group(pag)->xg_block_count) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check btree roots and levels */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 2f98d90d7fd6..cd6f0223879f 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -208,8 +208,8 @@ xrep_agf_init_header( memset(agf, 0, BBTOB(agf_bp->b_length)); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); - agf->agf_seqno = cpu_to_be32(pag->pag_agno); - agf->agf_length = cpu_to_be32(pag->block_count); + agf->agf_seqno = cpu_to_be32(pag_agno(pag)); + agf->agf_length = cpu_to_be32(pag_group(pag)->xg_block_count); agf->agf_flfirst = old_agf->agf_flfirst; agf->agf_fllast = old_agf->agf_fllast; agf->agf_flcount = old_agf->agf_flcount; @@ -256,7 +256,7 @@ xrep_agf_calc_from_btrees( struct xfs_agf *agf = agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t btreeblks; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; /* Update the AGF counters from the bnobt. */ @@ -384,7 +384,7 @@ xrep_agf( * was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED. 
*/ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AG_DADDR(mp, pag_agno(sc->sa.pag), XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL); if (error) @@ -647,7 +647,7 @@ xrep_agfl_fill( xfs_agblock_t agbno = start; int error; - trace_xrep_agfl_insert(sc->sa.pag, agbno, len); + trace_xrep_agfl_insert(pag_group(sc->sa.pag), agbno, len); while (agbno < start + len && af->fl_off < af->flcount) af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++); @@ -687,7 +687,7 @@ xrep_agfl_init_header( agfl = XFS_BUF_TO_AGFL(agfl_bp); memset(agfl, 0xFF, BBTOB(agfl_bp->b_length)); agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); - agfl->agfl_seqno = cpu_to_be32(sc->sa.pag->pag_agno); + agfl->agfl_seqno = cpu_to_be32(pag_agno(sc->sa.pag)); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); /* @@ -741,7 +741,7 @@ xrep_agfl( * was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED. */ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AG_DADDR(mp, pag_agno(sc->sa.pag), XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL); if (error) @@ -897,8 +897,8 @@ xrep_agi_init_header( memset(agi, 0, BBTOB(agi_bp->b_length)); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); - agi->agi_seqno = cpu_to_be32(pag->pag_agno); - agi->agi_length = cpu_to_be32(pag->block_count); + agi->agi_seqno = cpu_to_be32(pag_agno(pag)); + agi->agi_length = cpu_to_be32(pag_group(pag)->xg_block_count); agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_has_crc(mp)) @@ -946,7 +946,7 @@ xrep_agi_calc_from_btrees( if (error) goto err; if (xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; error = xfs_btree_count_blocks(cur, &blocks); if (error) @@ -959,7 +959,7 @@ xrep_agi_calc_from_btrees( agi->agi_freecount = cpu_to_be32(freecount); if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_btree_count_blocks(cur, &blocks); @@ -1038,12 +1038,10 @@ xrep_iunlink_reload_next( { struct xfs_scrub *sc = ragi->sc; struct xfs_inode *ip; - xfs_ino_t ino; xfs_agino_t ret = NULLAGINO; int error; - ino = XFS_AGINO_TO_INO(sc->mp, sc->sa.pag->pag_agno, agino); - error = xchk_iget(ragi->sc, ino, &ip); + error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino), &ip); if (error) return ret; @@ -1114,9 +1112,9 @@ xrep_iunlink_igrab( struct xfs_perag *pag, struct xfs_inode *ip) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag)) return false; if (!xfs_inode_on_unlinked_list(ip)) @@ -1140,7 +1138,7 @@ xrep_iunlink_visit( unsigned int bucket; int error; - ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == ragi->sc->sa.pag->pag_agno); + ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == pag_agno(ragi->sc->sa.pag)); ASSERT(xfs_inode_on_unlinked_list(ip)); agino = XFS_INO_TO_AGINO(mp, ip->i_ino); @@ -1171,7 +1169,7 @@ xrep_iunlink_mark_incore( struct xrep_agi *ragi) { struct xfs_perag *pag = ragi->sc->sa.pag; - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); uint32_t first_index = 0; bool done = false; unsigned int nr_found = 0; @@ -1211,7 +1209,7 @@ xrep_iunlink_mark_incore( * us to see this inode, so another lookup from the * same index will not find it 
again. */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag)) continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) @@ -1278,9 +1276,7 @@ xrep_iunlink_mark_ondisk_rec( * on because we haven't actually scrubbed the inobt or the * inodes yet. */ - error = xchk_iget(ragi->sc, - XFS_AGINO_TO_INO(mp, sc->sa.pag->pag_agno, - agino), + error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino), &ip); if (error) continue; @@ -1539,15 +1535,13 @@ xrep_iunlink_relink_next( ip = xfs_iunlink_lookup(pag, agino); if (!ip) { - xfs_ino_t ino; xfs_agino_t prev_agino; /* * No inode exists in cache. Load it off the disk so that we * can reinsert it into the incore unlinked list. */ - ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); - error = xchk_iget(sc, ino, &ip); + error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip); if (error) return -EFSCORRUPTED; @@ -1601,15 +1595,13 @@ xrep_iunlink_relink_prev( ip = xfs_iunlink_lookup(pag, agino); if (!ip) { - xfs_ino_t ino; xfs_agino_t next_agino; /* * No inode exists in cache. Load it off the disk so that we * can reinsert it into the incore unlinked list. */ - ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); - error = xchk_iget(sc, ino, &ip); + error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip); if (error) return -EFSCORRUPTED; @@ -1769,7 +1761,7 @@ xrep_agi( * was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED. */ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AG_DADDR(mp, pag_agno(sc->sa.pag), XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL); if (error) diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index d1b8a4997dd2..8b282138097f 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -139,7 +139,7 @@ xchk_allocbt_rec( struct xchk_alloc *ca = bs->private; xfs_alloc_btrec_to_irec(rec, &irec); - if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { + if (xfs_alloc_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c index 30295898cc8a..bed6a09aa791 100644 --- a/fs/xfs/scrub/alloc_repair.c +++ b/fs/xfs/scrub/alloc_repair.c @@ -132,17 +132,16 @@ int xrep_setup_ag_allocbt( struct xfs_scrub *sc) { + struct xfs_group *xg = pag_group(sc->sa.pag); unsigned int busy_gen; /* * Make sure the busy extent list is clear because we can't put extents * on there twice. */ - busy_gen = READ_ONCE(sc->sa.pag->pagb_gen); - if (xfs_extent_busy_list_empty(sc->sa.pag)) + if (xfs_extent_busy_list_empty(xg, &busy_gen)) return 0; - - return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0); + return xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0); } /* Check for any obvious conflicts in the free extent. 
*/ @@ -210,7 +209,7 @@ xrep_abt_stash( if (error) return error; - trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec); + trace_xrep_abt_found(sc->sa.pag, &arec); error = xfarray_append(ra->free_records, &arec); if (error) @@ -484,8 +483,8 @@ xrep_abt_reserve_space( ASSERT(arec.ar_blockcount <= UINT_MAX); len = min_t(unsigned int, arec.ar_blockcount, desired); - trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno, - arec.ar_startblock, len, XFS_RMAP_OWN_AG); + trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag, arec.ar_startblock, + len, XFS_RMAP_OWN_AG); error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag, arec.ar_startblock, len); @@ -543,8 +542,9 @@ xrep_abt_dispose_one( /* Add a deferred rmap for each extent we used. */ if (resv->used > 0) - xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno, - resv->used, XFS_RMAP_OWN_AG); + xfs_rmap_alloc_extent(sc->tp, false, + xfs_agbno_to_fsb(pag, resv->agbno), resv->used, + XFS_RMAP_OWN_AG); /* * For each reserved btree block we didn't use, add it to the free @@ -554,8 +554,8 @@ xrep_abt_dispose_one( if (free_aglen == 0) return 0; - trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, - free_aglen, ra->new_bnobt.oinfo.oi_owner); + trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen, + ra->new_bnobt.oinfo.oi_owner); error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen, &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true); @@ -849,6 +849,7 @@ xrep_allocbt( { struct xrep_abt *ra; struct xfs_mount *mp = sc->mp; + unsigned int busy_gen; char *descr; int error; @@ -869,7 +870,7 @@ xrep_allocbt( * on there twice. In theory we cleared this before we started, but * let's not risk the filesystem. */ - if (!xfs_extent_busy_list_empty(sc->sa.pag)) { + if (!xfs_extent_busy_list_empty(pag_group(sc->sa.pag), &busy_gen)) { error = -EDEADLOCK; goto out_ra; } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 5ab2ac53c920..4f1e2574660d 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -19,7 +19,10 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_rtgroup.h" #include "xfs_health.h" +#include "xfs_rtalloc.h" +#include "xfs_rtrmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" @@ -142,15 +145,22 @@ static inline bool xchk_bmap_get_rmap( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec, - xfs_agblock_t agbno, + xfs_agblock_t bno, uint64_t owner, struct xfs_rmap_irec *rmap) { + struct xfs_btree_cur **curp = &info->sc->sa.rmap_cur; xfs_fileoff_t offset; unsigned int rflags = 0; int has_rmap; int error; + if (xfs_ifork_is_realtime(info->sc->ip, info->whichfork)) + curp = &info->sc->sr.rmap_cur; + + if (*curp == NULL) + return false; + if (info->whichfork == XFS_ATTR_FORK) rflags |= XFS_RMAP_ATTR_FORK; if (irec->br_state == XFS_EXT_UNWRITTEN) @@ -171,13 +181,13 @@ xchk_bmap_get_rmap( * range rmap lookup to make sure we get the correct owner/offset. 
*/ if (info->is_shared) { - error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno, - owner, offset, rflags, rmap, &has_rmap); + error = xfs_rmap_lookup_le_range(*curp, bno, owner, offset, + rflags, rmap, &has_rmap); } else { - error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, - owner, offset, rflags, rmap, &has_rmap); + error = xfs_rmap_lookup_le(*curp, bno, owner, offset, + rflags, rmap, &has_rmap); } - if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur)) + if (!xchk_should_check_xref(info->sc, &error, curp)) return false; if (!has_rmap) @@ -191,29 +201,29 @@ STATIC void xchk_bmap_xref_rmap( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec, - xfs_agblock_t agbno) + xfs_agblock_t bno) { struct xfs_rmap_irec rmap; unsigned long long rmap_end; uint64_t owner = info->sc->ip->i_ino; - if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm)) + if (xchk_skip_xref(info->sc->sm)) return; /* Find the rmap record for this irec. */ - if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) + if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap)) return; /* * The rmap must be an exact match for this incore file mapping record, * which may have arisen from multiple ondisk records. */ - if (rmap.rm_startblock != agbno) + if (rmap.rm_startblock != bno) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; - if (rmap_end != agbno + irec->br_blockcount) + if (rmap_end != bno + irec->br_blockcount) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -258,7 +268,7 @@ STATIC void xchk_bmap_xref_rmap_cow( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec, - xfs_agblock_t agbno) + xfs_agblock_t bno) { struct xfs_rmap_irec rmap; unsigned long long rmap_end; @@ -268,7 +278,7 @@ xchk_bmap_xref_rmap_cow( return; /* Find the rmap record for this irec. */ - if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) + if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap)) return; /* @@ -276,12 +286,12 @@ xchk_bmap_xref_rmap_cow( * can start before and end after the physical space allocated to this * mapping. There are no offsets to check. 
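The distinction between the two xref helpers is exact-match versus containment: an owned mapping must line up with its rmap record on both ends, while a CoW staging rmap only has to cover the mapping. As a self-contained predicate pair (types and names are illustrative, not the scrub API):

#include <stdbool.h>
#include <stdio.h>

struct ext { unsigned long long start, count; };

/* Owned mappings must line up exactly with their rmap record. */
static bool rmap_matches(struct ext rmap, struct ext map)
{
	return rmap.start == map.start &&
	       rmap.start + rmap.count == map.start + map.count;
}

/* A CoW staging rmap merely has to contain the mapping. */
static bool rmap_contains(struct ext rmap, struct ext map)
{
	return rmap.start <= map.start &&
	       rmap.start + rmap.count >= map.start + map.count;
}

int main(void)
{
	struct ext rmap = { 100, 50 }, map = { 110, 20 };

	/* prints "exact 0 contains 1": fine for CoW, corrupt if owned */
	printf("exact %d contains %d\n",
			rmap_matches(rmap, map), rmap_contains(rmap, map));
	return 0;
}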
*/ - if (rmap.rm_startblock > agbno) + if (rmap.rm_startblock > bno) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; - if (rmap_end < agbno + irec->br_blockcount) + if (rmap_end < bno + irec->br_blockcount) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -314,8 +324,58 @@ xchk_bmap_rt_iextent_xref( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { + struct xfs_owner_info oinfo; + xfs_rgblock_t rgbno; + int error; + + error = xchk_rtgroup_init_existing(info->sc, + xfs_rtb_to_rgno(ip->i_mount, irec->br_startblock), + &info->sc->sr); + if (!xchk_fblock_process_error(info->sc, info->whichfork, + irec->br_startoff, &error)) + return; + + error = xchk_rtgroup_lock(info->sc, &info->sc->sr, XCHK_RTGLOCK_ALL); + if (!xchk_fblock_process_error(info->sc, info->whichfork, + irec->br_startoff, &error)) + goto out_free; + xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, irec->br_blockcount); + + if (!xfs_has_rtrmapbt(info->sc->mp)) + goto out_cur; + + rgbno = xfs_rtb_to_rgbno(info->sc->mp, irec->br_startblock); + + switch (info->whichfork) { + case XFS_DATA_FORK: + xchk_bmap_xref_rmap(info, irec, rgbno); + if (!xfs_is_reflink_inode(info->sc->ip)) { + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, + info->whichfork, irec->br_startoff); + xchk_xref_is_only_rt_owned_by(info->sc, rgbno, + irec->br_blockcount, &oinfo); + xchk_xref_is_not_rt_shared(info->sc, rgbno, + irec->br_blockcount); + } + xchk_xref_is_not_rt_cow_staging(info->sc, rgbno, + irec->br_blockcount); + break; + case XFS_COW_FORK: + xchk_bmap_xref_rmap_cow(info, irec, rgbno); + xchk_xref_is_only_rt_owned_by(info->sc, rgbno, + irec->br_blockcount, &XFS_RMAP_OINFO_COW); + xchk_xref_is_rt_cow_staging(info->sc, rgbno, + irec->br_blockcount); + xchk_xref_is_not_rt_shared(info->sc, rgbno, + irec->br_blockcount); + break; + } +out_cur: + xchk_rtgroup_btcur_free(&info->sc->sr); +out_free: + xchk_rtgroup_free(info->sc, &info->sc->sr); } /* Cross-reference a single datadev extent record. */ @@ -600,9 +660,8 @@ xchk_bmap_check_rmap( if (irec.br_startoff != check_rec.rm_offset) xchk_fblock_set_corrupt(sc, sbcri->whichfork, check_rec.rm_offset); - if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, - cur->bc_ag.pag->pag_agno, - check_rec.rm_startblock)) + if (irec.br_startblock != + xfs_gbno_to_fsb(cur->bc_group, check_rec.rm_startblock)) xchk_fblock_set_corrupt(sc, sbcri->whichfork, check_rec.rm_offset); if (irec.br_blockcount > check_rec.rm_blockcount) @@ -656,6 +715,30 @@ xchk_bmap_check_ag_rmaps( return error; } +/* Make sure each rt rmap has a corresponding bmbt entry. */ +STATIC int +xchk_bmap_check_rt_rmaps( + struct xfs_scrub *sc, + struct xfs_rtgroup *rtg) +{ + struct xchk_bmap_check_rmap_info sbcri; + struct xfs_btree_cur *cur; + int error; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg); + + sbcri.sc = sc; + sbcri.whichfork = XFS_DATA_FORK; + error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); + if (error == -ECANCELED) + error = 0; + + xfs_btree_del_cursor(cur, error); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + return error; +} + /* * Decide if we want to scan the reverse mappings to determine if the attr * fork /really/ has zero space mappings. @@ -710,10 +793,6 @@ xchk_bmap_check_empty_datafork( { struct xfs_ifork *ifp = &ip->i_df; - /* Don't support realtime rmap checks yet. 
*/ - if (XFS_IS_REALTIME_INODE(ip)) - return false; - /* * If the dinode repair found a bad data fork, it will reset the fork * to extents format with zero records and wait for this scrubber @@ -761,11 +840,25 @@ xchk_bmap_check_rmaps( struct xfs_scrub *sc, int whichfork) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; int error; - for_each_perag(sc->mp, agno, pag) { + if (xfs_ifork_is_realtime(sc->ip, whichfork)) { + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) { + error = xchk_bmap_check_rt_rmaps(sc, rtg); + if (error || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + xfs_rtgroup_rele(rtg); + return error; + } + } + + return 0; + } + + while ((pag = xfs_perag_next(sc->mp, pag))) { error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { @@ -822,9 +915,12 @@ xchk_bmap_iext_mapping( /* Are these two mappings contiguous with each other? */ static inline bool xchk_are_bmaps_contiguous( + const struct xchk_bmap_info *info, const struct xfs_bmbt_irec *b1, const struct xfs_bmbt_irec *b2) { + struct xfs_mount *mp = info->sc->mp; + /* Don't try to combine unallocated mappings. */ if (!xfs_bmap_is_real_extent(b1)) return false; @@ -838,6 +934,17 @@ xchk_are_bmaps_contiguous( return false; if (b1->br_state != b2->br_state) return false; + + /* + * Don't combine bmaps that would cross rtgroup boundaries. This is a + * valid state, but if combined they will fail rtb extent checks. + */ + if (info->is_rt && xfs_has_rtgroups(mp)) { + if (xfs_rtb_to_rgno(mp, b1->br_startblock) != + xfs_rtb_to_rgno(mp, b2->br_startblock)) + return false; + } + return true; } @@ -875,7 +982,7 @@ xchk_bmap_iext_iter( * that we just read, if possible. */ while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) { - if (!xchk_are_bmaps_contiguous(irec, &got)) + if (!xchk_are_bmaps_contiguous(info, irec, &got)) break; if (!xchk_bmap_iext_mapping(info, &got)) { @@ -931,8 +1038,8 @@ xchk_bmap( switch (whichfork) { case XFS_COW_FORK: - /* No CoW forks on non-reflink filesystems. */ - if (!xfs_has_reflink(mp)) { + /* No CoW forks on filesystems that don't support out of place writes */ + if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); return 0; } @@ -957,6 +1064,7 @@ xchk_bmap( case XFS_DINODE_FMT_UUID: case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_META_BTREE: /* No mappings to check. 
*/ if (whichfork == XFS_COW_FORK) xchk_fblock_set_corrupt(sc, whichfork, 0); diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index 4505f4829d53..1084213b8e9b 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -25,11 +25,13 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_rtrmap_btree.h" #include "xfs_refcount.h" #include "xfs_quota.h" #include "xfs_ialloc.h" #include "xfs_ag.h" #include "xfs_reflink.h" +#include "xfs_rtgroup.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -99,14 +101,21 @@ xrep_bmap_discover_shared( xfs_filblks_t blockcount) { struct xfs_scrub *sc = rb->sc; + struct xfs_btree_cur *cur; xfs_agblock_t agbno; xfs_agblock_t fbno; xfs_extlen_t flen; int error; - agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); - error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount, - &fbno, &flen, false); + if (XFS_IS_REALTIME_INODE(sc->ip)) { + agbno = xfs_rtb_to_rgbno(sc->mp, startblock); + cur = sc->sr.refc_cur; + } else { + agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); + cur = sc->sa.refc_cur; + } + error = xfs_refcount_find_shared(cur, agbno, blockcount, &fbno, &flen, + false); if (error) return error; @@ -196,7 +205,7 @@ xrep_bmap_check_fork_rmap( return -EFSCORRUPTED; /* Check that this is within the AG. */ - if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock, + if (!xfs_verify_agbext(to_perag(cur->bc_group), rec->rm_startblock, rec->rm_blockcount)) return -EFSCORRUPTED; @@ -237,7 +246,6 @@ xrep_bmap_walk_rmap( void *priv) { struct xrep_bmap *rb = priv; - struct xfs_mount *mp = cur->bc_mp; xfs_fsblock_t fsbno; int error = 0; @@ -269,8 +277,7 @@ xrep_bmap_walk_rmap( if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten) return -EFSCORRUPTED; - fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, - rec->rm_startblock); + fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group), rec->rm_startblock); if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) { rb->old_bmbt_block_count += rec->rm_blockcount; @@ -361,6 +368,114 @@ xrep_bmap_scan_ag( return error; } +#ifdef CONFIG_XFS_RT +/* Check for any obvious errors or conflicts in the file mapping. */ +STATIC int +xrep_bmap_check_rtfork_rmap( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec) +{ + /* xattr extents are never stored on realtime devices */ + if (rec->rm_flags & XFS_RMAP_ATTR_FORK) + return -EFSCORRUPTED; + + /* bmbt blocks are never stored on realtime devices */ + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) + return -EFSCORRUPTED; + + /* Data extents for non-rt files are never stored on the rt device. */ + if (!XFS_IS_REALTIME_INODE(sc->ip)) + return -EFSCORRUPTED; + + /* Check the file offsets and physical extents. */ + if (!xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Check that this is within the rtgroup. */ + if (!xfs_verify_rgbext(to_rtg(cur->bc_group), rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + return xrep_require_rtext_inuse(sc, rec->rm_startblock, + rec->rm_blockcount); +} + +/* Record realtime extents that belong to this inode's fork. */ +STATIC int +xrep_bmap_walk_rtrmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_bmap *rb = priv; + int error = 0; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + /* Skip extents which are not owned by this inode and fork. 
*/ + if (rec->rm_owner != rb->sc->ip->i_ino) + return 0; + + error = xrep_bmap_check_rtfork_rmap(rb->sc, cur, rec); + if (error) + return error; + + /* + * Record all blocks allocated to this file even if the extent isn't + * for the fork we're rebuilding so that we can reset di_nblocks later. + */ + rb->nblocks += rec->rm_blockcount; + + /* If this rmap isn't for the fork we want, we're done. */ + if (rb->whichfork == XFS_DATA_FORK && + (rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + if (rb->whichfork == XFS_ATTR_FORK && + !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + + return xrep_bmap_from_rmap(rb, rec->rm_offset, + xfs_rgbno_to_rtb(to_rtg(cur->bc_group), + rec->rm_startblock), + rec->rm_blockcount, + rec->rm_flags & XFS_RMAP_UNWRITTEN); +} + +/* Scan the realtime reverse mappings to build the new extent map. */ +STATIC int +xrep_bmap_scan_rtgroup( + struct xrep_bmap *rb, + struct xfs_rtgroup *rtg) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + if (!xfs_has_rtrmapbt(sc->mp)) + return 0; + + error = xrep_rtgroup_init(sc, rtg, &sc->sr, + XFS_RTGLOCK_RMAP | + XFS_RTGLOCK_REFCOUNT | + XFS_RTGLOCK_BITMAP_SHARED); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_bmap_walk_rtrmap, rb); + xchk_rtgroup_btcur_free(&sc->sr); + xchk_rtgroup_free(sc, &sc->sr); + return error; +} +#else +static inline int +xrep_bmap_scan_rtgroup(struct xrep_bmap *rb, struct xfs_rtgroup *rtg) +{ + return -EFSCORRUPTED; +} +#endif + /* Find the delalloc extents from the old incore extent tree. */ STATIC int xrep_bmap_find_delalloc( @@ -409,12 +524,27 @@ xrep_bmap_find_mappings( struct xrep_bmap *rb) { struct xfs_scrub *sc = rb->sc; - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; int error = 0; + /* + * Iterate the rtrmaps for extents. Metadata files never have content + * on the realtime device, so there's no need to scan them. + */ + if (!xfs_is_metadir_inode(sc->ip)) { + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) { + error = xrep_bmap_scan_rtgroup(rb, rtg); + if (error) { + xfs_rtgroup_rele(rtg); + return error; + } + } + } + /* Iterate the rmaps for extents. */ - for_each_perag(sc->mp, agno, pag) { + while ((pag = xfs_perag_next(sc->mp, pag))) { error = xrep_bmap_scan_ag(rb, pag); if (error) { xfs_perag_rele(pag); @@ -734,6 +864,7 @@ xrep_bmap_check_inputs( case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_LOCAL: case XFS_DINODE_FMT_UUID: + case XFS_DINODE_FMT_META_BTREE: return -ECANCELED; case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -756,10 +887,6 @@ xrep_bmap_check_inputs( return -EINVAL; } - /* Don't know how to rebuild realtime data forks. 
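 *
 * Editor's note (not part of the patch): xrep_bmap_find_mappings() above
 * leans on the xfs_rtgroup_next() iterator, which returns each group with
 * a reference held and drops it on the next call; only an early exit has
 * to call xfs_rtgroup_rele() itself. The pattern, sketched with a
 * placeholder do_something() for the per-group work:
 *
 *	struct xfs_rtgroup	*rtg = NULL;
 *
 *	while ((rtg = xfs_rtgroup_next(mp, rtg))) {
 *		error = do_something(rb, rtg);
 *		if (error) {
 *			xfs_rtgroup_rele(rtg);
 *			return error;
 *		}
 *	}
 *	return 0;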
*/ - if (XFS_IS_REALTIME_INODE(sc->ip)) - return -EOPNOTSUPP; - return 0; } @@ -785,10 +912,6 @@ xrep_bmap_init_reflink_scan( if (whichfork != XFS_DATA_FORK) return RLS_IRRELEVANT; - /* cannot share realtime extents */ - if (XFS_IS_REALTIME_INODE(sc->ip)) - return RLS_IRRELEVANT; - return RLS_UNKNOWN; } diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 22f5f1a9d3f0..28ad341df8ee 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -34,11 +34,16 @@ #include "xfs_quota.h" #include "xfs_exchmaps.h" #include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_bmap_util.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/health.h" +#include "scrub/tempfile.h" /* Common code for the metadata scrubbers. */ @@ -121,6 +126,17 @@ xchk_process_error( } bool +xchk_process_rt_error( + struct xfs_scrub *sc, + xfs_rgnumber_t rgno, + xfs_rgblock_t rgbno, + int *error) +{ + return __xchk_process_error(sc, rgno, rgbno, error, + XFS_SCRUB_OFLAG_CORRUPT, __return_address); +} + +bool xchk_xref_process_error( struct xfs_scrub *sc, xfs_agnumber_t agno, @@ -513,7 +529,7 @@ xchk_perag_drain_and_lock( * Obviously, this should be slanted against scrub and in favor * of runtime threads. */ - if (!xfs_perag_intent_busy(sa->pag)) + if (!xfs_group_intent_busy(pag_group(sa->pag))) return 0; if (sa->agf_bp) { @@ -528,7 +544,7 @@ xchk_perag_drain_and_lock( if (!(sc->flags & XCHK_FSGATES_DRAIN)) return -ECHRNG; - error = xfs_perag_intent_drain(sa->pag); + error = xfs_group_intent_drain(pag_group(sa->pag)); if (error == -ERESTARTSYS) error = -EINTR; } while (!error); @@ -683,6 +699,163 @@ xchk_ag_init( return 0; } +#ifdef CONFIG_XFS_RT +/* + * For scrubbing a realtime group, grab all the in-core resources we'll need to + * check the metadata, which means taking the ILOCK of the realtime group's + * metadata inodes. Callers must not join these inodes to the transaction with + * non-zero lockflags or concurrency problems will result. The @rtglock_flags + * argument takes XFS_RTGLOCK_* flags. + */ +int +xchk_rtgroup_init( + struct xfs_scrub *sc, + xfs_rgnumber_t rgno, + struct xchk_rt *sr) +{ + ASSERT(sr->rtg == NULL); + ASSERT(sr->rtlock_flags == 0); + + sr->rtg = xfs_rtgroup_get(sc->mp, rgno); + if (!sr->rtg) + return -ENOENT; + return 0; +} + +/* Lock all the rt group metadata inode ILOCKs and wait for intents. */ +int +xchk_rtgroup_lock( + struct xfs_scrub *sc, + struct xchk_rt *sr, + unsigned int rtglock_flags) +{ + int error = 0; + + ASSERT(sr->rtg != NULL); + + /* + * If we're /only/ locking the rtbitmap in shared mode, then we're + * obviously not trying to compare records in two metadata inodes. + * There's no need to drain intents here because the caller (most + * likely the rgsuper scanner) doesn't need that level of consistency. + */ + if (rtglock_flags == XFS_RTGLOCK_BITMAP_SHARED) { + xfs_rtgroup_lock(sr->rtg, rtglock_flags); + sr->rtlock_flags = rtglock_flags; + return 0; + } + + do { + if (xchk_should_terminate(sc, &error)) + return error; + + xfs_rtgroup_lock(sr->rtg, rtglock_flags); + + /* + * If we've grabbed a non-metadata file for scrubbing, we + * assume that holding its ILOCK will suffice to coordinate + * with any rt intent chains involving this inode. + */ + if (sc->ip && !xfs_is_internal_inode(sc->ip)) + break; + + /* + * Decide if the rt group is quiet enough for all metadata to + * be consistent with each other. 
Regular file IO doesn't get + * to lock all the rt inodes at the same time, which means that + * there could be other threads in the middle of processing a + * chain of deferred ops. + * + * We just locked all the metadata inodes for this rt group; + * now take a look to see if there are any intents in progress. + * If there are, drop the rt group inode locks and wait for the + * intents to drain. Since we hold the rt group inode locks + * for the duration of the scrub, this is the only time we have + * to sample the intents counter; any threads increasing it + * after this point can't possibly be in the middle of a chain + * of rt metadata updates. + * + * Obviously, this should be slanted against scrub and in favor + * of runtime threads. + */ + if (!xfs_group_intent_busy(rtg_group(sr->rtg))) + break; + + xfs_rtgroup_unlock(sr->rtg, rtglock_flags); + + if (!(sc->flags & XCHK_FSGATES_DRAIN)) + return -ECHRNG; + error = xfs_group_intent_drain(rtg_group(sr->rtg)); + if (error) { + if (error == -ERESTARTSYS) + error = -EINTR; + return error; + } + } while (1); + + sr->rtlock_flags = rtglock_flags; + + if (xfs_has_rtrmapbt(sc->mp) && (rtglock_flags & XFS_RTGLOCK_RMAP)) + sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg); + + if (xfs_has_rtreflink(sc->mp) && (rtglock_flags & XFS_RTGLOCK_REFCOUNT)) + sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg); + + return 0; +} + +/* + * Free all the btree cursors and other incore data relating to the realtime + * group. This has to be done /before/ committing (or cancelling) the scrub + * transaction. + */ +void +xchk_rtgroup_btcur_free( + struct xchk_rt *sr) +{ + if (sr->rmap_cur) + xfs_btree_del_cursor(sr->rmap_cur, XFS_BTREE_ERROR); + if (sr->refc_cur) + xfs_btree_del_cursor(sr->refc_cur, XFS_BTREE_ERROR); + + sr->refc_cur = NULL; + sr->rmap_cur = NULL; +} + +/* + * Unlock the realtime group. This must be done /after/ committing (or + * cancelling) the scrub transaction. + */ +void +xchk_rtgroup_unlock( + struct xchk_rt *sr) +{ + ASSERT(sr->rtg != NULL); + + if (sr->rtlock_flags) { + xfs_rtgroup_unlock(sr->rtg, sr->rtlock_flags); + sr->rtlock_flags = 0; + } +} + +/* + * Unlock the realtime group and release its resources. This must be done + * /after/ committing (or cancelling) the scrub transaction. + */ +void +xchk_rtgroup_free( + struct xfs_scrub *sc, + struct xchk_rt *sr) +{ + ASSERT(sr->rtg != NULL); + + xchk_rtgroup_unlock(sr); + + xfs_rtgroup_put(sr->rtg); + sr->rtg = NULL; +} +#endif /* CONFIG_XFS_RT */ + /* Per-scrubber setup functions */ void @@ -733,6 +906,14 @@ xchk_setup_fs( return xchk_trans_alloc(sc, resblks); } +/* Set us up with a transaction and an empty context to repair rt metadata. */ +int +xchk_setup_rt( + struct xfs_scrub *sc) +{ + return xchk_trans_alloc(sc, xrep_calc_rtgroup_resblks(sc)); +} + /* Set us up with AG headers and btree cursors. */ int xchk_setup_ag_btree( @@ -947,9 +1128,15 @@ xchk_iget_for_scrubbing( if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) return xchk_install_live_inode(sc, ip_in); - /* Reject internal metadata files and obviously bad inode numbers. */ - if (xfs_internal_inum(mp, sc->sm->sm_ino)) + /* + * On pre-metadir filesystems, reject internal metadata files. For + * metadir filesystems, limited scrubbing of any file in the metadata + * directory tree by handle is allowed, because that is the only way to + * validate the lack of parent pointers in the sb-root metadata inodes. 
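 *
 * Editor's note (not part of the patch): a typical consumer of the new
 * rtgroup helpers above strings them together the way this patchset's rt
 * extent cross-referencing does, freeing the btree cursors before the
 * scrub transaction commits and dropping the group reference only after.
 * A condensed sketch, where rgno stands in for the caller's group number:
 *
 *	error = xchk_rtgroup_init_existing(sc, rgno, &sc->sr);
 *	if (error)
 *		return error;
 *	error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
 *	if (error)
 *		goto out_free;
 *	...use sc->sr.rmap_cur and sc->sr.refc_cur...
 *	xchk_rtgroup_btcur_free(&sc->sr);
 * out_free:
 *	xchk_rtgroup_free(sc, &sc->sr);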
+ */ + if (!xfs_has_metadir(mp) && xfs_is_sb_inum(mp, sc->sm->sm_ino)) return -ENOENT; + /* Reject obviously bad inode numbers. */ if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; @@ -1084,6 +1271,10 @@ xchk_setup_inode_contents( if (error) return error; + error = xrep_tempfile_adjust_directory_tree(sc); + if (error) + return error; + /* Lock the inode so the VFS cannot touch this file. */ xchk_ilock(sc, XFS_IOLOCK_EXCL); @@ -1239,12 +1430,6 @@ xchk_metadata_inode_forks( return 0; } - /* They also should never have extended attributes. */ - if (xfs_inode_hasattr(sc->ip)) { - xchk_ino_set_corrupt(sc, sc->ip->i_ino); - return 0; - } - /* Invoke the data fork scrubber. */ error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) @@ -1261,6 +1446,21 @@ xchk_metadata_inode_forks( xchk_ino_set_corrupt(sc, sc->ip->i_ino); } + /* + * Metadata files can only have extended attributes on metadir + * filesystems, either for parent pointers or for actual xattr data. + */ + if (xfs_inode_hasattr(sc->ip)) { + if (!xfs_has_metadir(sc->mp)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + } + return 0; } @@ -1281,7 +1481,7 @@ xchk_fsgates_enable( trace_xchk_fsgates_enable(sc, scrub_fsgates); if (scrub_fsgates & XCHK_FSGATES_DRAIN) - xfs_drain_wait_enable(); + xfs_defer_drain_wait_enable(); if (scrub_fsgates & XCHK_FSGATES_QUOTA) xfs_dqtrx_hook_enable(); @@ -1336,7 +1536,7 @@ xchk_inode_is_allocated( } /* reject inode numbers outside existing AGs */ - ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); + ino = xfs_agino_to_ino(pag, agino); if (!xfs_verify_ino(mp, ino)) return -EINVAL; @@ -1446,3 +1646,92 @@ out_rcu: rcu_read_unlock(); return error; } + +/* Is this inode a root directory for either tree? */ +bool +xchk_inode_is_dirtree_root(const struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + return ip == mp->m_rootip || + (xfs_has_metadir(mp) && ip == mp->m_metadirip); +} + +/* Does the superblock point down to this inode? */ +bool +xchk_inode_is_sb_rooted(const struct xfs_inode *ip) +{ + return xchk_inode_is_dirtree_root(ip) || + xfs_is_sb_inum(ip->i_mount, ip->i_ino); +} + +/* What is the root directory inumber for this inode? */ +xfs_ino_t +xchk_inode_rootdir_inum(const struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + if (xfs_is_metadir_inode(ip)) + return mp->m_metadirip->i_ino; + return mp->m_rootip->i_ino; +} + +static int +xchk_meta_btree_count_blocks( + struct xfs_scrub *sc, + xfs_extnum_t *nextents, + xfs_filblks_t *count) +{ + struct xfs_btree_cur *cur; + int error; + + if (!sc->sr.rtg) { + ASSERT(0); + return -EFSCORRUPTED; + } + + switch (sc->ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg); + break; + case XFS_METAFILE_RTREFCOUNT: + cur = xfs_rtrefcountbt_init_cursor(sc->tp, sc->sr.rtg); + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xfs_btree_count_blocks(cur, count); + xfs_btree_del_cursor(cur, error); + if (!error) { + *nextents = 0; + (*count)--; /* don't count the btree iroot */ + } + return error; +} + +/* Count the blocks used by a file, even if it's a metadata inode. 
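 *
 * Editor's note (not part of the patch): callers treat this helper as a
 * drop-in for xfs_bmap_count_blocks() that also understands meta-btree
 * forks; the inode scrubber later in this series uses it like so:
 *
 *	error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents,
 *			&count);
 *	if (!xchk_should_check_xref(sc, &error, NULL))
 *		return;
 *	if (nextents < xfs_dfork_data_extents(dip))
 *		xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
 *
 * For an XFS_DINODE_FMT_META_BTREE data fork the helper counts the btree
 * blocks minus the incore root and reports zero extents, as the function
 * body below shows.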
*/ +int +xchk_inode_count_blocks( + struct xfs_scrub *sc, + int whichfork, + xfs_extnum_t *nextents, + xfs_filblks_t *count) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); + + if (!ifp) { + *nextents = 0; + *count = 0; + return 0; + } + + if (ifp->if_format == XFS_DINODE_FMT_META_BTREE) { + ASSERT(whichfork == XFS_DATA_FORK); + return xchk_meta_btree_count_blocks(sc, nextents, count); + } + + return xfs_bmap_count_blocks(sc->tp, sc->ip, whichfork, nextents, + count); +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 47148cc4a833..19877d99f255 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -12,6 +12,8 @@ void xchk_trans_cancel(struct xfs_scrub *sc); bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int *error); +bool xchk_process_rt_error(struct xfs_scrub *sc, xfs_rgnumber_t rgno, + xfs_rgblock_t rgbno, int *error); bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork, xfs_fileoff_t offset, int *error); @@ -61,6 +63,7 @@ static inline int xchk_setup_nothing(struct xfs_scrub *sc) /* Setup functions */ int xchk_setup_agheader(struct xfs_scrub *sc); int xchk_setup_fs(struct xfs_scrub *sc); +int xchk_setup_rt(struct xfs_scrub *sc); int xchk_setup_ag_allocbt(struct xfs_scrub *sc); int xchk_setup_ag_iallocbt(struct xfs_scrub *sc); int xchk_setup_ag_rmapbt(struct xfs_scrub *sc); @@ -73,12 +76,19 @@ int xchk_setup_xattr(struct xfs_scrub *sc); int xchk_setup_symlink(struct xfs_scrub *sc); int xchk_setup_parent(struct xfs_scrub *sc); int xchk_setup_dirtree(struct xfs_scrub *sc); +int xchk_setup_metapath(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xchk_setup_rtbitmap(struct xfs_scrub *sc); int xchk_setup_rtsummary(struct xfs_scrub *sc); +int xchk_setup_rgsuperblock(struct xfs_scrub *sc); +int xchk_setup_rtrmapbt(struct xfs_scrub *sc); +int xchk_setup_rtrefcountbt(struct xfs_scrub *sc); #else # define xchk_setup_rtbitmap xchk_setup_nothing # define xchk_setup_rtsummary xchk_setup_nothing +# define xchk_setup_rgsuperblock xchk_setup_nothing +# define xchk_setup_rtrmapbt xchk_setup_nothing +# define xchk_setup_rtrefcountbt xchk_setup_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_ino_dqattach(struct xfs_scrub *sc); @@ -117,6 +127,41 @@ xchk_ag_init_existing( return error == -ENOENT ? -EFSCORRUPTED : error; } +#ifdef CONFIG_XFS_RT + +/* All the locks we need to check an rtgroup. */ +#define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ + XFS_RTGLOCK_RMAP | \ + XFS_RTGLOCK_REFCOUNT) + +int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno, + struct xchk_rt *sr); + +static inline int +xchk_rtgroup_init_existing( + struct xfs_scrub *sc, + xfs_rgnumber_t rgno, + struct xchk_rt *sr) +{ + int error = xchk_rtgroup_init(sc, rgno, sr); + + return error == -ENOENT ? 
-EFSCORRUPTED : error; +} + +int xchk_rtgroup_lock(struct xfs_scrub *sc, struct xchk_rt *sr, + unsigned int rtglock_flags); +void xchk_rtgroup_unlock(struct xchk_rt *sr); +void xchk_rtgroup_btcur_free(struct xchk_rt *sr); +void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr); +#else +# define xchk_rtgroup_init(sc, rgno, sr) (-EFSCORRUPTED) +# define xchk_rtgroup_init_existing(sc, rgno, sr) (-EFSCORRUPTED) +# define xchk_rtgroup_lock(sc, sr, lockflags) (-EFSCORRUPTED) +# define xchk_rtgroup_unlock(sr) do { } while (0) +# define xchk_rtgroup_btcur_free(sr) do { } while (0) +# define xchk_rtgroup_free(sc, sr) do { } while (0) +#endif /* CONFIG_XFS_RT */ + int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno, struct xchk_ag *sa); void xchk_ag_btcur_free(struct xchk_ag *sa); @@ -179,7 +224,6 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) bool xchk_dir_looks_zapped(struct xfs_inode *dp); bool xchk_pptr_looks_zapped(struct xfs_inode *ip); -#ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. */ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) { @@ -199,10 +243,6 @@ static inline bool xchk_could_repair(const struct xfs_scrub *sc) return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !(sc->flags & XREP_ALREADY_FIXED); } -#else -# define xchk_needs_repair(sc) (false) -# define xchk_could_repair(sc) (false) -#endif /* CONFIG_XFS_ONLINE_REPAIR */ int xchk_metadata_inode_forks(struct xfs_scrub *sc); @@ -216,13 +256,20 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc); #define xchk_xfile_ag_descr(sc, fmt, ...) \ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \ (sc)->mp->m_super->s_id, \ - (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \ + (sc)->sa.pag ? \ + pag_agno((sc)->sa.pag) : (sc)->sm->sm_agno, \ ##__VA_ARGS__) #define xchk_xfile_ino_descr(sc, fmt, ...) \ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \ (sc)->mp->m_super->s_id, \ (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \ ##__VA_ARGS__) +#define xchk_xfile_rtgroup_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): rtgroup 0x%x " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->sa.pag ? 
\ + rtg_rgno((sc)->sr.rtg) : (sc)->sm->sm_agno, \ + ##__VA_ARGS__) /* * Setting up a hook to wait for intents to drain is costly -- we have to take @@ -240,5 +287,11 @@ void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks); int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino, bool *inuse); +int xchk_inode_count_blocks(struct xfs_scrub *sc, int whichfork, + xfs_extnum_t *nextents, xfs_filblks_t *count); + +bool xchk_inode_is_dirtree_root(const struct xfs_inode *ip); +bool xchk_inode_is_sb_rooted(const struct xfs_inode *ip); +xfs_ino_t xchk_inode_rootdir_inum(const struct xfs_inode *ip); #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index 4de3f0f40f48..38a246b8bf11 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -26,6 +26,9 @@ #include "xfs_errortag.h" #include "xfs_icache.h" #include "xfs_refcount_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -34,6 +37,7 @@ #include "scrub/bitmap.h" #include "scrub/off_bitmap.h" #include "scrub/fsb_bitmap.h" +#include "scrub/rtb_bitmap.h" #include "scrub/reap.h" /* @@ -61,7 +65,10 @@ struct xrep_cow { struct xoff_bitmap bad_fileoffs; /* Bitmap of fsblocks that were removed from the CoW fork. */ - struct xfsb_bitmap old_cowfork_fsblocks; + union { + struct xfsb_bitmap old_cowfork_fsblocks; + struct xrtb_bitmap old_cowfork_rtblocks; + }; /* CoW fork mappings used to scan for bad CoW staging extents. */ struct xfs_bmbt_irec irec; @@ -137,7 +144,6 @@ xrep_cow_mark_shared_staging( { struct xrep_cow *xc = priv; struct xfs_refcount_irec rrec; - xfs_fsblock_t fsbno; if (!xfs_refcount_check_domain(rec) || rec->rc_domain != XFS_REFC_DOMAIN_SHARED) @@ -145,9 +151,9 @@ xrep_cow_mark_shared_staging( xrep_cow_trim_refcount(xc, &rrec, rec); - fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, - rrec.rc_startblock); - return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount); + return xrep_cow_mark_file_range(xc, + xfs_gbno_to_fsb(cur->bc_group, rrec.rc_startblock), + rrec.rc_blockcount); } /* @@ -178,8 +184,7 @@ xrep_cow_mark_missing_staging( goto next; error = xrep_cow_mark_file_range(xc, - XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, - xc->next_bno), + xfs_gbno_to_fsb(cur->bc_group, xc->next_bno), rrec.rc_startblock - xc->next_bno); if (error) return error; @@ -200,7 +205,6 @@ xrep_cow_mark_missing_staging_rmap( void *priv) { struct xrep_cow *xc = priv; - xfs_fsblock_t fsbno; xfs_agblock_t rec_bno; xfs_extlen_t rec_len; unsigned int adj; @@ -222,8 +226,8 @@ xrep_cow_mark_missing_staging_rmap( rec_len -= adj; } - fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno); - return xrep_cow_mark_file_range(xc, fsbno, rec_len); + return xrep_cow_mark_file_range(xc, + xfs_gbno_to_fsb(cur->bc_group, rec_bno), rec_len); } /* @@ -275,8 +279,7 @@ xrep_cow_find_bad( if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { error = xrep_cow_mark_file_range(xc, - XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, - xc->next_bno), + xfs_agbno_to_fsb(pag, xc->next_bno), xc->irec_startbno + xc->irec.br_blockcount - xc->next_bno); if (error) @@ -312,6 +315,92 @@ out_pag: } /* + * Find any part of the CoW fork mapping that isn't a single-owner CoW staging + * extent and mark the corresponding part of the file range in the bitmap. 
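 *
 * Editor's note (not part of the patch): the rt variant below mirrors
 * xrep_cow_find_bad() and is built from three bracketed range queries
 * against the rtgroup's refcount and rmap btrees. The refcount queries
 * are set up like this, where start and len stand in for the mapping
 * being checked and mark_fn for the per-record callback:
 *
 *	rc_low.rc_startblock = start;
 *	rc_high.rc_startblock = start + len - 1;
 *	rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
 *	error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low,
 *			&rc_high, mark_fn, xc);
 *
 * One pass flags shared staging extents, one flags gaps in the CoW
 * domain, and one flags rmaps that are not CoW staging records.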
+ */ +STATIC int +xrep_cow_find_bad_rt( + struct xrep_cow *xc) +{ + struct xfs_refcount_irec rc_low = { 0 }; + struct xfs_refcount_irec rc_high = { 0 }; + struct xfs_rmap_irec rm_low = { 0 }; + struct xfs_rmap_irec rm_high = { 0 }; + struct xfs_scrub *sc = xc->sc; + struct xfs_rtgroup *rtg; + int error = 0; + + xc->irec_startbno = xfs_rtb_to_rgbno(sc->mp, xc->irec.br_startblock); + + rtg = xfs_rtgroup_get(sc->mp, + xfs_rtb_to_rgno(sc->mp, xc->irec.br_startblock)); + if (!rtg) + return -EFSCORRUPTED; + + error = xrep_rtgroup_init(sc, rtg, &sc->sr, + XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT); + if (error) + goto out_rtg; + + /* Mark any CoW fork extents that are shared. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED; + error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_shared_staging, xc); + if (error) + goto out_sr; + + /* Make sure there are CoW staging extents for the whole mapping. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW; + xc->next_bno = xc->irec_startbno; + error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_missing_staging, xc); + if (error) + goto out_sr; + + if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { + error = xrep_cow_mark_file_range(xc, + xfs_rgbno_to_rtb(rtg, xc->next_bno), + xc->irec_startbno + xc->irec.br_blockcount - + xc->next_bno); + if (error) + goto out_sr; + } + + /* Mark any area that has an rmap that isn't a COW staging extent. */ + rm_low.rm_startblock = xc->irec_startbno; + memset(&rm_high, 0xFF, sizeof(rm_high)); + rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + error = xfs_rmap_query_range(sc->sr.rmap_cur, &rm_low, &rm_high, + xrep_cow_mark_missing_staging_rmap, xc); + if (error) + goto out_sr; + + /* + * If userspace is forcing us to rebuild the CoW fork or someone + * turned on the debugging knob, replace everything in the + * CoW fork and then scan for staging extents in the refcountbt. + */ + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, + xc->irec.br_blockcount); + if (error) + goto out_rtg; + } + +out_sr: + xchk_rtgroup_btcur_free(&sc->sr); + xchk_rtgroup_free(sc, &sc->sr); +out_rtg: + xfs_rtgroup_put(rtg); + return error; +} + +/* * Allocate a replacement CoW staging extent of up to the given number of * blocks, and fill out the mapping. */ @@ -344,7 +433,7 @@ xrep_cow_alloc( if (args.fsbno == NULLFSBLOCK) return -ENOSPC; - xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len); + xfs_refcount_alloc_cow_extent(sc->tp, false, args.fsbno, args.len); repl->fsbno = args.fsbno; repl->len = args.len; @@ -352,6 +441,32 @@ } /* + * Allocate a replacement rt CoW staging extent of up to the given number of + * blocks, and fill out the mapping. 
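 *
 * Editor's note (not part of the patch): xrep_cow_replace_range() picks
 * an allocator at run time, so the rt variant below must fill out the
 * same struct xrep_cow_extent as the data-device version above:
 *
 *	if (XFS_IS_REALTIME_INODE(sc->ip))
 *		error = xrep_cow_alloc_rt(sc, alloc_len, &repl);
 *	else
 *		error = xrep_cow_alloc(sc, alloc_len, &repl);
 *
 * Unlike xrep_cow_alloc(), the rt version first grows the transaction's
 * rtextent reservation with xfs_trans_reserve_more() before calling the
 * rtgroup allocator.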
+ */ +STATIC int +xrep_cow_alloc_rt( + struct xfs_scrub *sc, + xfs_extlen_t maxlen, + struct xrep_cow_extent *repl) +{ + xfs_rtxlen_t maxrtx = xfs_rtb_to_rtx(sc->mp, maxlen); + int error; + + error = xfs_trans_reserve_more(sc->tp, 0, maxrtx); + if (error) + return error; + + error = xfs_rtallocate_rtgs(sc->tp, NULLRTBLOCK, 1, maxrtx, 1, false, + false, &repl->fsbno, &repl->len); + if (error) + return error; + + xfs_refcount_alloc_cow_extent(sc->tp, true, repl->fsbno, repl->len); + return 0; +} + +/* * Look up the current CoW fork mapping so that we only allocate enough to * replace a single mapping. If we don't find a mapping that covers the start * of the file range, or we find a delalloc or written extent, something is @@ -468,7 +583,10 @@ xrep_cow_replace_range( */ alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN, nextoff - startoff); - error = xrep_cow_alloc(sc, alloc_len, &repl); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrep_cow_alloc_rt(sc, alloc_len, &repl); + else + error = xrep_cow_alloc(sc, alloc_len, &repl); if (error) return error; @@ -484,8 +602,12 @@ xrep_cow_replace_range( return error; /* Note the old CoW staging extents; we'll reap them all later. */ - error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock, - repl.len); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrtb_bitmap_set(&xc->old_cowfork_rtblocks, + got.br_startblock, repl.len); + else + error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, + got.br_startblock, repl.len); if (error) return error; @@ -541,8 +663,16 @@ xrep_bmap_cow( if (!ifp) return 0; - /* realtime files aren't supported yet */ - if (XFS_IS_REALTIME_INODE(sc->ip)) + /* + * Realtime files with large extent sizes are not supported because + * we could encounter a CoW mapping that has been partially written + * out *and* requires replacement, and there's no solution to that. + */ + if (xfs_inode_has_bigrtalloc(sc->ip)) + return -EOPNOTSUPP; + + /* Metadata inodes aren't supposed to have data on the rt volume. */ + if (xfs_is_metadir_inode(sc->ip) && XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; /* @@ -563,7 +693,10 @@ xrep_bmap_cow( xc->sc = sc; xoff_bitmap_init(&xc->bad_fileoffs); - xfsb_bitmap_init(&xc->old_cowfork_fsblocks); + if (XFS_IS_REALTIME_INODE(sc->ip)) + xrtb_bitmap_init(&xc->old_cowfork_rtblocks); + else + xfsb_bitmap_init(&xc->old_cowfork_fsblocks); for_each_xfs_iext(ifp, &icur, &xc->irec) { if (xchk_should_terminate(sc, &error)) @@ -586,7 +719,10 @@ xrep_bmap_cow( if (xfs_bmap_is_written_extent(&xc->irec)) continue; - error = xrep_cow_find_bad(xc); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrep_cow_find_bad_rt(xc); + else + error = xrep_cow_find_bad(xc); if (error) goto out_bitmap; } @@ -601,13 +737,20 @@ xrep_bmap_cow( * by the refcount btree, not the inode, so it is correct to treat them * like inode metadata. 
*/ - error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, - &XFS_RMAP_OINFO_COW); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrep_reap_rtblocks(sc, &xc->old_cowfork_rtblocks, + &XFS_RMAP_OINFO_COW); + else + error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, + &XFS_RMAP_OINFO_COW); if (error) goto out_bitmap; out_bitmap: - xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); + if (XFS_IS_REALTIME_INODE(sc->ip)) + xrtb_bitmap_destroy(&xc->old_cowfork_rtblocks); + else + xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); xoff_bitmap_destroy(&xc->bad_fileoffs); kfree(xc); return error; diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index bf9199e8df63..c877bde71e62 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -100,6 +100,14 @@ xchk_dir_check_ftype( if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + /* + * Metadata and regular inodes cannot cross trees. This property + * cannot change without a full inode free and realloc cycle, so it's + * safe to check this without holding locks. + */ + if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(sc->ip)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } /* @@ -253,7 +261,7 @@ xchk_dir_actor( * If this is ".." in the root inode, check that the inum * matches this dir. */ - if (dp->i_ino == mp->m_sb.sb_rootino && ino != dp->i_ino) + if (xchk_inode_is_dirtree_root(dp) && ino != dp->i_ino) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c index 64679fe08446..249313882108 100644 --- a/fs/xfs/scrub/dir_repair.c +++ b/fs/xfs/scrub/dir_repair.c @@ -415,6 +415,12 @@ xrep_dir_salvage_entry( if (error) return 0; + /* Don't mix metadata and regular directory trees. */ + if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(rd->sc->ip)) { + xchk_irele(sc, ip); + return 0; + } + xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode); xchk_irele(sc, ip); @@ -1270,7 +1276,7 @@ xrep_dir_scan_dirtree( int error; /* Roots of directory trees are their own parents. */ - if (sc->ip == sc->mp->m_rootip) + if (xchk_inode_is_dirtree_root(sc->ip)) xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino); /* @@ -1632,6 +1638,7 @@ xrep_dir_swap( struct xrep_dir *rd) { struct xfs_scrub *sc = rd->sc; + xfs_ino_t ino; bool ip_local, temp_local; int error = 0; @@ -1649,14 +1656,17 @@ xrep_dir_swap( /* * Reset the temporary directory's '..' entry to point to the parent - * that we found. The temporary directory was created with the root - * directory as the parent, so we can skip this if repairing a - * subdirectory of the root. + * that we found. The dirent replace code asserts if the dirent + * already points at the new inumber, so we look it up here. * * It's also possible that this replacement could expand a sf * tempdir into block format. 
*/ - if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) { + error = xchk_dir_lookup(sc, rd->sc->tempip, &xfs_name_dotdot, &ino); + if (error) + return error; + + if (rd->pscan.parent_ino != ino) { error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot, rd->pscan.parent_ino, rd->tx.req.resblks); if (error) diff --git a/fs/xfs/scrub/dirtree.c b/fs/xfs/scrub/dirtree.c index bde58fb561ea..3a9cdf8738b6 100644 --- a/fs/xfs/scrub/dirtree.c +++ b/fs/xfs/scrub/dirtree.c @@ -362,7 +362,8 @@ xchk_dirpath_set_outcome( STATIC int xchk_dirpath_step_up( struct xchk_dirtree *dl, - struct xchk_dirpath *path) + struct xchk_dirpath *path, + bool is_metadir) { struct xfs_scrub *sc = dl->sc; struct xfs_inode *dp; @@ -435,6 +436,14 @@ xchk_dirpath_step_up( goto out_scanlock; } + /* Parent must be in the same directory tree. */ + if (is_metadir != xfs_is_metadir_inode(dp)) { + trace_xchk_dirpath_crosses_tree(dl->sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + error = -EFSCORRUPTED; + goto out_scanlock; + } + /* * If the extended attributes look as though they have been zapped by * the inode record repair code, we cannot scan for parent pointers. @@ -508,6 +517,7 @@ xchk_dirpath_walk_upwards( struct xchk_dirpath *path) { struct xfs_scrub *sc = dl->sc; + bool is_metadir; int error; ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); @@ -538,6 +548,7 @@ xchk_dirpath_walk_upwards( * ILOCK state is no longer tracked in the scrub context. Hence we * must drop @sc->ip's ILOCK during the walk. */ + is_metadir = xfs_is_metadir_inode(sc->ip); mutex_unlock(&dl->lock); xchk_iunlock(sc, XFS_ILOCK_EXCL); @@ -547,7 +558,7 @@ xchk_dirpath_walk_upwards( * If we see any kind of error here (including corruptions), the parent * pointer of @sc->ip is corrupt. Stop the whole scan. */ - error = xchk_dirpath_step_up(dl, path); + error = xchk_dirpath_step_up(dl, path, is_metadir); if (error) { xchk_ilock(sc, XFS_ILOCK_EXCL); mutex_lock(&dl->lock); @@ -560,7 +571,7 @@ xchk_dirpath_walk_upwards( * *somewhere* in the path, but we don't need to stop scanning. */ while (!error && path->outcome == XCHK_DIRPATH_SCANNING) - error = xchk_dirpath_step_up(dl, path, is_metadir); /* Retake the locks we had, mark paths, etc. */ xchk_ilock(sc, XFS_ILOCK_EXCL); @@ -917,7 +928,7 @@ xchk_dirtree( * scan, because the hook doesn't detach until after sc->ip gets * released during teardown. */ - dl->root_ino = sc->mp->m_rootip->i_ino; + dl->root_ino = xchk_inode_rootdir_inum(sc->ip); dl->scan_ino = sc->ip->i_ino; trace_xchk_dirtree_start(sc->ip, sc->sm, 0); @@ -983,3 +994,16 @@ out: trace_xchk_dirtree_done(sc->ip, sc->sm, error); return error; } + +/* Does the directory targeted by this scrub have no parents? 
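 *
 * Editor's note (not part of the patch): this predicate moved out of
 * line from dirtree.h (see the hunk just below) so that it can call
 * xchk_inode_is_dirtree_root(), which lives in scrub/common.c; with the
 * metadata directory tree there are now two roots that qualify. Both
 * tree roots and unlinked directories count as parentless, so a caller
 * can branch on it directly:
 *
 *	if (xchk_dirtree_parentless(dl))
 *		return handle_parentless_dir(dl);
 *
 * where handle_parentless_dir() is a placeholder for the caller's logic.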
*/ +bool +xchk_dirtree_parentless(const struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + + if (xchk_inode_is_dirtree_root(sc->ip)) + return true; + if (VFS_I(sc->ip)->i_nlink == 0) + return true; + return false; +} diff --git a/fs/xfs/scrub/dirtree.h b/fs/xfs/scrub/dirtree.h index 1e1686365c61..9e5d95492717 100644 --- a/fs/xfs/scrub/dirtree.h +++ b/fs/xfs/scrub/dirtree.h @@ -156,17 +156,7 @@ struct xchk_dirtree { #define xchk_dirtree_for_each_path(dl, path) \ list_for_each_entry((path), &(dl)->path_list, list) -static inline bool -xchk_dirtree_parentless(const struct xchk_dirtree *dl) -{ - struct xfs_scrub *sc = dl->sc; - - if (sc->ip == sc->mp->m_rootip) - return true; - if (VFS_I(sc->ip)->i_nlink == 0) - return true; - return false; -} +bool xchk_dirtree_parentless(const struct xchk_dirtree *dl); int xchk_dirtree_find_paths_to_root(struct xchk_dirtree *dl); int xchk_dirpath_append(struct xchk_dirtree *dl, struct xfs_inode *ip, diff --git a/fs/xfs/scrub/findparent.c b/fs/xfs/scrub/findparent.c index 01766041ba2c..84487072b6dd 100644 --- a/fs/xfs/scrub/findparent.c +++ b/fs/xfs/scrub/findparent.c @@ -172,6 +172,10 @@ xrep_findparent_walk_directory( */ lock_mode = xfs_ilock_data_map_shared(dp); + /* Don't mix metadata and regular directory trees. */ + if (xfs_is_metadir_inode(dp) != xfs_is_metadir_inode(sc->ip)) + goto out_unlock; + /* * If this directory is known to be sick, we cannot scan it reliably * and must abort. @@ -362,15 +366,24 @@ xrep_findparent_confirm( }; int error; - /* - * The root directory always points to itself. Unlinked dirs can point - * anywhere, so we point them at the root dir too. - */ - if (sc->ip == sc->mp->m_rootip || VFS_I(sc->ip)->i_nlink == 0) { + /* The root directory always points to itself. */ + if (sc->ip == sc->mp->m_rootip) { *parent_ino = sc->mp->m_sb.sb_rootino; return 0; } + /* The metadata root directory always points to itself. */ + if (sc->ip == sc->mp->m_metadirip) { + *parent_ino = sc->mp->m_sb.sb_metadirino; + return 0; + } + + /* Unlinked dirs can point anywhere; point them up to the root dir. */ + if (VFS_I(sc->ip)->i_nlink == 0) { + *parent_ino = xchk_inode_rootdir_inum(sc->ip); + return 0; + } + /* Reject garbage parent inode numbers and self-referential parents. 
*/ if (*parent_ino == NULLFSINO) return 0; @@ -412,8 +425,11 @@ xrep_findparent_self_reference( if (sc->ip->i_ino == sc->mp->m_sb.sb_rootino) return sc->mp->m_sb.sb_rootino; + if (sc->ip->i_ino == sc->mp->m_sb.sb_metadirino) + return sc->mp->m_sb.sb_metadirino; + if (VFS_I(sc->ip)->i_nlink == 0) - return sc->mp->m_sb.sb_rootino; + return xchk_inode_rootdir_inum(sc->ip); return NULLFSINO; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 1d3e98346933..9b598c5790ad 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -19,6 +19,7 @@ #include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_icache.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -74,10 +75,9 @@ xchk_fscount_warmup( struct xfs_buf *agi_bp = NULL; struct xfs_buf *agf_bp = NULL; struct xfs_perag *pag = NULL; - xfs_agnumber_t agno; int error = 0; - for_each_perag(mp, agno, pag) { + while ((pag = xfs_perag_next(mp, pag))) { if (xchk_should_terminate(sc, &error)) break; if (xfs_perag_initialised_agi(pag) && @@ -123,7 +123,7 @@ xchk_fsfreeze( { int error; - error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL); trace_xchk_fsfreeze(sc, error); return error; } @@ -135,7 +135,7 @@ xchk_fsthaw( int error; /* This should always succeed, we have a kernel freeze */ - error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL); trace_xchk_fsthaw(sc, error); return error; } @@ -261,7 +261,7 @@ xchk_fscount_btreeblks( struct xchk_fscounters *fsc, xfs_agnumber_t agno) { - xfs_extlen_t blocks; + xfs_filblks_t blocks; int error; error = xchk_ag_init_existing(sc, agno, &sc->sa); @@ -295,9 +295,8 @@ xchk_fscount_aggregate_agcounts( struct xchk_fscounters *fsc) { struct xfs_mount *mp = sc->mp; - struct xfs_perag *pag; + struct xfs_perag *pag = NULL; uint64_t delayed; - xfs_agnumber_t agno; int tries = 8; int error = 0; @@ -306,7 +305,7 @@ retry: fsc->ifree = 0; fsc->fdblocks = 0; - for_each_perag(mp, agno, pag) { + while ((pag = xfs_perag_next(mp, pag))) { if (xchk_should_terminate(sc, &error)) break; @@ -327,7 +326,7 @@ retry: if (xfs_has_lazysbcount(sc->mp)) { fsc->fdblocks += pag->pagf_btreeblks; } else { - error = xchk_fscount_btreeblks(sc, fsc, agno); + error = xchk_fscount_btreeblks(sc, fsc, pag_agno(pag)); if (error) break; } @@ -351,7 +350,7 @@ retry: * The global incore space reservation is taken from the incore * counters, so leave that out of the computation. */ - fsc->fdblocks -= mp->m_resblks_avail; + fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail; /* * Delayed allocation reservations are taken out of the incore counters @@ -388,7 +387,7 @@ retry: #ifdef CONFIG_XFS_RT STATIC int xchk_fscount_add_frextent( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) @@ -409,26 +408,34 @@ xchk_fscount_count_frextents( struct xchk_fscounters *fsc) { struct xfs_mount *mp = sc->mp; + struct xfs_rtgroup *rtg = NULL; int error; fsc->frextents = 0; fsc->frextents_delayed = 0; - if (!xfs_has_realtime(mp)) + + /* + * Don't bother verifying and repairing the fs counters for zoned file + * systems as they don't track an on-disk frextents count, and the + * in-memory percpu counter also includes reservations. 
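 *
 * Editor's note (not part of the patch): on rtgroups filesystems the
 * free-rtextent summation just below walks one group at a time, taking
 * only that group's bitmap lock, instead of the old single
 * xfs_rtbitmap_lock_shared()/unlock pair around one global query:
 *
 *	while ((rtg = xfs_rtgroup_next(mp, rtg))) {
 *		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
 *		error = xfs_rtalloc_query_all(rtg, sc->tp,
 *				xchk_fscount_add_frextent, fsc);
 *		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
 *		...
 *	}
 *
 * Note the matching signature change: xfs_rtalloc_query_all() now takes
 * the rtgroup rather than the mount.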
+ */ + if (!xfs_has_realtime(mp) || xfs_has_zoned(mp)) return 0; - xfs_rtbitmap_lock_shared(sc->mp, XFS_RBMLOCK_BITMAP); - error = xfs_rtalloc_query_all(sc->mp, sc->tp, - xchk_fscount_add_frextent, fsc); - if (error) { - xchk_set_incomplete(sc); - goto out_unlock; + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + error = xfs_rtalloc_query_all(rtg, sc->tp, + xchk_fscount_add_frextent, fsc); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + if (error) { + xchk_set_incomplete(sc); + xfs_rtgroup_rele(rtg); + return error; + } } fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents); - -out_unlock: - xfs_rtbitmap_unlock_shared(sc->mp, XFS_RBMLOCK_BITMAP); - return error; + return 0; } #else STATIC int @@ -512,8 +519,8 @@ xchk_fscounters( /* Snapshot the percpu counters. */ icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); - fdblocks = percpu_counter_sum(&mp->m_fdblocks); - frextents = percpu_counter_sum(&mp->m_frextents); + fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); + frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS); /* No negative values, please! */ if (icount < 0 || ifree < 0) @@ -588,15 +595,17 @@ xchk_fscounters( try_again = true; } - if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, - fsc->fdblocks)) { + if (!xchk_fscount_within_range(sc, fdblocks, + &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) { if (fsc->frozen) xchk_set_corrupt(sc); else try_again = true; } - if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, + if (!xfs_has_zoned(mp) && + !xchk_fscount_within_range(sc, frextents, + &mp->m_free[XC_FREE_RTEXTENTS].count, fsc->frextents - fsc->frextents_delayed)) { if (fsc->frozen) xchk_set_corrupt(sc); diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c index 469bf645dbea..f0d2b04644e4 100644 --- a/fs/xfs/scrub/fscounters_repair.c +++ b/fs/xfs/scrub/fscounters_repair.c @@ -64,19 +64,22 @@ xrep_fscounters( percpu_counter_set(&mp->m_icount, fsc->icount); percpu_counter_set(&mp->m_ifree, fsc->ifree); - percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); + xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks); /* * Online repair is only supported on v5 file systems, which require - * lazy sb counters and thus no update of sb_fdblocks here. But as of - * now we don't support lazy counting sb_frextents yet, and thus need - * to also update it directly here. And for that we need to keep + * lazy sb counters and thus no update of sb_fdblocks here. But + * sb_frextents only uses a lazy counter with rtgroups, and thus needs + * to be updated directly here otherwise. And for that we need to keep * track of the delalloc reservations separately, as they are * subtracted from m_frextents, but not included in sb_frextents. 
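 *
 * Editor's note (not part of the patch): the invariant being restored
 * here can be written as
 *
 *	incore XC_FREE_RTEXTENTS = frextents - frextents_delayed
 *
 * so if repair counted 100 free rtextents on disk while 10 are held by
 * delalloc reservations, the incore counter is set to 90, and on
 * pre-rtgroups filesystems sb_frextents is still written as 100.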
*/ - percpu_counter_set(&mp->m_frextents, - fsc->frextents - fsc->frextents_delayed); - mp->m_sb.sb_frextents = fsc->frextents; + if (!xfs_has_zoned(mp)) { + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + fsc->frextents - fsc->frextents_delayed); + if (!xfs_has_rtgroups(mp)) + mp->m_sb.sb_frextents = fsc->frextents; + } return 0; } diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index b712a8bd34f5..3c0f25098b69 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -12,6 +12,7 @@ #include "xfs_btree.h" #include "xfs_ag.h" #include "xfs_health.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/health.h" #include "scrub/common.h" @@ -70,10 +71,11 @@ /* Map our scrub type to a sick mask and a set of health update functions. */ enum xchk_health_group { - XHG_FS = 1, - XHG_RT, + XHG_NONE = 1, + XHG_FS, XHG_AG, XHG_INO, + XHG_RTGROUP, }; struct xchk_health_map { @@ -82,6 +84,7 @@ struct xchk_health_map { }; static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_PROBE] = { XHG_NONE, 0 }, [XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB }, [XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF }, [XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL }, @@ -100,8 +103,8 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_XATTR] = { XHG_INO, XFS_SICK_INO_XATTR }, [XFS_SCRUB_TYPE_SYMLINK] = { XHG_INO, XFS_SICK_INO_SYMLINK }, [XFS_SCRUB_TYPE_PARENT] = { XHG_INO, XFS_SICK_INO_PARENT }, - [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RT, XFS_SICK_RT_BITMAP }, - [XFS_SCRUB_TYPE_RTSUM] = { XHG_RT, XFS_SICK_RT_SUMMARY }, + [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RTGROUP, XFS_SICK_RG_BITMAP }, + [XFS_SCRUB_TYPE_RTSUM] = { XHG_RTGROUP, XFS_SICK_RG_SUMMARY }, [XFS_SCRUB_TYPE_UQUOTA] = { XHG_FS, XFS_SICK_FS_UQUOTA }, [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA }, [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA }, @@ -109,6 +112,10 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK }, [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS }, [XFS_SCRUB_TYPE_DIRTREE] = { XHG_INO, XFS_SICK_INO_DIRTREE }, + [XFS_SCRUB_TYPE_METAPATH] = { XHG_FS, XFS_SICK_FS_METAPATH }, + [XFS_SCRUB_TYPE_RGSUPER] = { XHG_RTGROUP, XFS_SICK_RG_SUPER }, + [XFS_SCRUB_TYPE_RTRMAPBT] = { XHG_RTGROUP, XFS_SICK_RG_RMAPBT }, + [XFS_SCRUB_TYPE_RTREFCBT] = { XHG_RTGROUP, XFS_SICK_RG_REFCNTBT }, }; /* Return the health status mask for this scrub type. 
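 *
 * Editor's note (not part of the patch): type_to_health_flag[] above is
 * indexed by scrub type, so the helpers below reduce to a table lookup,
 * roughly:
 *
 *	const struct xchk_health_map *map =
 *			&type_to_health_flag[sc->sm->sm_type];
 *
 * For example, XFS_SCRUB_TYPE_RTRMAPBT resolves to group XHG_RTGROUP
 * with sick mask XFS_SICK_RG_RMAPBT, which xchk_update_health() later
 * routes to xfs_group_mark_corrupt() or xfs_group_mark_healthy() on the
 * rtgroup.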
*/ @@ -130,7 +137,7 @@ xchk_mark_healthy_if_clean( { if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_XCORRUPT))) - sc->sick_mask |= mask; + sc->healthy_mask |= mask; } /* @@ -160,13 +167,14 @@ STATIC void xchk_mark_all_healthy( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT); - xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT); - for_each_perag(mp, agno, pag) - xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT); + while ((pag = xfs_perag_next(mp, pag))) + xfs_group_mark_healthy(pag_group(pag), XFS_SICK_AG_INDIRECT); + while ((rtg = xfs_rtgroup_next(mp, rtg))) + xfs_group_mark_healthy(rtg_group(rtg), XFS_SICK_RG_INDIRECT); } /* @@ -184,6 +192,8 @@ xchk_update_health( struct xfs_scrub *sc) { struct xfs_perag *pag; + struct xfs_rtgroup *rtg; + unsigned int mask = sc->sick_mask; bool bad; /* @@ -198,49 +208,57 @@ xchk_update_health( return; } - if (!sc->sick_mask) - return; - bad = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_XCORRUPT)); + if (!bad) + mask |= sc->healthy_mask; switch (type_to_health_flag[sc->sm->sm_type].group) { + case XHG_NONE: + break; case XHG_AG: + if (!mask) + return; pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_ag_mark_corrupt(pag, sc->sick_mask); + xfs_group_mark_corrupt(pag_group(pag), mask); else - xfs_ag_mark_healthy(pag, sc->sick_mask); + xfs_group_mark_healthy(pag_group(pag), mask); xfs_perag_put(pag); break; case XHG_INO: if (!sc->ip) return; - if (bad) { - unsigned int mask = sc->sick_mask; - - /* - * If we're coming in for repairs then we don't want - * sickness flags to propagate to the incore health - * status if the inode gets inactivated before we can - * fix it. - */ - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) - mask |= XFS_SICK_INO_FORGET; + /* + * If we're coming in for repairs then we don't want sickness + * flags to propagate to the incore health status if the inode + * gets inactivated before we can fix it. 
+ */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mask |= XFS_SICK_INO_FORGET; + if (!mask) + return; + if (bad) xfs_inode_mark_corrupt(sc->ip, mask); - } else - xfs_inode_mark_healthy(sc->ip, sc->sick_mask); + else + xfs_inode_mark_healthy(sc->ip, mask); break; case XHG_FS: + if (!mask) + return; if (bad) - xfs_fs_mark_corrupt(sc->mp, sc->sick_mask); + xfs_fs_mark_corrupt(sc->mp, mask); else - xfs_fs_mark_healthy(sc->mp, sc->sick_mask); + xfs_fs_mark_healthy(sc->mp, mask); break; - case XHG_RT: + case XHG_RTGROUP: + if (!mask) + return; + rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_rt_mark_corrupt(sc->mp, sc->sick_mask); + xfs_group_mark_corrupt(rtg_group(rtg), mask); else - xfs_rt_mark_healthy(sc->mp, sc->sick_mask); + xfs_group_mark_healthy(rtg_group(rtg), mask); + xfs_rtgroup_put(rtg); break; default: ASSERT(0); @@ -277,7 +295,7 @@ xchk_ag_btree_del_cursor_if_sick( type_to_health_flag[sc->sm->sm_type].group == XHG_AG) mask &= ~sc->sick_mask; - if (xfs_ag_has_sickness((*curp)->bc_ag.pag, mask)) { + if (xfs_group_has_sickness((*curp)->bc_group, mask)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; xfs_btree_del_cursor(*curp, XFS_BTREE_NOERROR); *curp = NULL; @@ -294,9 +312,8 @@ xchk_health_record( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; - struct xfs_perag *pag; - xfs_agnumber_t agno; - + struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; unsigned int sick; unsigned int checked; @@ -304,15 +321,17 @@ xchk_health_record( if (sick & XFS_SICK_FS_PRIMARY) xchk_set_corrupt(sc); - xfs_rt_measure_sickness(mp, &sick, &checked); - if (sick & XFS_SICK_RT_PRIMARY) - xchk_set_corrupt(sc); - - for_each_perag(mp, agno, pag) { - xfs_ag_measure_sickness(pag, &sick, &checked); + while ((pag = xfs_perag_next(mp, pag))) { + xfs_group_measure_sickness(pag_group(pag), &sick, &checked); if (sick & XFS_SICK_AG_PRIMARY) xchk_set_corrupt(sc); } + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + xfs_group_measure_sickness(rtg_group(rtg), &sick, &checked); + if (sick & XFS_SICK_RG_PRIMARY) + xchk_set_corrupt(sc); + } + return 0; } diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 750d7b0cd25a..4dc7c83dc08a 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -258,7 +258,7 @@ xchk_iallocbt_chunk( { struct xfs_scrub *sc = bs->sc; struct xfs_mount *mp = bs->cur->bc_mp; - struct xfs_perag *pag = bs->cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(bs->cur->bc_group); xfs_agblock_t agbno; xfs_extlen_t len; @@ -303,7 +303,6 @@ xchk_iallocbt_check_cluster_ifree( unsigned int irec_ino, struct xfs_dinode *dip) { - struct xfs_mount *mp = bs->cur->bc_mp; xfs_ino_t fsino; xfs_agino_t agino; bool irec_free; @@ -319,7 +318,7 @@ xchk_iallocbt_check_cluster_ifree( * the record, compute which fs inode we're talking about. */ agino = irec->ir_startino + irec_ino; - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.pag->pag_agno, agino); + fsino = xfs_agino_to_ino(to_perag(bs->cur->bc_group), agino); irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || @@ -368,7 +367,6 @@ xchk_iallocbt_check_cluster( struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_buf *cluster_bp; unsigned int nr_inodes; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agblock_t agbno; unsigned int cluster_index; uint16_t cluster_mask = 0; @@ -396,7 +394,7 @@ xchk_iallocbt_check_cluster( * ir_startino can be large enough to make im_boffset nonzero. 
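 *
 * Editor's note (not part of the patch): as a worked example of the
 * warning above, with 512-byte inodes (sb_inodelog == 9) a cluster whose
 * ir_startino lands eight inodes into a block gives
 *
 *	im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino)
 *			<< mp->m_sb.sb_inodelog
 *		   = 8 << 9 = 4096 bytes
 *
 * into the cluster buffer, so im_boffset is indeed nonzero.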
*/ ir_holemask = (irec->ir_holemask & cluster_mask); - imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap.im_blkno = xfs_agbno_to_daddr(to_perag(bs->cur->bc_group), agbno); imap.im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino) << mp->m_sb.sb_inodelog; @@ -407,9 +405,9 @@ xchk_iallocbt_check_cluster( return 0; } - trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino, - imap.im_blkno, imap.im_len, cluster_base, nr_inodes, - cluster_mask, ir_holemask, + trace_xchk_iallocbt_check_cluster(to_perag(bs->cur->bc_group), + irec->ir_startino, imap.im_blkno, imap.im_len, + cluster_base, nr_inodes, cluster_mask, ir_holemask, XFS_INO_TO_OFFSET(mp, irec->ir_startino + cluster_base)); @@ -585,7 +583,7 @@ xchk_iallocbt_rec( uint16_t holemask; xfs_inobt_btrec_to_irec(mp, rec, &irec); - if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { + if (xfs_inobt_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -652,8 +650,8 @@ xchk_iallocbt_xref_rmap_btreeblks( struct xfs_scrub *sc) { xfs_filblks_t blocks; - xfs_extlen_t inobt_blocks = 0; - xfs_extlen_t finobt_blocks = 0; + xfs_filblks_t inobt_blocks = 0; + xfs_filblks_t finobt_blocks = 0; int error; if (!sc->sa.ino_cur || !sc->sa.rmap_cur || diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c index c8d2196a04e1..14e48d3f1912 100644 --- a/fs/xfs/scrub/ialloc_repair.c +++ b/fs/xfs/scrub/ialloc_repair.c @@ -146,15 +146,12 @@ xrep_ibt_check_ifree( struct xfs_scrub *sc = ri->sc; struct xfs_mount *mp = sc->mp; struct xfs_dinode *dip; - xfs_ino_t fsino; xfs_agino_t agino; - xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno; unsigned int cluster_buf_base; unsigned int offset; int error; agino = cluster_ag_base + cluster_index; - fsino = XFS_AGINO_TO_INO(mp, agno, agino); /* Inode uncached or half assembled, read disk buffer */ cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base); @@ -165,7 +162,8 @@ xrep_ibt_check_ifree( if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) return -EFSCORRUPTED; - if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino) + if (dip->di_version >= 3 && + be64_to_cpu(dip->di_ino) != xfs_agino_to_ino(ri->sc->sa.pag, agino)) return -EFSCORRUPTED; /* Will the in-core inode tell us if it's in use? */ @@ -194,7 +192,7 @@ xrep_ibt_stash( if (ri->rie.ir_freecount > 0) ri->finobt_recs++; - trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie); + trace_xrep_ibt_found(ri->sc->sa.pag, &ri->rie); error = xfarray_append(ri->inode_records, &ri->rie); if (error) @@ -307,7 +305,7 @@ xrep_ibt_process_cluster( * inobt because imap_to_bp directly maps the buffer without touching * either inode btree. 
*/ - imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno); + imap.im_blkno = xfs_agbno_to_daddr(sc->sa.pag, cluster_bno); imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); imap.im_boffset = 0; error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp); @@ -423,9 +421,7 @@ xrep_ibt_record_inode_blocks( if (error) return error; - trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno, - rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, - rec->rm_offset, rec->rm_flags); + trace_xrep_ibt_walk_rmap(ri->sc->sa.pag, rec); /* * Record the free/hole masks for each inode cluster that could be @@ -634,7 +630,6 @@ xrep_ibt_build_new_trees( struct xfs_scrub *sc = ri->sc; struct xfs_btree_cur *ino_cur; struct xfs_btree_cur *fino_cur = NULL; - xfs_fsblock_t fsbno; bool need_finobt; int error; @@ -656,9 +651,8 @@ xrep_ibt_build_new_trees( * * Start by setting up the inobt staging cursor. */ - fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, - XFS_IBT_BLOCK(sc->mp)); - xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno, + xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, + xfs_agbno_to_fsb(sc->sa.pag, XFS_IBT_BLOCK(sc->mp)), XFS_AG_RESV_NONE); ri->new_inobt.bload.claim_block = xrep_ibt_claim_block; ri->new_inobt.bload.get_records = xrep_ibt_get_records; @@ -677,10 +671,9 @@ xrep_ibt_build_new_trees( if (sc->mp->m_finobt_nores) resv = XFS_AG_RESV_NONE; - fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, - XFS_FIBT_BLOCK(sc->mp)); xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT, - fsbno, resv); + xfs_agbno_to_fsb(sc->sa.pag, XFS_FIBT_BLOCK(sc->mp)), + resv); ri->new_finobt.bload.claim_block = xrep_fibt_claim_block; ri->new_finobt.bload.get_records = xrep_fibt_get_records; @@ -821,7 +814,7 @@ xrep_iallocbt( sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT; /* Set up enough storage to handle an AG with nothing but inodes. */ - xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino); + xfs_agino_range(mp, pag_agno(sc->sa.pag), &first_agino, &last_agino); last_agino /= XFS_INODES_PER_CHUNK; descr = xchk_xfile_ag_descr(sc, "inode index records"); error = xfarray_create(descr, last_agino, diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index d32716fb2fec..bb3f475b6353 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -60,6 +60,22 @@ xchk_install_handle_iscrub( if (error) return error; + /* + * Don't allow scrubbing by handle of any non-directory inode records + * in the metadata directory tree. We don't know if any of the scans + * launched by this scrubber will end up indirectly trying to lock this + * file. + * + * Scrubbers of inode-rooted metadata files (e.g. quota files) will + * attach all the resources needed to scrub the inode and call + * xchk_inode directly. Userspace cannot call this directly. + */ + if (xfs_is_metadir_inode(ip) && !S_ISDIR(VFS_I(ip)->i_mode)) { + xchk_irele(sc, ip); + sc->ip = NULL; + return -ENOENT; + } + return xchk_prepare_iscrub(sc); } @@ -94,9 +110,15 @@ xchk_setup_inode( return xchk_prepare_iscrub(sc); } - /* Reject internal metadata files and obviously bad inode numbers. */ - if (xfs_internal_inum(mp, sc->sm->sm_ino)) + /* + * On pre-metadir filesystems, reject internal metadata files. For + * metadir filesystems, limited scrubbing of any file in the metadata + * directory tree by handle is allowed, because that is the only way to + * validate the lack of parent pointers in the sb-root metadata inodes. 
+ */ + if (!xfs_has_metadir(mp) && xfs_is_sb_inum(mp, sc->sm->sm_ino)) return -ENOENT; + /* Reject obviously bad inode numbers. */ if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; @@ -238,12 +260,7 @@ xchk_inode_extsize( xchk_ino_set_warning(sc, ino); } -/* - * Validate di_cowextsize hint. - * - * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). - * These functions must be kept in sync with each other. - */ +/* Validate di_cowextsize hint. */ STATIC void xchk_inode_cowextsize( struct xfs_scrub *sc, @@ -254,12 +271,32 @@ xchk_inode_cowextsize( uint64_t flags2) { xfs_failaddr_t fa; + uint32_t value = be32_to_cpu(dip->di_cowextsize); + + /* + * The used block counter for rtrmap is checked and repaired elsewhere. + */ + if (xfs_has_zoned(sc->mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) + return; - fa = xfs_inode_validate_cowextsize(sc->mp, - be32_to_cpu(dip->di_cowextsize), mode, flags, - flags2); + fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2); if (fa) xchk_ino_set_corrupt(sc, ino); + + /* + * XFS allows a sysadmin to change the rt extent size when adding a rt + * section to a filesystem after formatting. If there are any + * directories with cowextsize and rtinherit set, the hint could become + * misaligned with the new rextsize. The verifier doesn't check this, + * because we allow rtinherit directories even without an rt device. + * Flag this as an administrative warning since we will clean this up + * eventually. + */ + if ((flags & XFS_DIFLAG_RTINHERIT) && + (flags2 & XFS_DIFLAG2_COWEXTSIZE) && + value % sc->mp->m_sb.sb_rextsize > 0) + xchk_ino_set_warning(sc, ino); } /* Make sure the di_flags make sense for the inode. */ @@ -338,8 +375,9 @@ xchk_inode_flags2( if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode)) goto bad; - /* realtime and reflink make no sense, currently */ - if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK)) + /* realtime and reflink don't always go together */ + if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK) && + !xfs_has_rtreflink(mp)) goto bad; /* no bigtime iflag without the bigtime feature */ @@ -421,8 +459,13 @@ xchk_dinode( break; case 2: case 3: - if (dip->di_onlink != 0) - xchk_ino_set_corrupt(sc, ino); + if (xfs_dinode_is_metadir(dip)) { + if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX) + xchk_ino_set_corrupt(sc, ino); + } else { + if (dip->di_metatype != 0) + xchk_ino_set_corrupt(sc, ino); + } if (dip->di_mode == 0 && sc->ip) xchk_ino_set_corrupt(sc, ino); @@ -475,6 +518,10 @@ xchk_dinode( if (!S_ISREG(mode) && !S_ISDIR(mode)) xchk_ino_set_corrupt(sc, ino); break; + case XFS_DINODE_FMT_META_BTREE: + if (!S_ISREG(mode)) + xchk_ino_set_corrupt(sc, ino); + break; case XFS_DINODE_FMT_UUID: default: xchk_ino_set_corrupt(sc, ino); @@ -659,15 +706,13 @@ xchk_inode_xref_bmap( return; /* Walk all the extents to check nextents/naextents/nblocks. 
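For the administrative warning added in xchk_inode_cowextsize above: the hint was aligned when it was set, but growing the rt extent size afterwards can leave a directory carrying rtinherit plus a CoW extent size hint that no longer divides evenly. A toy model of the predicate, assuming both quantities are expressed in filesystem blocks:

/* Sketch of the misaligned-hint predicate checked above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool cowextsize_misaligned(uint32_t cowextsize_fsb, uint32_t rextsize_fsb)
{
	/* A zero hint means "no hint" and is never misaligned. */
	if (cowextsize_fsb == 0)
		return false;
	return cowextsize_fsb % rextsize_fsb != 0;
}

int main(void)
{
	/* rextsize grew from 1 to 4 blocks after the hint was set. */
	printf("hint 10, rextsize 4 -> %s\n",
	       cowextsize_misaligned(10, 4) ? "warn" : "ok");
	printf("hint 12, rextsize 4 -> %s\n",
	       cowextsize_misaligned(12, 4) ? "warn" : "ok");
	return 0;
}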
*/ - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, - &nextents, &count); + error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count); if (!xchk_should_check_xref(sc, &error, NULL)) return; if (nextents < xfs_dfork_data_extents(dip)) xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, - &nextents, &acount); + error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents, &acount); if (!xchk_should_check_xref(sc, &error, NULL)) return; if (nextents != xfs_dfork_attr_extents(dip)) diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 3e45b9b72312..a90a011c7e5f 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -38,6 +38,9 @@ #include "xfs_log_priv.h" #include "xfs_health.h" #include "xfs_symlink_remote.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -521,10 +524,17 @@ STATIC void xrep_dinode_nlinks( struct xfs_dinode *dip) { - if (dip->di_version > 1) - dip->di_onlink = 0; - else + if (dip->di_version < 2) { dip->di_nlink = 0; + return; + } + + if (xfs_dinode_is_metadir(dip)) { + if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX) + dip->di_metatype = cpu_to_be16(XFS_METAFILE_UNKNOWN); + } else { + dip->di_metatype = 0; + } } /* Fix any conflicting flags that the verifiers complain about. */ @@ -555,8 +565,6 @@ xrep_dinode_flags( flags2 |= XFS_DIFLAG2_REFLINK; else flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); - if (flags & XFS_DIFLAG_REALTIME) - flags2 &= ~XFS_DIFLAG2_REFLINK; if (!xfs_has_bigtime(mp)) flags2 &= ~XFS_DIFLAG2_BIGTIME; if (!xfs_has_large_extent_counts(mp)) @@ -565,6 +573,16 @@ xrep_dinode_flags( dip->di_nrext64_pad = 0; else if (dip->di_version >= 3) dip->di_v3_pad = 0; + + if (flags2 & XFS_DIFLAG2_METADATA) { + xfs_failaddr_t fa; + + fa = xfs_dinode_verify_metadir(sc->mp, dip, mode, flags, + flags2); + if (fa) + flags2 &= ~XFS_DIFLAG2_METADATA; + } + dip->di_flags = cpu_to_be16(flags); dip->di_flags2 = cpu_to_be64(flags2); } @@ -692,7 +710,9 @@ xrep_dinode_extsize_hints( XFS_DIFLAG_EXTSZINHERIT); } - if (dip->di_version < 3) + if (dip->di_version < 3 || + (xfs_has_zoned(sc->mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))) return; fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), @@ -756,19 +776,72 @@ xrep_dinode_count_ag_rmaps( return error; } +/* Count extents and blocks for an inode given an rt rmap. */ +STATIC int +xrep_dinode_walk_rtrmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_inode *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + /* We only care about this inode. */ + if (rec->rm_owner != ri->sc->sm->sm_ino) + return 0; + + if (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) + return -EFSCORRUPTED; + + ri->rt_blocks += rec->rm_blockcount; + ri->rt_extents++; + return 0; +} + +/* Count extents and blocks for an inode from all realtime rmap data. 
*/ +STATIC int +xrep_dinode_count_rtgroup_rmaps( + struct xrep_inode *ri, + struct xfs_rtgroup *rtg) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + error = xrep_rtgroup_init(sc, rtg, &sc->sr, XFS_RTGLOCK_RMAP); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_dinode_walk_rtrmap, + ri); + xchk_rtgroup_btcur_free(&sc->sr); + xchk_rtgroup_free(sc, &sc->sr); + return error; +} + /* Count extents and blocks for a given inode from all rmap data. */ STATIC int xrep_dinode_count_rmaps( struct xrep_inode *ri) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; int error; - if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) + if (!xfs_has_rmapbt(ri->sc->mp)) return -EOPNOTSUPP; - for_each_perag(ri->sc->mp, agno, pag) { + while ((rtg = xfs_rtgroup_next(ri->sc->mp, rtg))) { + error = xrep_dinode_count_rtgroup_rmaps(ri, rtg); + if (error) { + xfs_rtgroup_rele(rtg); + return error; + } + } + + while ((pag = xfs_perag_next(ri->sc->mp, pag))) { error = xrep_dinode_count_ag_rmaps(ri, pag); if (error) { xfs_perag_rele(pag); @@ -872,6 +945,85 @@ xrep_dinode_bad_bmbt_fork( return false; } +/* Return true if this rmap-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_rtrmapbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size) +{ + struct xfs_rtrmap_root *dfp; + unsigned int nrecs; + unsigned int level; + + if (dfork_size < sizeof(struct xfs_rtrmap_root)) + return true; + + dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > sc->mp->m_rtrmap_maxlevels) + return true; + if (xfs_rtrmap_droot_space_calc(level, nrecs) > dfork_size) + return true; + if (level > 0 && nrecs == 0) + return true; + + return false; +} + +/* Return true if this refcount-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_rtrefcountbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size) +{ + struct xfs_rtrefcount_root *dfp; + unsigned int nrecs; + unsigned int level; + + if (dfork_size < sizeof(struct xfs_rtrefcount_root)) + return true; + + dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > sc->mp->m_rtrefc_maxlevels) + return true; + if (xfs_rtrefcount_droot_space_calc(level, nrecs) > dfork_size) + return true; + if (level > 0 && nrecs == 0) + return true; + + return false; +} + +/* Check a metadata-btree fork. */ +STATIC bool +xrep_dinode_bad_metabt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + if (whichfork != XFS_DATA_FORK) + return true; + + switch (be16_to_cpu(dip->di_metatype)) { + case XFS_METAFILE_RTRMAP: + return xrep_dinode_bad_rtrmapbt_fork(sc, dip, dfork_size); + case XFS_METAFILE_RTREFCOUNT: + return xrep_dinode_bad_rtrefcountbt_fork(sc, dip, dfork_size); + default: + return true; + } + + return false; +} + /* * Check the data fork for things that will fail the ifork verifiers or the * ifork formatters. 
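xrep_dinode_count_rmaps above leans on the xfs_rtgroup_next/xfs_perag_next iterator contract: each call drops the reference taken on the previous group and takes one on the next, so a clean walk needs no explicit puts, but an early exit must release the reference still held. A reduced model of that contract (types and names are stand-ins):

/* Sketch of a get/put group iterator like xfs_perag_next. */
#include <stdio.h>

struct group { int id; int refs; };

static struct group groups[3] = { {0, 0}, {1, 0}, {2, 0} };

static struct group *group_next(struct group *prev)
{
	int next = prev ? prev->id + 1 : 0;

	if (prev)
		prev->refs--;	/* iterator drops the old reference */
	if (next >= 3)
		return NULL;
	groups[next].refs++;	/* ...and takes the new one */
	return &groups[next];
}

static int visit(struct group *g) { return g->id == 2 ? -1 : 0; }

int main(void)
{
	struct group *g = NULL;

	while ((g = group_next(g))) {
		if (visit(g)) {
			g->refs--;	/* early exit: put the live ref */
			printf("aborted at group %d\n", g->id);
			return 1;
		}
	}
	printf("walked all groups cleanly\n");
	return 0;
}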
@@ -905,9 +1057,17 @@ xrep_dinode_check_dfork( return true; break; case S_IFREG: - if (fmt == XFS_DINODE_FMT_LOCAL) + switch (fmt) { + case XFS_DINODE_FMT_LOCAL: return true; - fallthrough; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_META_BTREE: + break; + default: + return true; + } + break; case S_IFLNK: case S_IFDIR: switch (fmt) { @@ -952,6 +1112,11 @@ xrep_dinode_check_dfork( XFS_DATA_FORK)) return true; break; + case XFS_DINODE_FMT_META_BTREE: + if (xrep_dinode_bad_metabt_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; default: return true; } @@ -1072,6 +1237,11 @@ xrep_dinode_check_afork( XFS_ATTR_FORK)) return true; break; + case XFS_DINODE_FMT_META_BTREE: + if (xrep_dinode_bad_metabt_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; default: return true; } @@ -1119,6 +1289,8 @@ xrep_dinode_ensure_forkoff( uint16_t mode) { struct xfs_bmdr_block *bmdr; + struct xfs_rtrmap_root *rmdr; + struct xfs_rtrefcount_root *rcdr; struct xfs_scrub *sc = ri->sc; xfs_extnum_t attr_extents, data_extents; size_t bmdr_minsz = xfs_bmdr_space_calc(1); @@ -1225,6 +1397,21 @@ xrep_dinode_ensure_forkoff( bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); dfork_min = xfs_bmap_broot_space(sc->mp, bmdr); break; + case XFS_DINODE_FMT_META_BTREE: + switch (be16_to_cpu(dip->di_metatype)) { + case XFS_METAFILE_RTRMAP: + rmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + dfork_min = xfs_rtrmap_broot_space(sc->mp, rmdr); + break; + case XFS_METAFILE_RTREFCOUNT: + rcdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + dfork_min = xfs_rtrefcount_broot_space(sc->mp, rcdr); + break; + default: + dfork_min = 0; + break; + } + break; default: dfork_min = 0; break; @@ -1373,8 +1560,7 @@ xrep_dinode_core( /* Read the inode cluster buffer. */ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, - ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, - NULL); + ri->imap.im_blkno, ri->imap.im_len, 0, &bp, NULL); if (error) return error; @@ -1484,8 +1670,7 @@ xrep_inode_blockcounts( trace_xrep_inode_blockcounts(sc); /* Set data fork counters from the data fork mappings. */ - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, - &nextents, &count); + error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count); if (error) return error; if (xfs_is_reflink_inode(sc->ip)) { @@ -1509,8 +1694,8 @@ xrep_inode_blockcounts( /* Set attr fork counters from the attr fork mappings. */ ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); if (ifp) { - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, - &nextents, &acount); + error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents, + &acount); if (error) return error; if (count >= sc->mp->m_sb.sb_dblocks) @@ -1648,10 +1833,6 @@ xrep_inode_flags( /* DAX only applies to files and dirs. */ if (!(S_ISREG(mode) || S_ISDIR(mode))) sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; - - /* No reflink files on the realtime device. */ - if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) - sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; } /* @@ -1755,15 +1936,8 @@ xrep_inode_pptr( if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) return 0; - /* The root directory doesn't have a parent pointer. */ - if (ip == mp->m_rootip) - return 0; - - /* - * Metadata inodes are rooted in the superblock and do not have any - * parents. - */ - if (xfs_is_metadata_inode(ip)) + /* Children of the superblock do not have parent pointers. 
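The META_BTREE checks above all bound the same three things before trusting an inode-rooted btree: the claimed level, the record count, and whether a root of that shape even fits in the data fork. The space calculation follows the usual droot layout, a tiny header followed by leaf records at level 0 or key/pointer pairs above it. A sketch with made-up record sizes:

/* Sketch of an inode-rooted btree root size check, a la *_droot_space_calc. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define HDR_SIZE	4u	/* bb_level + bb_numrecs, both __be16 */
#define REC_SIZE	24u	/* illustrative leaf record size */
#define KEY_SIZE	16u	/* illustrative key size */
#define PTR_SIZE	8u	/* 64-bit block pointer */

static size_t droot_space(unsigned int level, unsigned int nrecs)
{
	if (level > 0)
		return HDR_SIZE + nrecs * (KEY_SIZE + PTR_SIZE);
	return HDR_SIZE + nrecs * REC_SIZE;
}

/* Mirror of the "looks like garbage" tests in the repair hunks above. */
static bool root_is_garbage(unsigned int level, unsigned int nrecs,
			    unsigned int maxlevels, size_t dfork_size)
{
	if (level > maxlevels)
		return true;
	if (droot_space(level, nrecs) > dfork_size)
		return true;
	if (level > 0 && nrecs == 0)
		return true;
	return false;
}

int main(void)
{
	printf("%d\n", root_is_garbage(1, 0, 9, 512));	/* 1: no children */
	printf("%d\n", root_is_garbage(0, 10, 9, 128));	/* 1: overflows fork */
	printf("%d\n", root_is_garbage(0, 4, 9, 128));	/* 0: plausible */
	return 0;
}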
*/ + if (xchk_inode_is_sb_rooted(ip)) return 0; /* Inode already has an attr fork; no further work possible here. */ @@ -1774,6 +1948,20 @@ xrep_inode_pptr( sizeof(struct xfs_attr_sf_hdr), true); } +/* Fix COW extent size hint problems. */ +STATIC void +xrep_inode_cowextsize( + struct xfs_scrub *sc) +{ + /* Fix misaligned CoW extent size hints on a directory. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (sc->ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + sc->ip->i_extsize % sc->mp->m_sb.sb_rextsize > 0) { + sc->ip->i_cowextsize = 0; + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + } +} + /* Fix any irregularities in an inode that the verifiers don't catch. */ STATIC int xrep_inode_problems( @@ -1797,6 +1985,7 @@ xrep_inode_problems( if (S_ISDIR(VFS_I(sc->ip)->i_mode)) xrep_inode_dir_size(sc); xrep_inode_extsize(sc); + xrep_inode_cowextsize(sc); trace_xrep_inode_fixed(sc); xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c index cf9d983667ce..84f117667ca2 100644 --- a/fs/xfs/scrub/iscan.c +++ b/fs/xfs/scrub/iscan.c @@ -67,7 +67,7 @@ xchk_iscan_mask_skipino( xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino); xfs_agnumber_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino); - if (pag->pag_agno != skip_agno) + if (pag_agno(pag) != skip_agno) return; if (skip_agino < rec->ir_startino) return; @@ -95,7 +95,7 @@ xchk_iscan_find_next( struct xfs_btree_cur *cur; struct xfs_mount *mp = sc->mp; struct xfs_trans *tp = sc->tp; - xfs_agnumber_t agno = pag->pag_agno; + xfs_agnumber_t agno = pag_agno(pag); xfs_agino_t lastino = NULLAGINO; xfs_agino_t first, last; xfs_agino_t agino = *cursor; diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c new file mode 100644 index 000000000000..e21c16fbd15d --- /dev/null +++ b/fs/xfs/scrub/metapath.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_metafile.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_dir2.h" +#include "xfs_parent.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_attr.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/readdir.h" +#include "scrub/repair.h" + +/* + * Metadata Directory Tree Paths + * ============================= + * + * A filesystem with metadir enabled expects to find metadata structures + * attached to files that are accessible by walking a path down the metadata + * directory tree. Given the metadir path and the incore inode storing the + * metadata, this scrubber ensures that the ondisk metadir path points to the + * ondisk inode represented by the incore inode. + */ + +struct xchk_metapath { + struct xfs_scrub *sc; + + /* Name for lookup */ + struct xfs_name xname; + + /* Directory update for repairs */ + struct xfs_dir_update du; + + /* Path down to this metadata file from the parent directory */ + const char *path; + + /* Directory parent of the metadata file. 
*/ + struct xfs_inode *dp; + + /* Locks held on dp */ + unsigned int dp_ilock_flags; + + /* Transaction block reservations */ + unsigned int link_resblks; + unsigned int unlink_resblks; + + /* Parent pointer updates */ + struct xfs_parent_args link_ppargs; + struct xfs_parent_args unlink_ppargs; + + /* Scratchpads for removing links */ + struct xfs_da_args pptr_args; +}; + +/* Release resources tracked in the buffer. */ +static inline void +xchk_metapath_cleanup( + void *buf) +{ + struct xchk_metapath *mpath = buf; + + if (mpath->dp_ilock_flags) + xfs_iunlock(mpath->dp, mpath->dp_ilock_flags); + kfree(mpath->path); +} + +/* Set up a metadir path scan. @path must be dynamically allocated. */ +static inline int +xchk_setup_metapath_scan( + struct xfs_scrub *sc, + struct xfs_inode *dp, + const char *path, + struct xfs_inode *ip) +{ + struct xchk_metapath *mpath; + int error; + + if (!path) + return -ENOMEM; + + error = xchk_install_live_inode(sc, ip); + if (error) { + kfree(path); + return error; + } + + mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS); + if (!mpath) { + kfree(path); + return -ENOMEM; + } + + mpath->sc = sc; + sc->buf = mpath; + sc->buf_cleanup = xchk_metapath_cleanup; + + mpath->dp = dp; + mpath->path = path; /* path is now owned by mpath */ + + mpath->xname.name = mpath->path; + mpath->xname.len = strlen(mpath->path); + mpath->xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode); + + return 0; +} + +#ifdef CONFIG_XFS_RT +/* Scan the /rtgroups directory itself. */ +static int +xchk_setup_metapath_rtdir( + struct xfs_scrub *sc) +{ + if (!sc->mp->m_rtdirip) + return -ENOENT; + + return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, + kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip); +} + +/* Scan a rtgroup inode under the /rtgroups directory. */ +static int +xchk_setup_metapath_rtginode( + struct xfs_scrub *sc, + enum xfs_rtg_inodes type) +{ + struct xfs_rtgroup *rtg; + struct xfs_inode *ip; + int error; + + rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno); + if (!rtg) + return -ENOENT; + + ip = rtg->rtg_inodes[type]; + if (!ip) { + error = -ENOENT; + goto out_put_rtg; + } + + error = xchk_setup_metapath_scan(sc, sc->mp->m_rtdirip, + xfs_rtginode_path(rtg_rgno(rtg), type), ip); + +out_put_rtg: + xfs_rtgroup_put(rtg); + return error; +} +#else +# define xchk_setup_metapath_rtdir(...) (-ENOENT) +# define xchk_setup_metapath_rtginode(...) (-ENOENT) +#endif /* CONFIG_XFS_RT */ + +#ifdef CONFIG_XFS_QUOTA +/* Scan the /quota directory itself. */ +static int +xchk_setup_metapath_quotadir( + struct xfs_scrub *sc) +{ + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + + if (!qi || !qi->qi_dirip) + return -ENOENT; + + return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, + kstrdup("quota", GFP_KERNEL), qi->qi_dirip); +} + +/* Scan a quota inode under the /quota directory. */ +static int +xchk_setup_metapath_dqinode( + struct xfs_scrub *sc, + xfs_dqtype_t type) +{ + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_inode *ip = NULL; + + if (!qi) + return -ENOENT; + + switch (type) { + case XFS_DQTYPE_USER: + ip = qi->qi_uquotaip; + break; + case XFS_DQTYPE_GROUP: + ip = qi->qi_gquotaip; + break; + case XFS_DQTYPE_PROJ: + ip = qi->qi_pquotaip; + break; + default: + ASSERT(0); + return -EINVAL; + } + if (!ip) + return -ENOENT; + + return xchk_setup_metapath_scan(sc, qi->qi_dirip, + kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); +} +#else +# define xchk_setup_metapath_quotadir(...) (-ENOENT) +# define xchk_setup_metapath_dqinode(...) 
(-ENOENT) +#endif /* CONFIG_XFS_QUOTA */ + +int +xchk_setup_metapath( + struct xfs_scrub *sc) +{ + if (!xfs_has_metadir(sc->mp)) + return -ENOENT; + if (sc->sm->sm_gen) + return -EINVAL; + + switch (sc->sm->sm_ino) { + case XFS_SCRUB_METAPATH_PROBE: + /* Just probing, nothing else to do. */ + if (sc->sm->sm_agno) + return -EINVAL; + return 0; + case XFS_SCRUB_METAPATH_RTDIR: + return xchk_setup_metapath_rtdir(sc); + case XFS_SCRUB_METAPATH_RTBITMAP: + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_BITMAP); + case XFS_SCRUB_METAPATH_RTSUMMARY: + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_SUMMARY); + case XFS_SCRUB_METAPATH_QUOTADIR: + return xchk_setup_metapath_quotadir(sc); + case XFS_SCRUB_METAPATH_USRQUOTA: + return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_USER); + case XFS_SCRUB_METAPATH_GRPQUOTA: + return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_GROUP); + case XFS_SCRUB_METAPATH_PRJQUOTA: + return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_PROJ); + case XFS_SCRUB_METAPATH_RTRMAPBT: + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_RMAP); + case XFS_SCRUB_METAPATH_RTREFCOUNTBT: + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_REFCOUNT); + default: + return -ENOENT; + } +} + +/* + * Take the ILOCK on the metadata directory parent and child. We do not know + * that the metadata directory is not corrupt, so we lock the parent and try + * to lock the child. Returns 0 if successful, or -EINTR to abort the scrub. + */ +STATIC int +xchk_metapath_ilock_both( + struct xchk_metapath *mpath) +{ + struct xfs_scrub *sc = mpath->sc; + int error = 0; + + while (true) { + xfs_ilock(mpath->dp, XFS_ILOCK_EXCL); + if (xchk_ilock_nowait(sc, XFS_ILOCK_EXCL)) { + mpath->dp_ilock_flags |= XFS_ILOCK_EXCL; + return 0; + } + xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL); + + if (xchk_should_terminate(sc, &error)) + return error; + + delay(1); + } + + ASSERT(0); + return -EINTR; +} + +/* Unlock parent and child inodes. */ +static inline void +xchk_metapath_iunlock( + struct xchk_metapath *mpath) +{ + struct xfs_scrub *sc = mpath->sc; + + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + mpath->dp_ilock_flags &= ~XFS_ILOCK_EXCL; + xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL); +} + +int +xchk_metapath( + struct xfs_scrub *sc) +{ + struct xchk_metapath *mpath = sc->buf; + xfs_ino_t ino = NULLFSINO; + int error; + + /* Just probing, nothing else to do. */ + if (sc->sm->sm_ino == XFS_SCRUB_METAPATH_PROBE) + return 0; + + /* Parent required to do anything else. */ + if (mpath->dp == NULL) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + error = xchk_metapath_ilock_both(mpath); + if (error) + goto out_cancel; + + /* Make sure the parent dir has a dirent pointing to this file. */ + error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino); + trace_xchk_metapath_lookup(sc, mpath->path, mpath->dp, ino); + if (error == -ENOENT) { + /* No directory entry at all */ + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + error = 0; + goto out_ilock; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out_ilock; + if (ino != sc->ip->i_ino) { + /* Pointing to wrong inode */ + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } + +out_ilock: + xchk_metapath_iunlock(mpath); +out_cancel: + xchk_trans_cancel(sc); + return error; +} + +#ifdef CONFIG_XFS_ONLINE_REPAIR +/* Create the dirent represented by the final component of the path. 
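Note that xchk_setup_metapath above overloads the scrub ioctl's fields: sm_ino carries an XFS_SCRUB_METAPATH_* path code rather than an inode number, sm_agno selects the rt group for the per-rtgroup inodes, and sm_gen must be zero. A hedged userspace sketch of driving it, assuming the UAPI names added by this series are exported through the xfsprogs headers:

/* Sketch: ask the kernel to check the /quota metadir path. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>	/* XFS_IOC_SCRUB_METADATA, struct xfs_scrub_metadata */

int main(int argc, char **argv)
{
	struct xfs_scrub_metadata sm;
	int fd;

	if (argc != 2)
		return 2;
	fd = open(argv[1], O_RDONLY | O_DIRECTORY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&sm, 0, sizeof(sm));
	sm.sm_type = XFS_SCRUB_TYPE_METAPATH;		/* added by this series */
	sm.sm_ino = XFS_SCRUB_METAPATH_QUOTADIR;	/* path code, not an inum */

	if (ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm) < 0)
		perror("scrub");
	else if (sm.sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
		printf("metapath corrupt\n");
	return 0;
}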
*/ +STATIC int +xrep_metapath_link( + struct xchk_metapath *mpath) +{ + struct xfs_scrub *sc = mpath->sc; + + mpath->du.dp = mpath->dp; + mpath->du.name = &mpath->xname; + mpath->du.ip = sc->ip; + + if (xfs_has_parent(sc->mp)) + mpath->du.ppargs = &mpath->link_ppargs; + else + mpath->du.ppargs = NULL; + + trace_xrep_metapath_link(sc, mpath->path, mpath->dp, sc->ip->i_ino); + + return xfs_dir_add_child(sc->tp, mpath->link_resblks, &mpath->du); +} + +/* Remove the dirent at the final component of the path. */ +STATIC int +xrep_metapath_unlink( + struct xchk_metapath *mpath, + xfs_ino_t ino, + struct xfs_inode *ip) +{ + struct xfs_parent_rec rec; + struct xfs_scrub *sc = mpath->sc; + struct xfs_mount *mp = sc->mp; + int error; + + trace_xrep_metapath_unlink(sc, mpath->path, mpath->dp, ino); + + if (!ip) { + /* The child inode isn't allocated. Junk the dirent. */ + xfs_trans_log_inode(sc->tp, mpath->dp, XFS_ILOG_CORE); + return xfs_dir_removename(sc->tp, mpath->dp, &mpath->xname, + ino, mpath->unlink_resblks); + } + + mpath->du.dp = mpath->dp; + mpath->du.name = &mpath->xname; + mpath->du.ip = ip; + mpath->du.ppargs = NULL; + + /* Figure out if we're removing a parent pointer too. */ + if (xfs_has_parent(mp)) { + xfs_inode_to_parent_rec(&rec, ip); + error = xfs_parent_lookup(sc->tp, ip, &mpath->xname, &rec, + &mpath->pptr_args); + switch (error) { + case -ENOATTR: + break; + case 0: + mpath->du.ppargs = &mpath->unlink_ppargs; + break; + default: + return error; + } + } + + return xfs_dir_remove_child(sc->tp, mpath->unlink_resblks, &mpath->du); +} + +/* + * Try to create a dirent in @mpath->dp with the name @mpath->xname that points + * to @sc->ip. Returns: + * + * -EEXIST and an @alleged_child if the dirent that points to the wrong inode; + * 0 if there is now a dirent pointing to @sc->ip; or + * A negative errno on error. + */ +STATIC int +xrep_metapath_try_link( + struct xchk_metapath *mpath, + xfs_ino_t *alleged_child) +{ + struct xfs_scrub *sc = mpath->sc; + xfs_ino_t ino; + int error; + + /* Allocate transaction, lock inodes, join to transaction. */ + error = xchk_trans_alloc(sc, mpath->link_resblks); + if (error) + return error; + + error = xchk_metapath_ilock_both(mpath); + if (error) { + xchk_trans_cancel(sc); + return error; + } + xfs_trans_ijoin(sc->tp, mpath->dp, 0); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino); + trace_xrep_metapath_lookup(sc, mpath->path, mpath->dp, ino); + if (error == -ENOENT) { + /* + * There is no dirent in the directory. Create an entry + * pointing to @sc->ip. + */ + error = xrep_metapath_link(mpath); + if (error) + goto out_cancel; + + error = xrep_trans_commit(sc); + xchk_metapath_iunlock(mpath); + return error; + } + if (error) + goto out_cancel; + + if (ino == sc->ip->i_ino) { + /* The dirent already points to @sc->ip; we're done. */ + error = 0; + goto out_cancel; + } + + /* + * The dirent points elsewhere; pass that back so that the caller + * can try to remove the dirent. + */ + *alleged_child = ino; + error = -EEXIST; + +out_cancel: + xchk_trans_cancel(sc); + xchk_metapath_iunlock(mpath); + return error; +} + +/* + * Take the ILOCK on the metadata directory parent and a bad child, if one is + * supplied. We do not know that the metadata directory is not corrupt, so we + * lock the parent and try to lock the child. Returns 0 if successful, or + * -EINTR to abort the repair. The lock state of @dp is not recorded in @mpath. 
+ */ +STATIC int +xchk_metapath_ilock_parent_and_child( + struct xchk_metapath *mpath, + struct xfs_inode *ip) +{ + struct xfs_scrub *sc = mpath->sc; + int error = 0; + + while (true) { + xfs_ilock(mpath->dp, XFS_ILOCK_EXCL); + if (!ip || xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return 0; + xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL); + + if (xchk_should_terminate(sc, &error)) + return error; + + delay(1); + } + + ASSERT(0); + return -EINTR; +} + +/* + * Try to remove a dirent in @mpath->dp with the name @mpath->xname that points + * to @alleged_child. Returns: + * + * 0 if there is no longer a dirent; + * -EEXIST if the dirent points to @sc->ip; + * -EAGAIN and an updated @alleged_child if the dirent points elsewhere; or + * A negative errno for any other error. + */ +STATIC int +xrep_metapath_try_unlink( + struct xchk_metapath *mpath, + xfs_ino_t *alleged_child) +{ + struct xfs_scrub *sc = mpath->sc; + struct xfs_inode *ip = NULL; + xfs_ino_t ino; + int error; + + ASSERT(*alleged_child != sc->ip->i_ino); + + trace_xrep_metapath_try_unlink(sc, mpath->path, mpath->dp, + *alleged_child); + + /* + * Allocate transaction, grab the alleged child inode, lock inodes, + * join to transaction. + */ + error = xchk_trans_alloc(sc, mpath->unlink_resblks); + if (error) + return error; + + error = xchk_iget(sc, *alleged_child, &ip); + if (error == -EINVAL || error == -ENOENT) { + /* inode number is bogus, junk the dirent */ + error = 0; + } + if (error) { + xchk_trans_cancel(sc); + return error; + } + + error = xchk_metapath_ilock_parent_and_child(mpath, ip); + if (error) { + xchk_trans_cancel(sc); + return error; + } + xfs_trans_ijoin(sc->tp, mpath->dp, 0); + if (ip) + xfs_trans_ijoin(sc->tp, ip, 0); + + error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino); + trace_xrep_metapath_lookup(sc, mpath->path, mpath->dp, ino); + if (error == -ENOENT) { + /* + * There is no dirent in the directory anymore. We're ready to + * try the link operation again. + */ + error = 0; + goto out_cancel; + } + if (error) + goto out_cancel; + + if (ino == sc->ip->i_ino) { + /* The dirent already points to @sc->ip; we're done. */ + error = -EEXIST; + goto out_cancel; + } + + /* + * The dirent does not point to the alleged child. Update the caller + * and signal that we want to be called again. + */ + if (ino != *alleged_child) { + *alleged_child = ino; + error = -EAGAIN; + goto out_cancel; + } + + /* Remove the link to the child. */ + error = xrep_metapath_unlink(mpath, ino, ip); + if (error) + goto out_cancel; + + error = xrep_trans_commit(sc); + goto out_unlock; + +out_cancel: + xchk_trans_cancel(sc); +out_unlock: + xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL); + if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xchk_irele(sc, ip); + } + return error; +} + +/* + * Make sure the metadata directory path points to the child being examined. + * + * Repair needs to be able to create a directory structure, create its own + * transactions, and take ILOCKs. This function /must/ be called after all + * other repairs have completed. + */ +int +xrep_metapath( + struct xfs_scrub *sc) +{ + struct xchk_metapath *mpath = sc->buf; + struct xfs_mount *mp = sc->mp; + int error = 0; + + /* Just probing, nothing to repair. */ + if (sc->sm->sm_ino == XFS_SCRUB_METAPATH_PROBE) + return 0; + + /* Parent required to do anything else. */ + if (mpath->dp == NULL) + return -EFSCORRUPTED; + + /* + * Make sure the child file actually has an attr fork to receive a new + * parent pointer if the fs has parent pointers. 
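The contracts spelled out for xrep_metapath_try_link and xrep_metapath_try_unlink above combine into the retry loop that xrep_metapath runs below: link until a squatter is reported, evict squatters until the name is free or already correct, then try again. A compact, self-contained model of that protocol (the simulated dirent stands in for the directory entry; all names are made up):

/* Sketch of the metapath repair retry protocol with a simulated dirent. */
#include <errno.h>
#include <stdio.h>

#define CHILD	100ULL	/* the inode the path should point at */

static unsigned long long dirent = 37;	/* starts out pointing elsewhere */

/* 0: dirent now points at CHILD; -EEXIST: *who squats on the name. */
static int try_link(unsigned long long *who)
{
	if (dirent == 0) {
		dirent = CHILD;
		return 0;
	}
	if (dirent == CHILD)
		return 0;
	*who = dirent;
	return -EEXIST;
}

/* 0: name freed; -EEXIST: points at CHILD; -EAGAIN: new squatter in *who. */
static int try_unlink(unsigned long long *who)
{
	if (dirent == 0)
		return 0;
	if (dirent == CHILD)
		return -EEXIST;
	if (dirent != *who) {	/* somebody re-pointed the name */
		*who = dirent;
		return -EAGAIN;
	}
	dirent = 0;
	return 0;
}

int main(void)
{
	int error = 0;

	do {
		unsigned long long squatter = 0;

		error = try_link(&squatter);
		if (!error)
			break;	/* dirent is correct */
		if (error != -EEXIST)
			return 1;

		do {
			error = try_unlink(&squatter);
		} while (error == -EAGAIN);
		if (error == -EEXIST) {
			error = 0;	/* already points at CHILD */
			break;
		}
	} while (!error);

	printf("dirent -> %llu (error %d)\n", dirent, error);
	return error != 0;
}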
+ */ + if (xfs_has_parent(mp)) { + error = xfs_attr_add_fork(sc->ip, + sizeof(struct xfs_attr_sf_hdr), 1); + if (error) + return error; + } + + /* Compute block reservation required to unlink and link a file. */ + mpath->unlink_resblks = xfs_remove_space_res(mp, MAXNAMELEN); + mpath->link_resblks = xfs_link_space_res(mp, MAXNAMELEN); + + do { + xfs_ino_t alleged_child; + + /* Re-establish the link, or tell us which inode to remove. */ + error = xrep_metapath_try_link(mpath, &alleged_child); + if (!error) + return 0; + if (error != -EEXIST) + return error; + + /* + * Remove an incorrect link to an alleged child, or tell us + * which inode to remove. + */ + do { + error = xrep_metapath_try_unlink(mpath, &alleged_child); + } while (error == -EAGAIN); + if (error == -EEXIST) { + /* Link established; we're done. */ + error = 0; + break; + } + } while (!error); + + return error; +} +#endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 2aa14b7ab630..1588ce971cb8 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -19,6 +19,8 @@ #include "xfs_rmap.h" #include "xfs_ag.h" #include "xfs_defer.h" +#include "xfs_metafile.h" +#include "xfs_quota.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -58,9 +60,9 @@ xrep_newbt_estimate_slack( if (sc->ops->type == ST_PERAG) { free = sc->sa.pag->pagf_freeblks; - sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno); + sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag)); } else { - free = percpu_counter_sum(&sc->mp->m_fdblocks); + free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS); sz = sc->mp->m_sb.sb_dblocks; } @@ -121,6 +123,43 @@ xrep_newbt_init_inode( } /* + * Initialize accounting resources for staging a new metadata inode btree. + * If the metadata file has a space reservation, the caller must adjust that + * reservation when committing the new ondisk btree. + */ +int +xrep_newbt_init_metadir_inode( + struct xrep_newbt *xnr, + struct xfs_scrub *sc) +{ + struct xfs_owner_info oinfo; + struct xfs_ifork *ifp; + + ASSERT(xfs_is_metadir_inode(sc->ip)); + + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); + + ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); + if (!ifp) + return -ENOMEM; + + /* + * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the + * inode metadata space reservations can only account allocated space + * to the i_nblocks. We do not want to change the inode core fields + * until we're ready to commit the new tree, so we allocate the blocks + * as if they were regular file blocks. This exposes us to a higher + * risk of the repair being cancelled due to ENOSPC. + */ + xrep_newbt_init_ag(xnr, sc, &oinfo, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), + XFS_AG_RESV_NONE); + xnr->ifake.if_fork = ifp; + xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK); + return 0; +} + +/* * Initialize accounting resources for staging a new btree. Callers are * expected to add their own reservations (and clean them up) manually. 
*/ @@ -186,11 +225,10 @@ xrep_newbt_add_extent( xfs_agblock_t agbno, xfs_extlen_t len) { - struct xfs_mount *mp = xnr->sc->mp; struct xfs_alloc_arg args = { .tp = NULL, /* no autoreap */ .oinfo = xnr->oinfo, - .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno), + .fsbno = xfs_agbno_to_fsb(pag, agbno), .len = len, .resv = xnr->resv, }; @@ -206,12 +244,12 @@ xrep_newbt_validate_ag_alloc_hint( struct xfs_scrub *sc = xnr->sc; xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); - if (agno == sc->sa.pag->pag_agno && + if (agno == pag_agno(sc->sa.pag) && xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) return; - xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, - XFS_AGFL_BLOCK(sc->mp) + 1); + xnr->alloc_hint = + xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1); } /* Allocate disk space for a new per-AG btree. */ @@ -225,6 +263,7 @@ xrep_newbt_alloc_ag_blocks( int error = 0; ASSERT(sc->sa.pag != NULL); + ASSERT(xnr->resv != XFS_AG_RESV_METAFILE); while (nr_blocks > 0) { struct xfs_alloc_arg args = { @@ -251,16 +290,15 @@ xrep_newbt_alloc_ag_blocks( return -ENOSPC; agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + if (agno != pag_agno(sc->sa.pag)) { + ASSERT(agno == pag_agno(sc->sa.pag)); + return -EFSCORRUPTED; + } - trace_xrep_newbt_alloc_ag_blocks(mp, agno, + trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag, XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, xnr->oinfo.oi_owner); - if (agno != sc->sa.pag->pag_agno) { - ASSERT(agno == sc->sa.pag->pag_agno); - return -EFSCORRUPTED; - } - error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args); if (error) return error; @@ -299,6 +337,8 @@ xrep_newbt_alloc_file_blocks( struct xfs_mount *mp = sc->mp; int error = 0; + ASSERT(xnr->resv != XFS_AG_RESV_METAFILE); + while (nr_blocks > 0) { struct xfs_alloc_arg args = { .tp = sc->tp, @@ -326,16 +366,16 @@ xrep_newbt_alloc_file_blocks( agno = XFS_FSB_TO_AGNO(mp, args.fsbno); - trace_xrep_newbt_alloc_file_blocks(mp, agno, - XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, - xnr->oinfo.oi_owner); - pag = xfs_perag_get(mp, agno); if (!pag) { ASSERT(0); return -EFSCORRUPTED; } + trace_xrep_newbt_alloc_file_blocks(pag, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + error = xrep_newbt_add_blocks(xnr, pag, &args); xfs_perag_put(pag); if (error) @@ -376,7 +416,6 @@ xrep_newbt_free_extent( struct xfs_scrub *sc = xnr->sc; xfs_agblock_t free_agbno = resv->agbno; xfs_extlen_t free_aglen = resv->len; - xfs_fsblock_t fsbno; int error; if (!btree_committed || resv->used == 0) { @@ -385,8 +424,8 @@ xrep_newbt_free_extent( * space reservation, let the existing EFI free the entire * space extent. */ - trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, - free_agbno, free_aglen, xnr->oinfo.oi_owner); + trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen, + xnr->oinfo.oi_owner); xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); return 1; } @@ -403,8 +442,8 @@ xrep_newbt_free_extent( if (free_aglen == 0) return 0; - trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, - free_aglen, xnr->oinfo.oi_owner); + trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen, + xnr->oinfo.oi_owner); ASSERT(xnr->resv != XFS_AG_RESV_AGFL); ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); @@ -413,9 +452,9 @@ xrep_newbt_free_extent( * Use EFIs to free the reservations. This reduces the chance * that we leak blocks if the system goes down. 
*/ - fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); - error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, - xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD); + error = xfs_free_extent_later(sc->tp, + xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen, + &xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD); if (error) return error; @@ -516,7 +555,6 @@ xrep_newbt_claim_block( union xfs_btree_ptr *ptr) { struct xrep_newbt_resv *resv; - struct xfs_mount *mp = cur->bc_mp; xfs_agblock_t agbno; /* @@ -541,12 +579,10 @@ xrep_newbt_claim_block( if (resv->used == resv->len) list_move_tail(&resv->list, &xnr->resv_list); - trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, - xnr->oinfo.oi_owner); + trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) - ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, - agbno)); + ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno)); else ptr->s = cpu_to_be32(agbno); diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h index 3d804d31af24..5ce785599287 100644 --- a/fs/xfs/scrub/newbt.h +++ b/fs/xfs/scrub/newbt.h @@ -63,6 +63,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc, enum xfs_ag_resv_type resv); int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc, int whichfork, const struct xfs_owner_info *oinfo); +int xrep_newbt_init_metadir_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc); int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks); int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len); diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c index 80aee30886c4..4a47d0aabf73 100644 --- a/fs/xfs/scrub/nlinks.c +++ b/fs/xfs/scrub/nlinks.c @@ -279,7 +279,7 @@ xchk_nlinks_collect_dirent( * determine the backref count. */ if (dotdot) { - if (dp == sc->mp->m_rootip) + if (xchk_inode_is_dirtree_root(dp)) error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); else if (!xfs_has_parent(sc->mp)) error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0); @@ -735,7 +735,7 @@ xchk_nlinks_compare_inode( } } - if (ip == sc->mp->m_rootip) { + if (xchk_inode_is_dirtree_root(ip)) { /* * For the root of a directory tree, both the '.' and '..' * entries should point to the root directory. The dotdot diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c index b3e707f47b7b..4ebdee095428 100644 --- a/fs/xfs/scrub/nlinks_repair.c +++ b/fs/xfs/scrub/nlinks_repair.c @@ -60,11 +60,9 @@ xrep_nlinks_is_orphaned( unsigned int actual_nlink, const struct xchk_nlink *obs) { - struct xfs_mount *mp = ip->i_mount; - if (obs->parents != 0) return false; - if (ip == mp->m_rootip || ip == sc->orphanage) + if (xchk_inode_is_dirtree_root(ip) || ip == sc->orphanage) return false; return actual_nlink != 0; } diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index 7148d8362db8..9c12cb844231 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -153,8 +153,7 @@ xrep_orphanage_create( /* Try to find the orphanage directory. */ inode_lock_nested(root_inode, I_MUTEX_PARENT); - orphanage_dentry = lookup_one_len(ORPHANAGE, root_dentry, - strlen(ORPHANAGE)); + orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry); if (IS_ERR(orphanage_dentry)) { error = PTR_ERR(orphanage_dentry); goto out_unlock_root; @@ -167,10 +166,11 @@ xrep_orphanage_create( * directory to control access to a file we put in here. 
*/ if (d_really_is_negative(orphanage_dentry)) { - error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry, - 0750); - if (error) - goto out_dput_orphanage; + orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode, + orphanage_dentry, 0750); + error = PTR_ERR(orphanage_dentry); + if (IS_ERR(orphanage_dentry)) + goto out_unlock_root; } /* Not a directory? Bail out. */ @@ -295,7 +295,9 @@ xrep_orphanage_can_adopt( return false; if (sc->ip == sc->orphanage) return false; - if (xfs_internal_inum(sc->mp, sc->ip->i_ino)) + if (xchk_inode_is_sb_rooted(sc->ip)) + return false; + if (xfs_is_internal_inode(sc->ip)) return false; return true; } @@ -442,7 +444,7 @@ xrep_adoption_check_dcache( if (!d_orphanage) return 0; - d_child = d_hash_and_lookup(d_orphanage, &qname); + d_child = try_lookup_noperm(&qname, d_orphanage); if (d_child) { trace_xrep_adoption_check_child(sc->mp, d_child); @@ -479,7 +481,7 @@ xrep_adoption_zap_dcache( if (!d_orphanage) return; - d_child = d_hash_and_lookup(d_orphanage, &qname); + d_child = try_lookup_noperm(&qname, d_orphanage); while (d_child != NULL) { trace_xrep_adoption_invalidate_child(sc->mp, d_child); diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 91e7b51ce068..3b692c4acc1e 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -132,6 +132,14 @@ xchk_parent_validate( return 0; } + /* Is this the metadata root dir? Then '..' must point to itself. */ + if (sc->ip == mp->m_metadirip) { + if (sc->ip->i_ino != mp->m_sb.sb_metadirino || + sc->ip->i_ino != parent_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + /* '..' must not point to ourselves. */ if (sc->ip->i_ino == parent_ino) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); @@ -185,6 +193,12 @@ xchk_parent_validate( goto out_unlock; } + /* Metadata and regular inodes cannot cross trees. */ + if (xfs_is_metadir_inode(dp) != xfs_is_metadir_inode(sc->ip)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out_unlock; + } + /* Look for a directory entry in the parent pointing to the child. */ error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc); if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) @@ -300,7 +314,7 @@ xchk_parent_pptr_and_dotdot( } /* Is this the root dir? Then '..' must point to itself. */ - if (sc->ip == sc->mp->m_rootip) { + if (xchk_inode_is_dirtree_root(sc->ip)) { if (sc->ip->i_ino != pp->parent_ino) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; @@ -711,7 +725,7 @@ xchk_parent_count_pptrs( } if (S_ISDIR(VFS_I(sc->ip)->i_mode)) { - if (sc->ip == sc->mp->m_rootip) + if (xchk_inode_is_dirtree_root(sc->ip)) pp->pptrs_found++; if (VFS_I(sc->ip)->i_nlink == 0 && pp->pptrs_found > 0) @@ -720,6 +734,14 @@ xchk_parent_count_pptrs( pp->pptrs_found == 0) xchk_ino_set_corrupt(sc, sc->ip->i_ino); } else { + /* + * Starting with metadir, we allow checking of parent pointers + * of non-directory files that are children of the superblock. + * Pretend that we found a parent pointer attr. 
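The orphanage hunk above follows the newer VFS convention in which vfs_mkdir() consumes the dentry it was handed and returns the dentry to use afterwards, or an ERR_PTR on failure, which is why the error path no longer dputs. A userspace model of the ERR_PTR pointer-encoding idiom behind that calling convention (mkdir_op is a stand-in, not the VFS API):

/* Userspace model of the ERR_PTR convention used by the new vfs_mkdir(). */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long err) { return (void *)err; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

/* Stand-in for vfs_mkdir(): consumes @dentry, returns the one to use. */
static void *mkdir_op(void *dentry, int fail)
{
	if (fail) {
		free(dentry);	/* callee disposes of the input dentry */
		return ERR_PTR(-EEXIST);
	}
	return dentry;	/* may also be a different dentry */
}

int main(void)
{
	void *dentry = malloc(16);

	dentry = mkdir_op(dentry, 1);
	if (IS_ERR(dentry)) {
		printf("mkdir failed: %ld\n", PTR_ERR(dentry));
		return 1;	/* nothing to put: the callee ate the dentry */
	}
	free(dentry);
	return 0;
}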
+ */ + if (xfs_has_metadir(sc->mp) && xchk_inode_is_sb_rooted(sc->ip)) + pp->pptrs_found++; + if (VFS_I(sc->ip)->i_nlink != pp->pptrs_found) xchk_ino_set_corrupt(sc, sc->ip->i_ino); } @@ -885,10 +907,9 @@ bool xchk_pptr_looks_zapped( struct xfs_inode *ip) { - struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); - ASSERT(xfs_has_parent(mp)); + ASSERT(xfs_has_parent(ip->i_mount)); /* * Temporary files that cannot be linked into the directory tree do not @@ -902,15 +923,15 @@ xchk_pptr_looks_zapped( * of a parent pointer scan is always the empty set. It's safe to scan * them even if the attr fork was zapped. */ - if (ip == mp->m_rootip) + if (xchk_inode_is_dirtree_root(ip)) return false; /* - * Metadata inodes are all rooted in the superblock and do not have - * any parents. Hence the attr fork will not be initialized, but - * there are no parent pointers that might have been zapped. + * Metadata inodes that are rooted in the superblock do not have any + * parents. Hence the attr fork will not be initialized, but there are + * no parent pointers that might have been zapped. */ - if (xfs_is_metadata_inode(ip)) + if (xchk_inode_is_sb_rooted(ip)) return false; /* diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c index 7b42b7f65a0b..31bfe10be22a 100644 --- a/fs/xfs/scrub/parent_repair.c +++ b/fs/xfs/scrub/parent_repair.c @@ -1334,7 +1334,7 @@ xrep_parent_rebuild_pptrs( * so that we can decide if we're moving this file to the orphanage. * For this purpose, root directories are their own parents. */ - if (sc->ip == sc->mp->m_rootip) { + if (xchk_inode_is_dirtree_root(sc->ip)) { xrep_findparent_scan_found(&rp->pscan, sc->ip->i_ino); } else { error = xrep_parent_lookup_pptrs(sc, &parent_ino); @@ -1354,21 +1354,40 @@ STATIC int xrep_parent_rebuild_tree( struct xrep_parent *rp) { + struct xfs_scrub *sc = rp->sc; + bool try_adoption; int error; - if (xfs_has_parent(rp->sc->mp)) { + if (xfs_has_parent(sc->mp)) { error = xrep_parent_rebuild_pptrs(rp); if (error) return error; } - if (rp->pscan.parent_ino == NULLFSINO) { - if (xrep_orphanage_can_adopt(rp->sc)) + /* + * Any file with no parent could be adopted. This check happens after + * rebuilding the parent pointer structure because we might have cycled + * the ILOCK during that process. + */ + try_adoption = rp->pscan.parent_ino == NULLFSINO; + + /* + * Starting with metadir, we allow checking of parent pointers + * of non-directory files that are children of the superblock. + * Lack of parent is ok here. + */ + if (try_adoption && xfs_has_metadir(sc->mp) && + xchk_inode_is_sb_rooted(sc->ip)) + try_adoption = false; + + if (try_adoption) { + if (xrep_orphanage_can_adopt(sc)) return xrep_parent_move_to_orphanage(rp); return -EFSCORRUPTED; + } - if (S_ISDIR(VFS_I(rp->sc->ip)->i_mode)) + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) return xrep_parent_reset_dotdot(rp); return 0; @@ -1422,6 +1441,14 @@ xrep_parent_set_nondir_nlink( if (error) return error; + /* + * Starting with metadir, we allow checking of parent pointers of + * non-directory files that are children of the superblock. Pretend + * that we found a parent pointer attr. 
+ */ + if (xfs_has_metadir(sc->mp) && xchk_inode_is_sb_rooted(sc->ip)) + rp->parents++; + if (rp->parents > 0 && xfs_inode_on_unlinked_list(ip)) { xfs_trans_ijoin(sc->tp, sc->ip, 0); joined = true; diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 183d531875ea..58d6d4ed2853 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -212,12 +212,18 @@ xchk_quota_item( if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (mp->m_sb.sb_rblocks < dq->q_rtb.count) + xchk_fblock_set_warning(sc, XFS_DATA_FORK, + offset); } else { if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (mp->m_sb.sb_rblocks < dq->q_rtb.count) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, + offset); } - if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks) + if (dq->q_ino.count > fs_icount) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); /* diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c index cd51f10f2920..8f4c8d41f308 100644 --- a/fs/xfs/scrub/quota_repair.c +++ b/fs/xfs/scrub/quota_repair.c @@ -233,7 +233,7 @@ xrep_quota_item( rqi->need_quotacheck = true; dirty = true; } - if (dq->q_rtb.count > mp->m_sb.sb_rblocks) { + if (!xfs_has_reflink(mp) && dq->q_rtb.count > mp->m_sb.sb_rblocks) { dq->q_rtb.reserved -= dq->q_rtb.count; dq->q_rtb.reserved += mp->m_sb.sb_rblocks; dq->q_rtb.count = mp->m_sb.sb_rblocks; diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c index c77eb2de8df7..dc4033b91e44 100644 --- a/fs/xfs/scrub/quotacheck.c +++ b/fs/xfs/scrub/quotacheck.c @@ -398,10 +398,13 @@ xqcheck_collect_inode( bool isreg = S_ISREG(VFS_I(ip)->i_mode); int error = 0; - if (xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) { + if (xfs_is_metadir_inode(ip) || + xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) { /* * Quota files are never counted towards quota, so we do not - * need to take the lock. + * need to take the lock. Files do not switch between the + * metadata and regular directory trees without a reallocation, + * so we do not need to ILOCK them either. 
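On the quota changes above: with reflink (and now rt reflink), every inode mapping a shared extent is billed for the whole extent, so a dquot's block counts can legitimately exceed the physical device size; hence the downgrade to a warning in xchk_quota_item and the reflink guard around the clamp in xrep_quota_item. A toy illustration of the arithmetic:

/* Why a dquot's rt block count can exceed the rt device size with reflink. */
#include <stdio.h>

int main(void)
{
	unsigned long long rblocks = 1000;	/* rt device size in blocks */
	unsigned long long extent = 600;	/* one shared extent */
	unsigned int sharers = 3;		/* reflinked copies, same owner */

	/* Each mapping bills the owner's dquot for the full extent. */
	unsigned long long billed = extent * sharers;

	printf("billed %llu of %llu physical rt blocks\n", billed, rblocks);
	/* 1800 > 1000: a warning, not corruption, on reflink filesystems */
	return 0;
}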
*/ xchk_iscan_mark_visited(&xqc->iscan, ip); return 0; diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 53697f3c5e1b..8703897c0a9c 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -33,6 +33,9 @@ #include "xfs_attr.h" #include "xfs_attr_remote.h" #include "xfs_defer.h" +#include "xfs_metafile.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -40,6 +43,7 @@ #include "scrub/bitmap.h" #include "scrub/agb_bitmap.h" #include "scrub/fsb_bitmap.h" +#include "scrub/rtb_bitmap.h" #include "scrub/reap.h" /* @@ -137,7 +141,7 @@ xreap_put_freelist( agfl_bp, agbno, 0); if (error) return error; - xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, + xfs_extent_busy_insert(sc->tp, pag_group(sc->sa.pag), agbno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); return 0; @@ -263,7 +267,6 @@ xreap_agextent_binval( struct xfs_scrub *sc = rs->sc; struct xfs_perag *pag = sc->sa.pag; struct xfs_mount *mp = sc->mp; - xfs_agnumber_t agno = sc->sa.pag->pag_agno; xfs_agblock_t agbno_next = agbno + *aglenp; xfs_agblock_t bno = agbno; @@ -284,7 +287,7 @@ xreap_agextent_binval( */ while (bno < agbno_next) { struct xrep_bufscan scan = { - .daddr = XFS_AGB_TO_DADDR(mp, agno, bno), + .daddr = xfs_agbno_to_daddr(pag, bno), .max_sectors = xrep_bufscan_max_sectors(mp, agbno_next - bno), .daddr_step = XFS_FSB_TO_BB(mp, 1), @@ -311,7 +314,7 @@ xreap_agextent_binval( } out: - trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp); + trace_xreap_agextent_binval(pag_group(sc->sa.pag), agbno, *aglenp); } /* @@ -370,7 +373,8 @@ xreap_agextent_select( out_found: *aglenp = len; - trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked); + trace_xreap_agextent_select(pag_group(sc->sa.pag), agbno, len, + *crosslinked); out_cur: xfs_btree_del_cursor(cur, error); return error; @@ -391,7 +395,9 @@ xreap_agextent_iter( xfs_fsblock_t fsbno; int error = 0; - fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno); + ASSERT(rs->resv != XFS_AG_RESV_METAFILE); + + fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno); /* * If there are other rmappings, this block is cross linked and must @@ -407,7 +413,8 @@ xreap_agextent_iter( * to run xfs_repair. */ if (crosslinked) { - trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); + trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno, + *aglenp); rs->force_roll = true; @@ -417,7 +424,8 @@ xreap_agextent_iter( * records from the refcountbt, which will remove the * rmap record as well. */ - xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + xfs_refcount_free_cow_extent(sc->tp, false, fsbno, + *aglenp); return 0; } @@ -425,7 +433,7 @@ xreap_agextent_iter( *aglenp, rs->oinfo); } - trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp); + trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp); /* * Invalidate as many buffers as we can, starting at agbno. If this @@ -449,7 +457,7 @@ xreap_agextent_iter( if (rs->oinfo == &XFS_RMAP_OINFO_COW) { ASSERT(rs->resv == XFS_AG_RESV_NONE); - xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL, rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD); if (error) @@ -676,6 +684,266 @@ xrep_reap_fsblocks( return 0; } +#ifdef CONFIG_XFS_RT +/* + * Figure out the longest run of blocks that we can dispose of with a single + * call. 
Cross-linked blocks should have their reverse mappings removed, but + * single-owner extents can be freed. Units are rt blocks, not rt extents. + */ +STATIC int +xreap_rgextent_select( + struct xreap_state *rs, + xfs_rgblock_t rgbno, + xfs_rgblock_t rgbno_next, + bool *crosslinked, + xfs_extlen_t *rglenp) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_btree_cur *cur; + xfs_rgblock_t bno = rgbno + 1; + xfs_extlen_t len = 1; + int error; + + /* + * Determine if there are any other rmap records covering the first + * block of this extent. If so, the block is crosslinked. + */ + cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg); + error = xfs_rmap_has_other_keys(cur, rgbno, 1, rs->oinfo, + crosslinked); + if (error) + goto out_cur; + + /* + * Figure out how many of the subsequent blocks have the same crosslink + * status. + */ + while (bno < rgbno_next) { + bool also_crosslinked; + + error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo, + &also_crosslinked); + if (error) + goto out_cur; + + if (*crosslinked != also_crosslinked) + break; + + len++; + bno++; + } + + *rglenp = len; + trace_xreap_agextent_select(rtg_group(sc->sr.rtg), rgbno, len, + *crosslinked); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Dispose of as much of the beginning of this rtgroup extent as possible. + * The number of blocks disposed of will be returned in @rglenp. + */ +STATIC int +xreap_rgextent_iter( + struct xreap_state *rs, + xfs_rgblock_t rgbno, + xfs_extlen_t *rglenp, + bool crosslinked) +{ + struct xfs_scrub *sc = rs->sc; + xfs_rtblock_t rtbno; + int error; + + /* + * The only caller so far is CoW fork repair, so we only know how to + * unlink or free CoW staging extents. Here we don't have to worry + * about invalidating buffers! + */ + if (rs->oinfo != &XFS_RMAP_OINFO_COW) { + ASSERT(rs->oinfo == &XFS_RMAP_OINFO_COW); + return -EFSCORRUPTED; + } + ASSERT(rs->resv == XFS_AG_RESV_NONE); + + rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno); + + /* + * If there are other rmappings, this block is cross linked and must + * not be freed. Remove the forward and reverse mapping and move on. + */ + if (crosslinked) { + trace_xreap_dispose_unmap_extent(rtg_group(sc->sr.rtg), rgbno, + *rglenp); + + xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); + rs->deferred++; + return 0; + } + + trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp); + + /* + * The CoW staging extent is not crosslinked. Use deferred work items + * to remove the refcountbt records (which removes the rmap records) + * and free the extent. We're not worried about the system going down + * here because log recovery walks the refcount btree to clean out the + * CoW staging extents. + */ + xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); + error = xfs_free_extent_later(sc->tp, rtbno, *rglenp, NULL, + rs->resv, + XFS_FREE_EXTENT_REALTIME | + XFS_FREE_EXTENT_SKIP_DISCARD); + if (error) + return error; + + rs->deferred++; + return 0; +} + +#define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ + XFS_RTGLOCK_RMAP | \ + XFS_RTGLOCK_REFCOUNT) + +/* + * Break a rt file metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. The extent must + * be aligned to a realtime extent. 
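xreap_rgextent_select above, like its AG counterpart, carves the extent into maximal runs of blocks sharing one fate, since cross-linked blocks only get their mappings removed while sole-owner blocks are actually freed. The run-grouping loop, reduced to a self-contained sketch (crosslinked() stands in for xfs_rmap_has_other_keys):

/* Sketch of grouping a block range into maximal same-fate runs. */
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for xfs_rmap_has_other_keys(): is this block cross-linked? */
static bool crosslinked(unsigned int bno)
{
	return bno >= 4 && bno < 6;	/* blocks 4-5 have other owners */
}

int main(void)
{
	unsigned int bno = 0, end = 10;

	while (bno < end) {
		bool x = crosslinked(bno);
		unsigned int len = 1;

		/* Extend the run while the crosslink status matches. */
		while (bno + len < end && crosslinked(bno + len) == x)
			len++;

		printf("blocks %u-%u: %s\n", bno, bno + len - 1,
		       x ? "unmap only" : "free");
		bno += len;
	}
	return 0;
}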
+ */ +STATIC int +xreap_rtmeta_extent( + uint64_t rtbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, rtbno); + xfs_rgblock_t rgbno_next = rgbno + len; + int error = 0; + + ASSERT(sc->ip != NULL); + ASSERT(!sc->sr.rtg); + + /* + * We're reaping blocks after repairing file metadata, which means that + * we have to init the xchk_ag structure ourselves. + */ + sc->sr.rtg = xfs_rtgroup_get(sc->mp, xfs_rtb_to_rgno(sc->mp, rtbno)); + if (!sc->sr.rtg) + return -EFSCORRUPTED; + + xfs_rtgroup_lock(sc->sr.rtg, XREAP_RTGLOCK_ALL); + + while (rgbno < rgbno_next) { + xfs_extlen_t rglen; + bool crosslinked; + + error = xreap_rgextent_select(rs, rgbno, rgbno_next, + &crosslinked, &rglen); + if (error) + goto out_unlock; + + error = xreap_rgextent_iter(rs, rgbno, &rglen, crosslinked); + if (error) + goto out_unlock; + + if (xreap_want_defer_finish(rs)) { + error = xfs_defer_finish(&sc->tp); + if (error) + goto out_unlock; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + error = xfs_trans_roll_inode(&sc->tp, sc->ip); + if (error) + goto out_unlock; + xreap_reset(rs); + } + + rgbno += rglen; + } + +out_unlock: + xfs_rtgroup_unlock(sc->sr.rtg, XREAP_RTGLOCK_ALL); + xfs_rtgroup_put(sc->sr.rtg); + sc->sr.rtg = NULL; + return error; +} + +/* + * Dispose of every block of every rt metadata extent in the bitmap. + * Do not use this to dispose of the mappings in an ondisk inode fork. + */ +int +xrep_reap_rtblocks( + struct xfs_scrub *sc, + struct xrtb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + + error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} +#endif /* CONFIG_XFS_RT */ + +/* + * Dispose of every block of an old metadata btree that used to be rooted in a + * metadata directory file. + */ +int +xrep_reap_metadir_fsblocks( + struct xfs_scrub *sc, + struct xfsb_bitmap *bitmap) +{ + /* + * Reap old metadir btree blocks with XFS_AG_RESV_NONE because the old + * blocks are no longer mapped by the inode, and inode metadata space + * reservations can only account freed space to the i_nblocks. + */ + struct xfs_owner_info oinfo; + struct xreap_state rs = { + .sc = sc, + .oinfo = &oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + ASSERT(xfs_is_metadir_inode(sc->ip)); + + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); + + error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) { + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return xrep_reset_metafile_resv(sc); +} + /* * Metadata files are not supposed to share blocks with anything else. 
* If blocks are shared, we remove the reverse mapping (thus reducing the @@ -730,7 +998,8 @@ xreap_bmapi_select( } imap->br_blockcount = len; - trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked); + trace_xreap_bmapi_select(pag_group(sc->sa.pag), agbno, len, + *crosslinked); out_cur: xfs_btree_del_cursor(cur, error); return error; @@ -780,7 +1049,6 @@ xreap_bmapi_binval( xfs_fileoff_t off; xfs_fileoff_t max_off; xfs_extlen_t scan_blocks; - xfs_agnumber_t agno = sc->sa.pag->pag_agno; xfs_agblock_t bno; xfs_agblock_t agbno; xfs_agblock_t agbno_next; @@ -837,7 +1105,7 @@ xreap_bmapi_binval( */ while (bno < agbno_next) { struct xrep_bufscan scan = { - .daddr = XFS_AGB_TO_DADDR(mp, agno, bno), + .daddr = xfs_agbno_to_daddr(pag, bno), .max_sectors = xrep_bufscan_max_sectors(mp, scan_blocks), .daddr_step = XFS_FSB_TO_BB(mp, 1), @@ -870,7 +1138,8 @@ xreap_bmapi_binval( } out: - trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount); + trace_xreap_bmapi_binval(pag_group(sc->sa.pag), agbno, + imap->br_blockcount); return 0; } @@ -897,7 +1166,7 @@ xrep_reap_bmapi_iter( * anybody else who thinks they own the block, even though that * runs the risk of stale buffer warnings in the future. */ - trace_xreap_dispose_unmap_extent(sc->sa.pag, + trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), imap->br_blockcount); @@ -920,7 +1189,7 @@ xrep_reap_bmapi_iter( * by a block starting before the first block of the extent but overlap * anyway. */ - trace_xreap_dispose_free_extent(sc->sa.pag, + trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), imap->br_blockcount); diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index 3f2f1775e29d..4c8f62701fb3 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -14,6 +14,15 @@ int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap, const struct xfs_owner_info *oinfo); int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork); +int xrep_reap_metadir_fsblocks(struct xfs_scrub *sc, + struct xfsb_bitmap *bitmap); + +#ifdef CONFIG_XFS_RT +int xrep_reap_rtblocks(struct xfs_scrub *sc, struct xrtb_bitmap *bitmap, + const struct xfs_owner_info *oinfo); +#else +# define xrep_reap_rtblocks(...) (-EOPNOTSUPP) +#endif /* CONFIG_XFS_RT */ /* Buffer cache scan context. 
*/ struct xrep_bufscan { diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index d0c7d4a29c0f..d46528023015 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -421,7 +421,7 @@ xchk_refcount_mergeable( if (r1->rc_refcount != r2->rc_refcount) return false; if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > - MAXREFCEXTLEN) + XFS_REFC_LEN_MAX) return false; return true; @@ -453,7 +453,8 @@ xchk_refcountbt_rec( struct xchk_refcbt_records *rrc = bs->private; xfs_refcount_btrec_to_irec(rec, &irec); - if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { + if (xfs_refcount_check_irec(to_perag(bs->cur->bc_group), &irec) != + NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -490,7 +491,7 @@ xchk_refcount_xref_rmap( struct xfs_scrub *sc, xfs_filblks_t cow_blocks) { - xfs_extlen_t refcbt_blocks = 0; + xfs_filblks_t refcbt_blocks = 0; xfs_filblks_t blocks; int error; diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c index a00d7ce7ae5b..9c8cb5332da0 100644 --- a/fs/xfs/scrub/refcount_repair.c +++ b/fs/xfs/scrub/refcount_repair.c @@ -183,13 +183,13 @@ xrep_refc_stash( if (xchk_should_terminate(sc, &error)) return error; - irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount); + irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount); error = xrep_refc_check_ext(rr->sc, &irec); if (error) return error; - trace_xrep_refc_found(sc->sa.pag, &irec); + trace_xrep_refc_found(pag_group(sc->sa.pag), &irec); return xfarray_append(rr->refcount_records, &irec); } @@ -215,7 +215,7 @@ xrep_refc_rmap_shareable( return false; /* Metadata in files are never shareable */ - if (xfs_internal_inum(mp, rmap->rm_owner)) + if (xfs_is_sb_inum(mp, rmap->rm_owner)) return false; /* Metadata and unwritten file blocks are not shareable. */ @@ -422,7 +422,7 @@ xrep_refc_find_refcounts( /* * Set up a bag to store all the rmap records that we're tracking to * generate a reference count record. If the size of the bag exceeds - * MAXREFCOUNT, we clamp rc_refcount. + * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount. */ error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); if (error) @@ -590,7 +590,6 @@ xrep_refc_build_new_tree( struct xfs_scrub *sc = rr->sc; struct xfs_btree_cur *refc_cur; struct xfs_perag *pag = sc->sa.pag; - xfs_fsblock_t fsbno; int error; error = xrep_refc_sort_records(rr); @@ -603,8 +602,8 @@ xrep_refc_build_new_tree( * to root the new btree while it's under construction and before we * attach it to the AG header. 
*/ - fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp)); - xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno, + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, + xfs_agbno_to_fsb(pag, xfs_refc_block(sc->mp)), XFS_AG_RESV_METADATA); rr->new_btree.bload.get_records = xrep_refc_get_records; rr->new_btree.bload.claim_block = xrep_refc_claim_block; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 155bbaaa496e..f8f9ed30f56b 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -21,6 +21,7 @@ #include "xfs_rmap.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_rtbitmap.h" #include "xfs_extent_busy.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" @@ -36,6 +37,13 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_dir2.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_rtalloc.h" +#include "xfs_metafile.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_zone_alloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -61,6 +69,7 @@ xrep_attempt( trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error); xchk_ag_btcur_free(&sc->sa); + xchk_rtgroup_btcur_free(&sc->sr); /* Repair whatever's broken. */ ASSERT(sc->ops->repair); @@ -305,7 +314,7 @@ xrep_calc_ag_resblks( /* Now grab the block counters from the AGF. */ error = xfs_alloc_read_agf(pag, NULL, 0, &bp); if (error) { - aglen = pag->block_count; + aglen = pag_group(pag)->xg_block_count; freelen = aglen; usedlen = aglen; } else { @@ -325,16 +334,14 @@ xrep_calc_ag_resblks( /* If the block counts are impossible, make worst-case assumptions. */ if (aglen == NULLAGBLOCK || - aglen != pag->block_count || + aglen != pag_group(pag)->xg_block_count || freelen >= aglen) { - aglen = pag->block_count; + aglen = pag_group(pag)->xg_block_count; freelen = aglen; usedlen = aglen; } - xfs_perag_put(pag); - trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen, - freelen, usedlen); + trace_xrep_calc_ag_resblks(pag, icount, aglen, freelen, usedlen); /* * Figure out how many blocks we'd need worst case to rebuild @@ -372,12 +379,48 @@ xrep_calc_ag_resblks( rmapbt_sz = 0; } - trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz, - inobt_sz, rmapbt_sz, refcbt_sz); + trace_xrep_calc_ag_resblks_btsize(pag, bnobt_sz, inobt_sz, rmapbt_sz, + refcbt_sz); + xfs_perag_put(pag); return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz)); } +#ifdef CONFIG_XFS_RT +/* + * Figure out how many blocks to reserve for a rtgroup repair. We calculate + * the worst case estimate for the number of blocks we'd need to rebuild one of + * any type of per-rtgroup btree. 
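The reservation logic above takes the worst case over several per-btree size estimates (the max(max(...), max(...)) return in xrep_calc_ag_resblks). One btree's estimate follows the usual geometric shape: a full layer of leaf blocks, then interior layers that shrink by the fanout until a single root block remains. A sketch under that assumption; the parameter names are illustrative, not the kernel's:

#include <stdint.h>

/* Ceiling division, as the kernel's DIV_ROUND_UP() macro does. */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/*
 * Worst-case btree size for @nr_records: a full layer of leaves, then
 * interior layers that fan out by @ptrs_per_block until one block
 * remains.  Mirrors the shape of xfs_btree_calc_size()-style
 * estimators.
 */
static uint64_t btree_worst_case_blocks(uint64_t nr_records,
					unsigned int recs_per_block,
					unsigned int ptrs_per_block)
{
	uint64_t level_blocks = DIV_ROUND_UP(nr_records, recs_per_block);
	uint64_t total = level_blocks;

	while (level_blocks > 1) {
		level_blocks = DIV_ROUND_UP(level_blocks, ptrs_per_block);
		total += level_blocks;
	}
	return total;
}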
+ */ +xfs_extlen_t +xrep_calc_rtgroup_resblks( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_scrub_metadata *sm = sc->sm; + uint64_t usedlen; + xfs_extlen_t rmapbt_sz = 0; + + if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + return 0; + if (!xfs_has_rtgroups(mp)) { + ASSERT(0); + return -EFSCORRUPTED; + } + + usedlen = xfs_rtbxlen_to_blen(mp, xfs_rtgroup_extents(mp, sm->sm_agno)); + ASSERT(usedlen <= XFS_MAX_RGBLOCKS); + + if (xfs_has_rmapbt(mp)) + rmapbt_sz = xfs_rtrmapbt_calc_size(mp, usedlen); + + trace_xrep_calc_rtgroup_resblks_btsize(mp, sm->sm_agno, usedlen, + rmapbt_sz); + + return rmapbt_sz; +} +#endif /* CONFIG_XFS_RT */ + /* * Reconstructing per-AG Btrees * @@ -414,7 +457,7 @@ xrep_fix_freelist( args.mp = sc->mp; args.tp = sc->tp; - args.agno = sc->sa.pag->pag_agno; + args.agno = pag_agno(sc->sa.pag); args.alignment = 1; args.pag = sc->sa.pag; @@ -483,7 +526,7 @@ xrep_findroot_block( int block_level; int error = 0; - daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno); + daddr = xfs_agbno_to_daddr(ri->sc->sa.pag, agbno); /* * Blocks in the AGFL have stale contents that might just happen to @@ -612,7 +655,7 @@ xrep_findroot_block( else fab->root = NULLAGBLOCK; - trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno, + trace_xrep_findroot_block(ri->sc->sa.pag, agbno, be32_to_cpu(btblock->bb_magic), fab->height - 1); out: xfs_trans_brelse(ri->sc->tp, bp); @@ -953,6 +996,83 @@ xrep_ag_init( return 0; } +#ifdef CONFIG_XFS_RT +/* Initialize all the btree cursors for a RT repair. */ +void +xrep_rtgroup_btcur_init( + struct xfs_scrub *sc, + struct xchk_rt *sr) +{ + struct xfs_mount *mp = sc->mp; + + ASSERT(sr->rtg != NULL); + + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTRMAPBT && + (sr->rtlock_flags & XFS_RTGLOCK_RMAP) && + xfs_has_rtrmapbt(mp)) + sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg); + + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTREFCBT && + (sr->rtlock_flags & XFS_RTGLOCK_REFCOUNT) && + xfs_has_rtreflink(mp)) + sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg); +} + +/* + * Given a reference to a rtgroup structure, lock rtgroup btree inodes and + * create btree cursors. Must only be called to repair a regular rt file. + */ +int +xrep_rtgroup_init( + struct xfs_scrub *sc, + struct xfs_rtgroup *rtg, + struct xchk_rt *sr, + unsigned int rtglock_flags) +{ + ASSERT(sr->rtg == NULL); + + xfs_rtgroup_lock(rtg, rtglock_flags); + sr->rtlock_flags = rtglock_flags; + + /* Grab our own passive reference from the caller's ref. */ + sr->rtg = xfs_rtgroup_hold(rtg); + xrep_rtgroup_btcur_init(sc, sr); + return 0; +} + +/* Ensure that all rt blocks in the given range are not marked free. */ +int +xrep_require_rtext_inuse( + struct xfs_scrub *sc, + xfs_rgblock_t rgbno, + xfs_filblks_t len) +{ + struct xfs_mount *mp = sc->mp; + xfs_rtxnum_t startrtx; + xfs_rtxnum_t endrtx; + bool is_free = false; + int error = 0; + + if (xfs_has_zoned(mp)) { + if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1)) + return -EFSCORRUPTED; + return 0; + } + + startrtx = xfs_rgbno_to_rtx(mp, rgbno); + endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1); + + error = xfs_rtalloc_extent_is_free(sc->sr.rtg, sc->tp, startrtx, + endrtx - startrtx + 1, &is_free); + if (error) + return error; + if (is_free) + return -EFSCORRUPTED; + + return 0; +} +#endif /* CONFIG_XFS_RT */ + /* Reinitialize the per-AG block reservation for the AG we just fixed. 
*/ int xrep_reset_perag_resv( @@ -973,7 +1093,7 @@ xrep_reset_perag_resv( if (error == -ENOSPC) { xfs_err(sc->mp, "Insufficient free space to reset per-AG reservation for AG %u after repair.", - sc->sa.pag->pag_agno); + pag_agno(sc->sa.pag)); error = 0; } @@ -1083,7 +1203,12 @@ xrep_metadata_inode_forks( if (error) return error; - /* Make sure the attr fork looks ok before we delete it. */ + /* + * Metadata files can only have extended attributes on metadir + * filesystems, either for parent pointers or for actual xattr data. + * For a non-metadir filesystem, make sure the attr fork looks ok + * before we delete it. + */ if (xfs_inode_hasattr(sc->ip)) { error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); if (error) @@ -1099,8 +1224,11 @@ xrep_metadata_inode_forks( return error; } - /* Clear the attr forks since metadata shouldn't have that. */ - if (xfs_inode_hasattr(sc->ip)) { + /* + * Metadata files on non-metadir filesystems cannot have attr forks, + * so clear them now. + */ + if (xfs_inode_hasattr(sc->ip) && !xfs_has_metadir(sc->mp)) { if (!dirty) { dirty = true; xfs_trans_ijoin(sc->tp, sc->ip, 0); @@ -1206,3 +1334,110 @@ xrep_buf_verify_struct( return fa == NULL; } + +/* Check the sanity of a rmap record for a metadata btree inode. */ +int +xrep_check_ino_btree_mapping( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + /* + * Metadata btree inodes never have extended attributes, and all blocks + * should have the bmbt block flag set. + */ + if ((rec->rm_flags & XFS_RMAP_ATTR_FORK) || + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + return -EFSCORRUPTED; + + /* Make sure the block is within the AG. */ + if (!xfs_verify_agbext(sc->sa.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* + * Reset the block count of the inode being repaired, and adjust the dquot + * block usage to match. The inode must not have an xattr fork. + */ +void +xrep_inode_set_nblocks( + struct xfs_scrub *sc, + int64_t new_blocks) +{ + int64_t delta = + new_blocks - sc->ip->i_nblocks; + + sc->ip->i_nblocks = new_blocks; + + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + if (delta != 0) + xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, + delta); +} + +/* Reset the block reservation for a metadata inode. */ +int +xrep_reset_metafile_resv( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + int64_t delta; + int error; + + delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail - + mp->m_metafile_resv_target; + if (delta == 0) + return 0; + + /* + * Too many blocks have been reserved, transfer some from the incore + * reservation back to the filesystem. + */ + if (delta > 0) { + int64_t give_back; + + give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail); + if (give_back > 0) { + xfs_mod_sb_delalloc(mp, -give_back); + xfs_add_fdblocks(mp, give_back); + mp->m_metafile_resv_avail -= give_back; + } + + return 0; + } + + /* + * Not enough reservation; try to take some blocks from the filesystem + * to the metabtree reservation. + */ + delta = -delta; /* delta is negative here, so invert the sign. 
*/ + error = xfs_dec_fdblocks(mp, delta, true); + while (error == -ENOSPC) { + delta--; + if (delta == 0) { + xfs_warn(sc->mp, +"Insufficient free space to reset metabtree reservation after repair."); + return 0; + } + error = xfs_dec_fdblocks(mp, delta, true); + } + if (error) + return error; + + xfs_mod_sb_delalloc(mp, delta); + mp->m_metafile_resv_avail += delta; + return 0; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 0e0dc2bf985c..af0a3a9e5ed9 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -8,6 +8,7 @@ #include "xfs_quota_defs.h" +struct xfs_rtgroup; struct xchk_stats_run; static inline int xrep_notsupported(struct xfs_scrub *sc) @@ -49,7 +50,9 @@ xrep_trans_commit( struct xbitmap; struct xagb_bitmap; +struct xrgb_bitmap; struct xfsb_bitmap; +struct xrtb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags); @@ -96,6 +99,8 @@ int xrep_setup_parent(struct xfs_scrub *sc); int xrep_setup_nlinks(struct xfs_scrub *sc); int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks); int xrep_setup_dirtree(struct xfs_scrub *sc); +int xrep_setup_rtrmapbt(struct xfs_scrub *sc); +int xrep_setup_rtrefcountbt(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); @@ -106,6 +111,20 @@ int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap); void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag, struct xchk_ag *sa); +#ifdef CONFIG_XFS_RT +int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg, + struct xchk_rt *sr, unsigned int rtglock_flags); +void xrep_rtgroup_btcur_init(struct xfs_scrub *sc, struct xchk_rt *sr); +int xrep_require_rtext_inuse(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_filblks_t len); +xfs_extlen_t xrep_calc_rtgroup_resblks(struct xfs_scrub *sc); +#else +# define xrep_rtgroup_init(sc, rtg, sr, lockflags) (-ENOSYS) +# define xrep_calc_rtgroup_resblks(sc) (0) +#endif /* CONFIG_XFS_RT */ + +int xrep_check_ino_btree_mapping(struct xfs_scrub *sc, + const struct xfs_rmap_irec *rec); /* Metadata revalidators */ @@ -134,13 +153,20 @@ int xrep_directory(struct xfs_scrub *sc); int xrep_parent(struct xfs_scrub *sc); int xrep_symlink(struct xfs_scrub *sc); int xrep_dirtree(struct xfs_scrub *sc); +int xrep_metapath(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xrep_rtbitmap(struct xfs_scrub *sc); int xrep_rtsummary(struct xfs_scrub *sc); +int xrep_rgsuperblock(struct xfs_scrub *sc); +int xrep_rtrmapbt(struct xfs_scrub *sc); +int xrep_rtrefcountbt(struct xfs_scrub *sc); #else # define xrep_rtbitmap xrep_notsupported # define xrep_rtsummary xrep_notsupported +# define xrep_rgsuperblock xrep_notsupported +# define xrep_rtrmapbt xrep_notsupported +# define xrep_rtrefcountbt xrep_notsupported #endif /* CONFIG_XFS_RT */ #ifdef CONFIG_XFS_QUOTA @@ -159,11 +185,22 @@ int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep, void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp); bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +void xrep_inode_set_nblocks(struct xfs_scrub *sc, int64_t new_blocks); +int xrep_reset_metafile_resv(struct xfs_scrub *sc); #else #define xrep_ino_dqattach(sc) (0) -#define xrep_will_attempt(sc) (false) + +/* + * When online repair is not built into the kernel, we still want to attempt + * the repair so that the stub xrep_attempt below will return EOPNOTSUPP. 
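xrep_reset_metafile_resv() above pulls a reservation shortfall back from the free-block pool, retrying with one block less on every -ENOSPC until it either succeeds or runs out of request to shrink. The retry shape in isolation, with a hypothetical reserve_blocks() standing in for xfs_dec_fdblocks():

#include <errno.h>
#include <stdint.h>

int reserve_blocks(uint64_t nblocks);	/* hypothetical; may fail with -ENOSPC */

/*
 * Try to reserve @want blocks, shrinking the request one block at a
 * time when the pool is short.  Returns the number actually reserved;
 * 0 means nothing could be taken and the caller merely logs it.
 */
static uint64_t reserve_with_backoff(uint64_t want)
{
	while (want > 0) {
		int error = reserve_blocks(want);

		if (error != -ENOSPC)
			return error ? 0 : want;
		want--;
	}
	return 0;
}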
+ */ +static inline bool xrep_will_attempt(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + xchk_needs_repair(sc->sm); +} static inline int xrep_attempt( @@ -182,6 +219,8 @@ xrep_calc_ag_resblks( return 0; } +#define xrep_calc_rtgroup_resblks xrep_calc_ag_resblks + static inline int xrep_reset_perag_resv( struct xfs_scrub *sc) @@ -208,6 +247,9 @@ xrep_setup_nothing( #define xrep_setup_parent xrep_setup_nothing #define xrep_setup_nlinks xrep_setup_nothing #define xrep_setup_dirtree xrep_setup_nothing +#define xrep_setup_metapath xrep_setup_nothing +#define xrep_setup_rtrmapbt xrep_setup_nothing +#define xrep_setup_rtrefcountbt xrep_setup_nothing #define xrep_setup_inode(sc, imap) ((void)0) @@ -243,6 +285,10 @@ static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x) #define xrep_parent xrep_notsupported #define xrep_symlink xrep_notsupported #define xrep_dirtree xrep_notsupported +#define xrep_metapath xrep_notsupported +#define xrep_rgsuperblock xrep_notsupported +#define xrep_rtrmapbt xrep_notsupported +#define xrep_rtrefcountbt xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rgb_bitmap.h b/fs/xfs/scrub/rgb_bitmap.h new file mode 100644 index 000000000000..4c3126b66dcb --- /dev/null +++ b/fs/xfs/scrub/rgb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RGB_BITMAP_H__ +#define __XFS_SCRUB_RGB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_rgblock_t */ + +struct xrgb_bitmap { + struct xbitmap32 rgbitmap; +}; + +static inline void xrgb_bitmap_init(struct xrgb_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->rgbitmap); +} + +static inline void xrgb_bitmap_destroy(struct xrgb_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->rgbitmap); +} + +static inline int xrgb_bitmap_set(struct xrgb_bitmap *bitmap, + xfs_rgblock_t start, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->rgbitmap, start, len); +} + +static inline int xrgb_bitmap_walk(struct xrgb_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->rgbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_RGB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/rgsuper.c b/fs/xfs/scrub/rgsuper.c new file mode 100644 index 000000000000..d189732d0e24 --- /dev/null +++ b/fs/xfs/scrub/rgsuper.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_rtgroup.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_rmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" + +/* Set us up with a transaction and an empty context. */ +int +xchk_setup_rgsuperblock( + struct xfs_scrub *sc) +{ + return xchk_trans_alloc(sc, 0); +} + +/* Cross-reference with the other rt metadata. 
*/ +STATIC void +xchk_rgsuperblock_xref( + struct xfs_scrub *sc) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_xref_is_used_rt_space(sc, xfs_rgbno_to_rtb(sc->sr.rtg, 0), 1); + xchk_xref_is_only_rt_owned_by(sc, 0, 1, &XFS_RMAP_OINFO_FS); +} + +int +xchk_rgsuperblock( + struct xfs_scrub *sc) +{ + xfs_rgnumber_t rgno = sc->sm->sm_agno; + int error; + + /* + * Only rtgroup 0 has a superblock. We may someday want to use higher + * rgno for other functions, similar to what we do with the primary + * super scrub function. + */ + if (rgno != 0) + return -ENOENT; + + /* + * Grab an active reference to the rtgroup structure. If we can't get + * it, we're racing with something that's tearing down the group, so + * signal that the group no longer exists. Take the rtbitmap in shared + * mode so that the group can't change while we're doing things. + */ + error = xchk_rtgroup_init_existing(sc, rgno, &sc->sr); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP_SHARED); + if (error) + return error; + + /* + * Since we already validated the rt superblock at mount time, we don't + * need to check its contents again. All we need is to cross-reference. + */ + xchk_rgsuperblock_xref(sc); + return 0; +} + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int +xrep_rgsuperblock( + struct xfs_scrub *sc) +{ + ASSERT(rtg_rgno(sc->sr.rtg) == 0); + + xfs_log_sb(sc->tp); + return 0; +} +#endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index ba5bbc3fb754..39e9ad7cd8ae 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -358,7 +358,7 @@ xchk_rmapbt_rec( struct xfs_rmap_irec irec; if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || - xfs_rmap_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { + xfs_rmap_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -410,7 +410,7 @@ xchk_rmapbt_walk_ag_metadata( goto out; /* OWN_LOG: Internal log */ - if (xfs_ag_contains_log(mp, sc->sa.pag->pag_agno)) { + if (xfs_ag_contains_log(mp, pag_agno(sc->sa.pag))) { error = xagb_bitmap_set(&cr->log_owned, XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart), mp->m_sb.sb_logblocks); diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index e8080eba37d2..f5f73078ffe2 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -31,6 +31,9 @@ #include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_ag.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -231,7 +234,7 @@ xrep_rmap_stash( if (xchk_iscan_aborted(&rr->iscan)) return -EFSCORRUPTED; - trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap); + trace_xrep_rmap_found(sc->sa.pag, &rmap); mutex_lock(&rr->lock); mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree); @@ -344,7 +347,7 @@ xrep_rmap_visit_bmbt( int error; if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) != - rf->rr->sc->sa.pag->pag_agno) + pag_agno(rf->rr->sc->sa.pag)) return 0; agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock); @@ -391,7 +394,7 @@ xrep_rmap_visit_iroot_btree_block( return 0; fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno) + if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != pag_agno(rf->rr->sc->sa.pag)) return 0; agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -499,6 
+502,69 @@ xrep_rmap_scan_iext( return xrep_rmap_stash_accumulated(rf); } +static int +xrep_rmap_scan_meta_btree( + struct xrep_rmap_ifork *rf, + struct xfs_inode *ip) +{ + struct xfs_scrub *sc = rf->rr->sc; + struct xfs_rtgroup *rtg = NULL; + struct xfs_btree_cur *cur = NULL; + enum xfs_rtg_inodes type; + int error; + + if (rf->whichfork != XFS_DATA_FORK) + return -EFSCORRUPTED; + + switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + type = XFS_RTGI_RMAP; + break; + case XFS_METAFILE_RTREFCOUNT: + type = XFS_RTGI_REFCOUNT; + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + + while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) { + if (ip == rtg->rtg_inodes[type]) + goto found; + } + + /* + * We should never find an rt metadata btree inode that isn't + * associated with an rtgroup yet has ondisk blocks allocated to it. + */ + if (ip->i_nblocks) { + ASSERT(0); + return -EFSCORRUPTED; + } + + return 0; + +found: + switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg); + break; + case XFS_METAFILE_RTREFCOUNT: + cur = xfs_rtrefcountbt_init_cursor(sc->tp, rtg); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + goto out_rtg; + } + + error = xrep_rmap_scan_iroot_btree(rf, cur); + xfs_btree_del_cursor(cur, error); +out_rtg: + xfs_rtgroup_rele(rtg); + return error; +} + /* Find all the extents from a given AG in an inode fork. */ STATIC int xrep_rmap_scan_ifork( @@ -512,14 +578,14 @@ xrep_rmap_scan_ifork( .whichfork = whichfork, }; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + bool mappings_done; int error = 0; if (!ifp) return 0; - if (ifp->if_format == XFS_DINODE_FMT_BTREE) { - bool mappings_done; - + switch (ifp->if_format) { + case XFS_DINODE_FMT_BTREE: /* * Scan the bmap btree for data device mappings. This includes * the btree blocks themselves, even if this is a realtime @@ -528,15 +594,18 @@ xrep_rmap_scan_ifork( error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done); if (error || mappings_done) return error; - } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { - return 0; + fallthrough; + case XFS_DINODE_FMT_EXTENTS: + /* Scan incore extent cache if this isn't a realtime file. */ + if (xfs_ifork_is_realtime(ip, whichfork)) + return 0; + + return xrep_rmap_scan_iext(&rf, ifp); + case XFS_DINODE_FMT_META_BTREE: + return xrep_rmap_scan_meta_btree(&rf, ip); } - /* Scan incore extent cache if this isn't a realtime file. 
*/ - if (xfs_ifork_is_realtime(ip, whichfork)) - return 0; - - return xrep_rmap_scan_iext(&rf, ifp); + return 0; } /* @@ -622,7 +691,7 @@ xrep_rmap_walk_inobt( return error; xfs_inobt_btrec_to_irec(mp, rec, &irec); - if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL) + if (xfs_inobt_check_irec(to_perag(cur->bc_group), &irec) != NULL) return -EFSCORRUPTED; agino = irec.ir_startino; @@ -801,7 +870,7 @@ xrep_rmap_find_log_rmaps( { struct xfs_scrub *sc = rr->sc; - if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno)) + if (!xfs_ag_contains_log(sc->mp, pag_agno(sc->sa.pag))) return 0; return xrep_rmap_stash(rr, @@ -976,7 +1045,7 @@ xrep_rmap_try_reserve( { struct xrep_rmap_agfl ra = { .bitmap = freesp_blocks, - .agno = rr->sc->sa.pag->pag_agno, + .agno = pag_agno(rr->sc->sa.pag), }; struct xfs_scrub *sc = rr->sc; struct xrep_newbt_resv *resv, *n; @@ -1272,7 +1341,6 @@ xrep_rmap_build_new_tree( struct xfs_perag *pag = sc->sa.pag; struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_btree_cur *rmap_cur; - xfs_fsblock_t fsbno; int error; /* @@ -1290,9 +1358,9 @@ xrep_rmap_build_new_tree( * rmapbt per-AG reservation, which we will adjust further after * committing the new btree. */ - fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp)); xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE, - fsbno, XFS_AG_RESV_RMAPBT); + xfs_agbno_to_fsb(pag, XFS_RMAP_BLOCK(sc->mp)), + XFS_AG_RESV_RMAPBT); rr->new_btree.bload.get_records = xrep_rmap_get_records; rr->new_btree.bload.claim_block = xrep_rmap_claim_block; rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent; @@ -1553,7 +1621,7 @@ xrep_rmapbt_live_update( if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo)) goto out_unlock; - trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p); + trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p); error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); if (error) @@ -1597,7 +1665,7 @@ xrep_rmap_setup_scan( /* Set up in-memory rmap btree */ error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp, - sc->sa.pag->pag_agno); + pag_agno(sc->sa.pag)); if (error) goto out_mutex; @@ -1612,7 +1680,7 @@ xrep_rmap_setup_scan( */ ASSERT(sc->flags & XCHK_FSGATES_RMAP); xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update); - error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook); + error = xfs_rmap_hook_add(pag_group(sc->sa.pag), &rr->rhook); if (error) goto out_iscan; return 0; @@ -1633,7 +1701,7 @@ xrep_rmap_teardown( struct xfs_scrub *sc = rr->sc; xchk_iscan_abort(&rr->iscan); - xfs_rmap_hook_del(sc->sa.pag, &rr->rhook); + xfs_rmap_hook_del(pag_group(sc->sa.pag), &rr->rhook); xchk_iscan_teardown(&rr->iscan); xfbtree_destroy(&rr->rmap_btree); mutex_destroy(&rr->lock); diff --git a/fs/xfs/scrub/rtb_bitmap.h b/fs/xfs/scrub/rtb_bitmap.h new file mode 100644 index 000000000000..1313ef605511 --- /dev/null +++ b/fs/xfs/scrub/rtb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RTB_BITMAP_H__ +#define __XFS_SCRUB_RTB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_rtblock_t */ + +struct xrtb_bitmap { + struct xbitmap64 rtbitmap; +}; + +static inline void xrtb_bitmap_init(struct xrtb_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->rtbitmap); +} + +static inline void xrtb_bitmap_destroy(struct xrtb_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->rtbitmap); +} + +static inline int xrtb_bitmap_set(struct xrtb_bitmap *bitmap, + xfs_rtblock_t start, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->rtbitmap, start, len); +} + +static inline int xrtb_bitmap_walk(struct xrtb_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->rtbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_RTB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 46583517377f..d5ff8609dbfb 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -9,17 +9,25 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_btree.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bit.h" +#include "xfs_rtgroup.h" #include "xfs_sb.h" +#include "xfs_rmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_exchmaps.h" +#include "xfs_zone_alloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" +#include "scrub/tempexch.h" #include "scrub/rtbitmap.h" +#include "scrub/btree.h" /* Set us up with the realtime metadata locked. */ int @@ -30,10 +38,19 @@ xchk_setup_rtbitmap( struct xchk_rtbitmap *rtb; int error; - rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS); + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + rtb = kzalloc(struct_size(rtb, words, xchk_rtbitmap_wordcnt(sc)), + XCHK_GFP_FLAGS); if (!rtb) return -ENOMEM; sc->buf = rtb; + rtb->sc = sc; + + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); + if (error) + return error; if (xchk_could_repair(sc)) { error = xrep_setup_rtbitmap(sc, rtb); @@ -45,7 +62,7 @@ xchk_setup_rtbitmap( if (error) return error; - error = xchk_install_live_inode(sc, sc->mp->m_rbmip); + error = xchk_install_live_inode(sc, rtg_bitmap(sc->sr.rtg)); if (error) return error; @@ -53,7 +70,9 @@ xchk_setup_rtbitmap( if (error) return error; - xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); + if (error) + return error; /* * Now that we've locked the rtbitmap, we can't race with growfsrt @@ -61,32 +80,65 @@ xchk_setup_rtbitmap( * Hence it is safe to compute and check the geometry values. */ if (mp->m_sb.sb_rblocks) { - rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rtb->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks); rtb->rextslog = xfs_compute_rextslog(rtb->rextents); - rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents); + rtb->rbmblocks = xfs_rtbitmap_blockcount(mp); } + return 0; } -/* Realtime bitmap. */ +/* Per-rtgroup bitmap contents. */ + +/* Cross-reference rtbitmap entries with other metadata. 
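rgb_bitmap.h earlier and rtb_bitmap.h here follow one recipe: embed a generic bitmap in a single-member struct so that a bitmap keyed by one block-number space cannot be handed to code expecting another, at no runtime cost. The idiom in miniature, with invented bitmap32 stand-ins for the xbitmap32 machinery:

#include <stdint.h>

/* Generic bitmap; stand-in for struct xbitmap32. */
struct bitmap32 {
	void		*root;
};
int bitmap32_set(struct bitmap32 *b, uint32_t start, uint32_t len);

/* One wrapper type per block-number space. */
struct ag_bitmap { struct bitmap32 b; };	/* AG block numbers */
struct rg_bitmap { struct bitmap32 b; };	/* rtgroup block numbers */

/*
 * The scalars stay interchangeable, but handing an rg_bitmap to the
 * AG helper (or vice versa) is now a compile error.
 */
static inline int ag_bitmap_set(struct ag_bitmap *m, uint32_t start,
		uint32_t len)
{
	return bitmap32_set(&m->b, start, len);
}

static inline int rg_bitmap_set(struct rg_bitmap *m, uint32_t start,
		uint32_t len)
{
	return bitmap32_set(&m->b, start, len);
}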
*/ +STATIC void +xchk_rtbitmap_xref( + struct xchk_rtbitmap *rtb, + xfs_rtblock_t startblock, + xfs_rtblock_t blockcount) +{ + struct xfs_scrub *sc = rtb->sc; + xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, startblock); + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + if (!sc->sr.rmap_cur) + return; + + xchk_xref_has_no_rt_owner(sc, rgbno, blockcount); + xchk_xref_is_not_rt_shared(sc, rgbno, blockcount); + xchk_xref_is_not_rt_cow_staging(sc, rgbno, blockcount); + + if (rtb->next_free_rgbno < rgbno) + xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno, + rgbno - rtb->next_free_rgbno); + rtb->next_free_rgbno = rgbno + blockcount; +} /* Scrub a free extent record from the realtime bitmap. */ STATIC int xchk_rtbitmap_rec( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) { - struct xfs_scrub *sc = priv; + struct xchk_rtbitmap *rtb = priv; + struct xfs_scrub *sc = rtb->sc; xfs_rtblock_t startblock; xfs_filblks_t blockcount; - startblock = xfs_rtx_to_rtb(mp, rec->ar_startext); - blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount); + startblock = xfs_rtx_to_rtb(rtg, rec->ar_startext); + blockcount = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount); - if (!xfs_verify_rtbext(mp, startblock, blockcount)) + if (!xfs_verify_rtbext(rtg_mount(rtg), startblock, blockcount)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + + xchk_rtbitmap_xref(rtb, startblock, blockcount); + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -ECANCELED; + return 0; } @@ -134,24 +186,27 @@ xchk_rtbitmap_check_extents( return error; } -/* Scrub the realtime bitmap. */ +/* Scrub this group's realtime bitmap. */ int xchk_rtbitmap( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_inode *rbmip = rtg_bitmap(rtg); struct xchk_rtbitmap *rtb = sc->buf; + xfs_rgblock_t last_rgbno; int error; /* Is sb_rextents correct? */ if (mp->m_sb.sb_rextents != rtb->rextents) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } /* Is sb_rextslog correct? */ if (mp->m_sb.sb_rextslog != rtb->rextslog) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } @@ -160,17 +215,17 @@ xchk_rtbitmap( * case can we exceed 4bn bitmap blocks since the super field is a u32. */ if (rtb->rbmblocks > U32_MAX) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } /* The bitmap file length must be aligned to an fsblock. */ - if (mp->m_rbmip->i_disk_size & mp->m_blockmask) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + if (rbmip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } @@ -179,8 +234,8 @@ xchk_rtbitmap( * growfsrt expands the bitmap file before updating sb_rextents, so the * file can be larger than sb_rbmblocks. 
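The next_free_rgbno field above lets the scrubber treat the region between two free extents as implicitly allocated and cross-reference it for rmap coverage. The bookkeeping reduces to this pattern, sketched with hypothetical check callbacks:

#include <stdint.h>

void check_allocated(uint32_t start, uint32_t len);	/* hypothetical xref */
void check_free(uint32_t start, uint32_t len);		/* hypothetical xref */

/*
 * Visit free extents in ascending order; everything between the end of
 * the previous free extent and the start of this one must be in use.
 */
struct gap_tracker { uint32_t next_free; };

static void visit_free_extent(struct gap_tracker *gt, uint32_t start,
			      uint32_t len)
{
	if (gt->next_free < start)
		check_allocated(gt->next_free, start - gt->next_free);
	check_free(start, len);
	gt->next_free = start + len;
}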
 */
-	if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) {
-		xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+	if (rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) {
+		xchk_ino_set_corrupt(sc, rbmip->i_ino);
 		return 0;
 	}
@@ -193,10 +248,20 @@ xchk_rtbitmap(
 	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
 		return error;
-	error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc);
+	rtb->next_free_rgbno = 0;
+	error = xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtbitmap_rec, rtb);
 	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
 		return error;
+	/*
+	 * Check that there are rmap records for all rt extents between the
+	 * end of the last free extent we saw and the last possible extent in
+	 * the rt group.
+	 */
+	last_rgbno = rtg->rtg_extents * mp->m_sb.sb_rextsize - 1;
+	if (rtb->next_free_rgbno < last_rgbno)
+		xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno,
+				last_rgbno - rtb->next_free_rgbno);
 	return 0;
 }
@@ -207,6 +272,7 @@ xchk_xref_is_used_rt_space(
 	xfs_rtblock_t		rtbno,
 	xfs_extlen_t		len)
 {
+	struct xfs_rtgroup	*rtg = sc->sr.rtg;
 	xfs_rtxnum_t		startext;
 	xfs_rtxnum_t		endext;
 	bool			is_free;
@@ -215,15 +281,19 @@ xchk_xref_is_used_rt_space(
 	if (xchk_skip_xref(sc->sm))
 		return;
+	if (xfs_has_zoned(sc->mp)) {
+		if (!xfs_zone_rgbno_is_valid(rtg,
+				xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1))
+			xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino);
+		return;
+	}
+
 	startext = xfs_rtb_to_rtx(sc->mp, rtbno);
 	endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
-	xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
-	error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext,
+	error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext,
 			endext - startext + 1, &is_free);
 	if (!xchk_should_check_xref(sc, &error, NULL))
-		goto out_unlock;
+		return;
 	if (is_free)
-		xchk_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
-out_unlock:
-	xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+		xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
 }
diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h
index 85304ff019e1..fe52b877253d 100644
--- a/fs/xfs/scrub/rtbitmap.h
+++ b/fs/xfs/scrub/rtbitmap.h
@@ -6,17 +6,72 @@
 #ifndef __XFS_SCRUB_RTBITMAP_H__
 #define __XFS_SCRUB_RTBITMAP_H__
+/*
+ * We use an xfile to construct new bitmap blocks for the portion of the
+ * rtbitmap file that we're replacing.  Whereas the ondisk bitmap must be
+ * accessed through the buffer cache, the xfile bitmap supports direct
+ * word-level accesses.  Therefore, we create a small abstraction for linear
+ * access.
+ */
+typedef unsigned long long	xrep_wordoff_t;
+typedef unsigned int		xrep_wordcnt_t;
+
+/* Mask to round an rtx down to the nearest bitmap word. */
+#define XREP_RTBMP_WORDMASK	((1ULL << XFS_NBWORDLOG) - 1)
+
+
 struct xchk_rtbitmap {
+	struct xfs_scrub	*sc;
+
 	uint64_t		rextents;
 	uint64_t		rbmblocks;
 	unsigned int		rextslog;
 	unsigned int		resblks;
+
+	/* The next free rt group block number that we expect to see. */
+	xfs_rgblock_t		next_free_rgbno;
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+	/* stuff for staging a new bitmap */
+	struct xfs_rtalloc_args	args;
+	struct xrep_tempexch	tempexch;
+#endif
+
+	/* The next rtgroup block we expect to see during our rtrmapbt walk. */
+	xfs_rgblock_t		next_rgbno;
+
+	/* rtgroup lock flags */
+	unsigned int		rtglock_flags;
+
+	/* rtword position of xfile as we write buffers to disk. */
+	xrep_wordoff_t		prep_wordoff;
+
+	/* In-memory rtbitmap for repair.
*/ + union xfs_rtword_raw words[]; }; #ifdef CONFIG_XFS_ONLINE_REPAIR int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb); + +/* + * How big should the words[] buffer be? + * + * For repairs, we want a full fsblock worth of space so that we can memcpy a + * buffer full of 1s into the xfile bitmap. The xfile bitmap doesn't have + * rtbitmap block headers, so we don't use blockwsize. Scrub doesn't use the + * words buffer at all. + */ +static inline unsigned int +xchk_rtbitmap_wordcnt( + struct xfs_scrub *sc) +{ + if (xchk_could_repair(sc)) + return sc->mp->m_sb.sb_blocksize >> XFS_WORDLOG; + return 0; +} #else # define xrep_setup_rtbitmap(sc, rtb) (0) +# define xchk_rtbitmap_wordcnt(sc) (0) #endif /* CONFIG_XFS_ONLINE_REPAIR */ #endif /* __XFS_SCRUB_RTBITMAP_H__ */ diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c index 0fef98e9f834..203a1a97c502 100644 --- a/fs/xfs/scrub/rtbitmap_repair.c +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -12,32 +12,66 @@ #include "xfs_btree.h" #include "xfs_log_format.h" #include "xfs_trans.h" +#include "xfs_rtalloc.h" #include "xfs_inode.h" #include "xfs_bit.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" +#include "xfs_rmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_exchmaps.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_extent_busy.h" +#include "xfs_refcount.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/xfile.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/reap.h" #include "scrub/rtbitmap.h" -/* Set up to repair the realtime bitmap file metadata. */ +/* rt bitmap content repairs */ + +/* Set up to repair the realtime bitmap for this group. */ int xrep_setup_rtbitmap( struct xfs_scrub *sc, struct xchk_rtbitmap *rtb) { struct xfs_mount *mp = sc->mp; - unsigned long long blocks = 0; + char *descr; + unsigned long long blocks = mp->m_sb.sb_rbmblocks; + int error; + + error = xrep_tempfile_create(sc, S_IFREG); + if (error) + return error; + + /* Create an xfile to hold our reconstructed bitmap. */ + descr = xchk_xfile_rtgroup_descr(sc, "bitmap file"); + error = xfile_create(descr, blocks * mp->m_sb.sb_blocksize, &sc->xfile); + kfree(descr); + if (error) + return error; /* - * Reserve enough blocks to write out a completely new bmbt for a - * maximally fragmented bitmap file. We do not hold the rtbitmap - * ILOCK yet, so this is entirely speculative. + * Reserve enough blocks to write out a completely new bitmap file, + * plus twice as many blocks as we would need if we can only allocate + * one block per data fork mapping. This should cover the + * preallocation of the temporary file and exchanging the extent + * mappings. + * + * We cannot use xfs_exchmaps_estimate because we have not yet + * constructed the replacement bitmap and therefore do not know how + * many extents it will use. By the time we do, we will have a dirty + * transaction (which we cannot drop because we cannot drop the + * rtbitmap ILOCK) and cannot ask for more reservation. 
 */
-	blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks);
+	blocks += xfs_bmbt_calc_size(mp, blocks) * 2;
 	if (blocks > UINT_MAX)
 		return -EOPNOTSUPP;
@@ -45,6 +79,325 @@ xrep_setup_rtbitmap(
 	return 0;
 }
+static inline xrep_wordoff_t
+rtx_to_wordoff(
+	struct xfs_mount	*mp,
+	xfs_rtxnum_t		rtx)
+{
+	return rtx >> XFS_NBWORDLOG;
+}
+
+static inline xrep_wordcnt_t
+rtxlen_to_wordcnt(
+	xfs_rtxlen_t	rtxlen)
+{
+	return rtxlen >> XFS_NBWORDLOG;
+}
+
+/* Helper functions to record rtwords in an xfile. */
+
+static inline int
+xfbmp_load(
+	struct xchk_rtbitmap	*rtb,
+	xrep_wordoff_t		wordoff,
+	xfs_rtword_t		*word)
+{
+	union xfs_rtword_raw	urk;
+	int			error;
+
+	ASSERT(xfs_has_rtgroups(rtb->sc->mp));
+
+	error = xfile_load(rtb->sc->xfile, &urk,
+			sizeof(union xfs_rtword_raw),
+			wordoff << XFS_WORDLOG);
+	if (error)
+		return error;
+
+	*word = be32_to_cpu(urk.rtg);
+	return 0;
+}
+
+static inline int
+xfbmp_store(
+	struct xchk_rtbitmap	*rtb,
+	xrep_wordoff_t		wordoff,
+	const xfs_rtword_t	word)
+{
+	union xfs_rtword_raw	urk;
+
+	ASSERT(xfs_has_rtgroups(rtb->sc->mp));
+
+	urk.rtg = cpu_to_be32(word);
+	return xfile_store(rtb->sc->xfile, &urk,
+			sizeof(union xfs_rtword_raw),
+			wordoff << XFS_WORDLOG);
+}
+
+static inline int
+xfbmp_copyin(
+	struct xchk_rtbitmap	*rtb,
+	xrep_wordoff_t		wordoff,
+	const union xfs_rtword_raw	*word,
+	xrep_wordcnt_t		nr_words)
+{
+	return xfile_store(rtb->sc->xfile, word, nr_words << XFS_WORDLOG,
+			wordoff << XFS_WORDLOG);
+}
+
+static inline int
+xfbmp_copyout(
+	struct xchk_rtbitmap	*rtb,
+	xrep_wordoff_t		wordoff,
+	union xfs_rtword_raw	*word,
+	xrep_wordcnt_t		nr_words)
+{
+	return xfile_load(rtb->sc->xfile, word, nr_words << XFS_WORDLOG,
+			wordoff << XFS_WORDLOG);
+}
+
+/* Perform a logical OR operation on an rtword in the incore bitmap. */
+static int
+xrep_rtbitmap_or(
+	struct xchk_rtbitmap	*rtb,
+	xrep_wordoff_t		wordoff,
+	xfs_rtword_t		mask)
+{
+	xfs_rtword_t		word;
+	int			error;
+
+	error = xfbmp_load(rtb, wordoff, &word);
+	if (error)
+		return error;
+
+	trace_xrep_rtbitmap_or(rtb->sc->mp, wordoff, mask, word);
+
+	return xfbmp_store(rtb, wordoff, word | mask);
+}
+
+/*
+ * Mark as free every rt extent between the next rt block we expected to see
+ * in the rtrmap records and the given rt block.
+ */
+STATIC int
+xrep_rtbitmap_mark_free(
+	struct xchk_rtbitmap	*rtb,
+	xfs_rgblock_t		rgbno)
+{
+	struct xfs_mount	*mp = rtb->sc->mp;
+	struct xchk_rt		*sr = &rtb->sc->sr;
+	struct xfs_rtgroup	*rtg = sr->rtg;
+	xfs_rtxnum_t		startrtx;
+	xfs_rtxnum_t		nextrtx;
+	xrep_wordoff_t		wordoff, nextwordoff;
+	unsigned int		bit;
+	unsigned int		bufwsize;
+	xfs_extlen_t		mod;
+	xfs_rtword_t		mask;
+	enum xbtree_recpacking	outcome;
+	int			error;
+
+	if (!xfs_verify_rgbext(rtg, rtb->next_rgbno, rgbno - rtb->next_rgbno))
+		return -EFSCORRUPTED;
+
+	/*
+	 * Convert rt blocks to rt extents.  The block range we find must be
+	 * aligned to an rtextent boundary on both ends.
+	 */
+	startrtx = xfs_rgbno_to_rtx(mp, rtb->next_rgbno);
+	mod = xfs_rgbno_to_rtxoff(mp, rtb->next_rgbno);
+	if (mod)
+		return -EFSCORRUPTED;
+
+	nextrtx = xfs_rgbno_to_rtx(mp, rgbno - 1) + 1;
+	mod = xfs_rgbno_to_rtxoff(mp, rgbno - 1);
+	if (mod != mp->m_sb.sb_rextsize - 1)
+		return -EFSCORRUPTED;
+
+	/* Must not be shared or CoW staging.
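The masking in xrep_rtbitmap_mark_free() above handles a partial head word, a partial tail word, and whole words in between. The arithmetic can be checked in isolation; a userspace sketch assuming 32-bit bitmap words:

#include <stdint.h>

#define NBWORD	32u	/* bits per bitmap word, as with xfs_rtword_t */

/* Set bits [start, end) in a word array with head/tail masking. */
static void set_bit_range(uint32_t *words, uint64_t start, uint64_t end)
{
	unsigned int bit = start % NBWORD;

	/* Partial head word: bits [bit, lastbit) of words[start / NBWORD]. */
	if (bit) {
		unsigned int lastbit = NBWORD;

		if (end - start < NBWORD - bit)
			lastbit = bit + (unsigned int)(end - start);
		words[start / NBWORD] |= ((1u << (lastbit - bit)) - 1) << bit;
		start += lastbit - bit;
		if (start >= end)
			return;
	}

	/* Partial tail word: bits [0, bit) of words[end / NBWORD]. */
	bit = end % NBWORD;
	if (bit) {
		words[end / NBWORD] |= (1u << bit) - 1;
		end -= bit;
		if (start >= end)
			return;
	}

	/* Whole words in between. */
	while (start < end) {
		words[start / NBWORD] = ~0u;
		start += NBWORD;
	}
}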
*/ + if (sr->refc_cur) { + error = xfs_refcount_has_records(sr->refc_cur, + XFS_REFC_DOMAIN_SHARED, rtb->next_rgbno, + rgbno - rtb->next_rgbno, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + error = xfs_refcount_has_records(sr->refc_cur, + XFS_REFC_DOMAIN_COW, rtb->next_rgbno, + rgbno - rtb->next_rgbno, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + } + + trace_xrep_rtbitmap_record_free(mp, startrtx, nextrtx - 1); + + /* Set bits as needed to round startrtx up to the nearest word. */ + bit = startrtx & XREP_RTBMP_WORDMASK; + if (bit) { + xfs_rtblock_t len = nextrtx - startrtx; + unsigned int lastbit; + + lastbit = min(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + + error = xrep_rtbitmap_or(rtb, rtx_to_wordoff(mp, startrtx), + mask); + if (error || lastbit - bit == len) + return error; + startrtx += XFS_NBWORD - bit; + } + + /* Set bits as needed to round nextrtx down to the nearest word. */ + bit = nextrtx & XREP_RTBMP_WORDMASK; + if (bit) { + mask = ((xfs_rtword_t)1 << bit) - 1; + + error = xrep_rtbitmap_or(rtb, rtx_to_wordoff(mp, nextrtx), + mask); + if (error || startrtx + bit == nextrtx) + return error; + nextrtx -= bit; + } + + trace_xrep_rtbitmap_record_free_bulk(mp, startrtx, nextrtx - 1); + + /* Set all the words in between, up to a whole fs block at once. */ + wordoff = rtx_to_wordoff(mp, startrtx); + nextwordoff = rtx_to_wordoff(mp, nextrtx); + bufwsize = mp->m_sb.sb_blocksize >> XFS_WORDLOG; + + while (wordoff < nextwordoff) { + xrep_wordoff_t rem; + xrep_wordcnt_t wordcnt; + + wordcnt = min_t(xrep_wordcnt_t, nextwordoff - wordoff, + bufwsize); + + /* + * Try to keep us aligned to the rtwords buffer to reduce the + * number of xfile writes. + */ + rem = wordoff & (bufwsize - 1); + if (rem) + wordcnt = min_t(xrep_wordcnt_t, wordcnt, + bufwsize - rem); + + error = xfbmp_copyin(rtb, wordoff, rtb->words, wordcnt); + if (error) + return error; + + wordoff += wordcnt; + } + + return 0; +} + +/* Set free space in the rtbitmap based on rtrmapbt records. */ +STATIC int +xrep_rtbitmap_walk_rtrmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xchk_rtbitmap *rtb = priv; + int error = 0; + + if (xchk_should_terminate(rtb->sc, &error)) + return error; + + if (rtb->next_rgbno < rec->rm_startblock) { + error = xrep_rtbitmap_mark_free(rtb, rec->rm_startblock); + if (error) + return error; + } + + rtb->next_rgbno = max(rtb->next_rgbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* + * Walk the rtrmapbt to find all the gaps between records, and mark the gaps + * in the realtime bitmap that we're computing. + */ +STATIC int +xrep_rtbitmap_find_freespace( + struct xchk_rtbitmap *rtb) +{ + struct xfs_scrub *sc = rtb->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_rtgroup *rtg = sc->sr.rtg; + uint64_t blockcount; + int error; + + /* Prepare a buffer of ones so that we can accelerate bulk setting. */ + memset(rtb->words, 0xFF, mp->m_sb.sb_blocksize); + + xrep_rtgroup_btcur_init(sc, &sc->sr); + error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_rtbitmap_walk_rtrmap, + rtb); + if (error) + goto out; + + /* + * Mark as free every possible rt extent from the last one we saw to + * the end of the rt group. 
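Deriving free space from the rtrmapbt walk above is set complementation: records arrive sorted by start block, so each gap between the running high-water mark and the next record's start must be free. The inversion in miniature, with a hypothetical mark_free() sink:

#include <stdint.h>

struct rec { uint32_t start, len; };

void mark_free(uint32_t start, uint32_t len);	/* hypothetical sink */

/*
 * Given records sorted by start (possibly overlapping), emit the gaps
 * in [0, total) that no record covers, keeping a high-water mark the
 * way the walk above keeps next_rgbno.
 */
static void invert_coverage(const struct rec *recs, int nr, uint32_t total)
{
	uint32_t next = 0;

	for (int i = 0; i < nr; i++) {
		if (next < recs[i].start)
			mark_free(next, recs[i].start - next);
		if (recs[i].start + recs[i].len > next)
			next = recs[i].start + recs[i].len;
	}
	if (next < total)
		mark_free(next, total - next);
}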
+ */ + blockcount = rtg->rtg_extents * mp->m_sb.sb_rextsize; + if (rtb->next_rgbno < blockcount) { + error = xrep_rtbitmap_mark_free(rtb, blockcount); + if (error) + goto out; + } + +out: + xchk_rtgroup_btcur_free(&sc->sr); + return error; +} + +static int +xrep_rtbitmap_prep_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp, + void *data) +{ + struct xchk_rtbitmap *rtb = data; + struct xfs_mount *mp = sc->mp; + union xfs_rtword_raw *ondisk; + int error; + + rtb->args.mp = sc->mp; + rtb->args.tp = sc->tp; + rtb->args.rbmbp = bp; + ondisk = xfs_rbmblock_wordptr(&rtb->args, 0); + rtb->args.rbmbp = NULL; + + error = xfbmp_copyout(rtb, rtb->prep_wordoff, ondisk, + mp->m_blockwsize); + if (error) + return error; + + if (xfs_has_rtgroups(sc->mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC); + hdr->rt_owner = cpu_to_be64(sc->ip->i_ino); + hdr->rt_blkno = cpu_to_be64(xfs_buf_daddr(bp)); + hdr->rt_lsn = 0; + uuid_copy(&hdr->rt_uuid, &sc->mp->m_sb.sb_meta_uuid); + bp->b_ops = &xfs_rtbitmap_buf_ops; + } else { + bp->b_ops = &xfs_rtbuf_ops; + } + + rtb->prep_wordoff += mp->m_blockwsize; + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTBITMAP_BUF); + return 0; +} + /* * Make sure that the given range of the data fork of the realtime file is * mapped to written blocks. The caller must ensure that the inode is joined @@ -160,9 +513,18 @@ xrep_rtbitmap( { struct xchk_rtbitmap *rtb = sc->buf; struct xfs_mount *mp = sc->mp; + struct xfs_group *xg = rtg_group(sc->sr.rtg); unsigned long long blocks = 0; + unsigned int busy_gen; int error; + /* We require the realtime rmapbt to rebuild anything. */ + if (!xfs_has_rtrmapbt(sc->mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + /* Impossibly large rtbitmap means we can't touch the filesystem. */ if (rtb->rbmblocks > U32_MAX) return 0; @@ -195,6 +557,79 @@ xrep_rtbitmap( if (error) return error; - /* Fix inconsistent bitmap geometry */ - return xrep_rtbitmap_geometry(sc, rtb); + /* + * Fix inconsistent bitmap geometry. This function returns with a + * clean scrub transaction. + */ + error = xrep_rtbitmap_geometry(sc, rtb); + if (error) + return error; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. + */ + if (!xfs_extent_busy_list_empty(xg, &busy_gen)) { + error = xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0); + if (error) + return error; + } + + /* + * Generate the new rtbitmap data. We don't need the rtbmp information + * once this call is finished. + */ + error = xrep_rtbitmap_find_freespace(rtb); + if (error) + return error; + + /* + * Try to take ILOCK_EXCL of the temporary file. We had better be the + * only ones holding onto this inode, but we can't block while holding + * the rtbitmap file's ILOCK_EXCL. + */ + while (!xrep_tempfile_ilock_nowait(sc)) { + if (xchk_should_terminate(sc, &error)) + return error; + delay(1); + } + + /* + * Make sure we have space allocated for the part of the bitmap + * file that corresponds to this group. We already joined sc->ip. + */ + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + error = xrep_tempfile_prealloc(sc, 0, rtb->rbmblocks); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* Copy the bitmap file that we generated. 
*/ + error = xrep_tempfile_copyin(sc, 0, rtb->rbmblocks, + xrep_rtbitmap_prep_buf, rtb); + if (error) + return error; + error = xrep_tempfile_set_isize(sc, + XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)); + if (error) + return error; + + /* + * Now exchange the data fork contents. We're done with the temporary + * buffer, so we can reuse it for the tempfile exchmaps information. + */ + error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, 0, + rtb->rbmblocks, &rtb->tempexch); + if (error) + return error; + + error = xrep_tempexch_contents(sc, &rtb->tempexch); + if (error) + return error; + + /* Free the old rtbitmap blocks if they're not in use. */ + return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); } diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c new file mode 100644 index 000000000000..4c5dffc73641 --- /dev/null +++ b/fs/xfs/scrub/rtrefcount.c @@ -0,0 +1,661 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_btree.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_inode.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_metafile.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_rtalloc.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/repair.h" + +/* Set us up with the realtime refcount metadata locked. */ +int +xchk_setup_rtrefcountbt( + struct xfs_scrub *sc) +{ + int error; + + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + if (xchk_could_repair(sc)) { + error = xrep_setup_rtrefcountbt(sc); + if (error) + return error; + } + + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); + if (error) + return error; + + error = xchk_setup_rt(sc); + if (error) + return error; + + error = xchk_install_live_inode(sc, rtg_refcount(sc->sr.rtg)); + if (error) + return error; + + return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); +} + +/* Realtime Reference count btree scrubber. */ + +/* + * Confirming Reference Counts via Reverse Mappings + * + * We want to count the reverse mappings overlapping a refcount record + * (bno, len, refcount), allowing for the possibility that some of the + * overlap may come from smaller adjoining reverse mappings, while some + * comes from single extents which overlap the range entirely. The + * outer loop is as follows: + * + * 1. For all reverse mappings overlapping the refcount extent, + * a. If a given rmap completely overlaps, mark it as seen. + * b. Otherwise, record the fragment (in agbno order) for later + * processing. + * + * Once we've seen all the rmaps, we know that for all blocks in the + * refcount record we want to find $refcount owners and we've already + * visited $seen extents that overlap all the blocks. Therefore, we + * need to find ($refcount - $seen) owners for every block in the + * extent; call that quantity $target_nr. Proceed as follows: + * + * 2. Pull the first $target_nr fragments from the list; all of them + * should start at or before the start of the extent. + * Call this subset of fragments the working set. + * 3. Until there are no more unprocessed fragments, + * a. Find the shortest fragments in the set and remove them. + * b. 
Note the block number of the end of these fragments.
+ *    c. Pull the same number of fragments from the list.  All of these
+ *       fragments should start at the block number recorded in the
+ *       previous step.
+ *    d. Put those fragments in the set.
+ * 4. Check that there are $target_nr fragments remaining in the list,
+ *    and that they all end at or beyond the end of the refcount extent.
+ *
+ * If the refcount is correct, all the check conditions in the algorithm
+ * should always hold true.  If not, the refcount is incorrect.
+ */
+struct xchk_rtrefcnt_frag {
+	struct list_head	list;
+	struct xfs_rmap_irec	rm;
+};
+
+struct xchk_rtrefcnt_check {
+	struct xfs_scrub	*sc;
+	struct list_head	fragments;
+
+	/* refcount extent we're examining */
+	xfs_rgblock_t		bno;
+	xfs_extlen_t		len;
+	xfs_nlink_t		refcount;
+
+	/* number of owners seen */
+	xfs_nlink_t		seen;
+};
+
+/*
+ * Decide if the given rmap is large enough that we can redeem it
+ * towards refcount verification now, or if it's a fragment, in
+ * which case we'll hang onto it in the hopes that we'll later
+ * discover that we've collected exactly the correct number of
+ * fragments as the rtrefcountbt says we should have.
+ */
+STATIC int
+xchk_rtrefcountbt_rmap_check(
+	struct xfs_btree_cur	*cur,
+	const struct xfs_rmap_irec	*rec,
+	void			*priv)
+{
+	struct xchk_rtrefcnt_check	*refchk = priv;
+	struct xchk_rtrefcnt_frag	*frag;
+	xfs_rgblock_t		rm_last;
+	xfs_rgblock_t		rc_last;
+	int			error = 0;
+
+	if (xchk_should_terminate(refchk->sc, &error))
+		return error;
+
+	rm_last = rec->rm_startblock + rec->rm_blockcount - 1;
+	rc_last = refchk->bno + refchk->len - 1;
+
+	/* Confirm that a single-owner refc extent is a CoW stage. */
+	if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) {
+		xchk_btree_xref_set_corrupt(refchk->sc, cur, 0);
+		return 0;
+	}
+
+	if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) {
+		/*
+		 * The rmap overlaps the refcount record, so we can confirm
+		 * one refcount owner seen.
+		 */
+		refchk->seen++;
+	} else {
+		/*
+		 * This rmap covers only part of the refcount record, so
+		 * save the fragment for later processing.  If the rmapbt
+		 * is healthy each rmap_irec we see will be in rgbno order
+		 * so we don't need insertion sort here.
+		 */
+		frag = kmalloc(sizeof(struct xchk_rtrefcnt_frag),
+				XCHK_GFP_FLAGS);
+		if (!frag)
+			return -ENOMEM;
+		memcpy(&frag->rm, rec, sizeof(frag->rm));
+		list_add_tail(&frag->list, &refchk->fragments);
+	}
+
+	return 0;
+}
+
+/*
+ * Given a bunch of rmap fragments, iterate through them, keeping
+ * a running tally of the refcount.  If this ever deviates from
+ * what we expect (which is the rtrefcountbt's refcount minus the
+ * number of extents that totally covered the rtrefcountbt extent),
+ * we have a rtrefcountbt error.
+ */
+STATIC void
+xchk_rtrefcountbt_process_rmap_fragments(
+	struct xchk_rtrefcnt_check	*refchk)
+{
+	struct list_head	worklist;
+	struct xchk_rtrefcnt_frag	*frag;
+	struct xchk_rtrefcnt_frag	*n;
+	xfs_rgblock_t		bno;
+	xfs_rgblock_t		rbno;
+	xfs_rgblock_t		next_rbno;
+	xfs_nlink_t		nr;
+	xfs_nlink_t		target_nr;
+
+	target_nr = refchk->refcount - refchk->seen;
+	if (target_nr == 0)
+		return;
+
+	/*
+	 * There are (refchk->refcount - refchk->seen)
+	 * references we haven't found yet.  Pull that many off the
+	 * fragment list and figure out where the smallest rmap ends
+	 * (and therefore the next rmap should start).  All the rmaps
+	 * we pull off should start at or before the beginning of the
+	 * refcount record's range.
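The fragment worklist above verifies, without per-block iteration, that exactly rc_refcount reverse mappings cover every block of the refcount record. The invariant itself is easiest to state brute force; a reference checker for illustration only, not how the kernel does it:

#include <stdbool.h>
#include <stdint.h>

struct rmap { uint32_t start, len; };

/*
 * Brute force: every block in [bno, bno + len) must be covered by
 * exactly @refcount of the rmaps.  The worklist algorithm reaches the
 * same verdict without visiting individual blocks.
 */
static bool refcount_matches(const struct rmap *rmaps, int nr,
			     uint32_t bno, uint32_t len, uint32_t refcount)
{
	for (uint32_t b = bno; b < bno + len; b++) {
		uint32_t seen = 0;

		for (int i = 0; i < nr; i++)
			if (rmaps[i].start <= b &&
			    b < rmaps[i].start + rmaps[i].len)
				seen++;
		if (seen != refcount)
			return false;
	}
	return true;
}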
+ */ + INIT_LIST_HEAD(&worklist); + rbno = NULLRGBLOCK; + + /* Make sure the fragments actually /are/ in bno order. */ + bno = 0; + list_for_each_entry(frag, &refchk->fragments, list) { + if (frag->rm.rm_startblock < bno) + goto done; + bno = frag->rm.rm_startblock; + } + + /* + * Find all the rmaps that start at or before the refc extent, + * and put them on the worklist. + */ + nr = 0; + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + if (frag->rm.rm_startblock > refchk->bno || nr > target_nr) + break; + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno < rbno) + rbno = bno; + list_move_tail(&frag->list, &worklist); + nr++; + } + + /* + * We should have found exactly $target_nr rmap fragments starting + * at or before the refcount extent. + */ + if (nr != target_nr) + goto done; + + while (!list_empty(&refchk->fragments)) { + /* Discard any fragments ending at rbno from the worklist. */ + nr = 0; + next_rbno = NULLRGBLOCK; + list_for_each_entry_safe(frag, n, &worklist, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno != rbno) { + if (bno < next_rbno) + next_rbno = bno; + continue; + } + list_del(&frag->list); + kfree(frag); + nr++; + } + + /* Try to add nr rmaps starting at rbno to the worklist. */ + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (frag->rm.rm_startblock != rbno) + goto done; + list_move_tail(&frag->list, &worklist); + if (next_rbno > bno) + next_rbno = bno; + nr--; + if (nr == 0) + break; + } + + /* + * If we get here and nr > 0, this means that we added fewer + * items to the worklist than we discarded because the fragment + * list ran out of items. Therefore, we cannot maintain the + * required refcount. Something is wrong, so we're done. + */ + if (nr) + goto done; + + rbno = next_rbno; + } + + /* + * Make sure the last extent we processed ends at or beyond + * the end of the refcount extent. + */ + if (rbno < refchk->bno + refchk->len) + goto done; + + /* Actually record us having seen the remaining refcount. */ + refchk->seen = refchk->refcount; +done: + /* Delete fragments and work list. */ + list_for_each_entry_safe(frag, n, &worklist, list) { + list_del(&frag->list); + kfree(frag); + } + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + list_del(&frag->list); + kfree(frag); + } +} + +/* Use the rmap entries covering this extent to verify the refcount. */ +STATIC void +xchk_rtrefcountbt_xref_rmap( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) +{ + struct xchk_rtrefcnt_check refchk = { + .sc = sc, + .bno = irec->rc_startblock, + .len = irec->rc_blockcount, + .refcount = irec->rc_refcount, + .seen = 0, + }; + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + struct xchk_rtrefcnt_frag *frag; + struct xchk_rtrefcnt_frag *n; + int error; + + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + /* Cross-reference with the rmapbt to confirm the refcount. 
*/ + memset(&low, 0, sizeof(low)); + low.rm_startblock = irec->rc_startblock; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1; + + INIT_LIST_HEAD(&refchk.fragments); + error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high, + xchk_rtrefcountbt_rmap_check, &refchk); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + goto out_free; + + xchk_rtrefcountbt_process_rmap_fragments(&refchk); + if (irec->rc_refcount != refchk.seen) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + +out_free: + list_for_each_entry_safe(frag, n, &refchk.fragments, list) { + list_del(&frag->list); + kfree(frag); + } +} + +/* Cross-reference with the other btrees. */ +STATIC void +xchk_rtrefcountbt_xref( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_xref_is_used_rt_space(sc, + xfs_rgbno_to_rtb(sc->sr.rtg, irec->rc_startblock), + irec->rc_blockcount); + xchk_rtrefcountbt_xref_rmap(sc, irec); +} + +struct xchk_rtrefcbt_records { + /* Previous refcount record. */ + struct xfs_refcount_irec prev_rec; + + /* The next rtgroup block where we aren't expecting shared extents. */ + xfs_rgblock_t next_unshared_rgbno; + + /* Number of CoW blocks we expect. */ + xfs_extlen_t cow_blocks; + + /* Was the last record a shared or CoW staging extent? */ + enum xfs_refc_domain prev_domain; +}; + +static inline bool +xchk_rtrefcount_mergeable( + struct xchk_rtrefcbt_records *rrc, + const struct xfs_refcount_irec *r2) +{ + const struct xfs_refcount_irec *r1 = &rrc->prev_rec; + + /* Ignore if prev_rec is not yet initialized. */ + if (r1->rc_blockcount == 0) + return false; + + if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock) + return false; + if (r1->rc_refcount != r2->rc_refcount) + return false; + if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > + XFS_REFC_LEN_MAX) + return false; + + return true; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_rtrefcountbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_rtrefcbt_records *rrc, + const struct xfs_refcount_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_rtrefcount_mergeable(rrc, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec)); +} + +STATIC int +xchk_rtrefcountbt_rmap_check_gap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + xfs_rgblock_t *next_bno = priv; + + if (*next_bno != NULLRGBLOCK && rec->rm_startblock < *next_bno) + return -ECANCELED; + + *next_bno = rec->rm_startblock + rec->rm_blockcount; + return 0; +} + +/* + * Make sure that a gap in the reference count records does not correspond to + * overlapping records (i.e. shared extents) in the reverse mappings.
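+ *
+ * For example (hypothetical numbers): if the previous shared record
+ * ended at rgbno 50 and the next begins at rgbno 80, every rmap
+ * returned for the gap [50, 80) must start at or after the end of the
+ * rmap before it.  Two rmaps overlapping the same block in that gap
+ * would be a shared extent with no refcount record, so the callback
+ * returns -ECANCELED and the caller flags cross-referencing corruption.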
+ */ +static inline void +xchk_rtrefcountbt_xref_gaps( + struct xfs_scrub *sc, + struct xchk_rtrefcbt_records *rrc, + xfs_rtblock_t bno) +{ + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + xfs_rgblock_t next_bno = NULLRGBLOCK; + int error; + + if (bno <= rrc->next_unshared_rgbno || !sc->sr.rmap_cur || + xchk_skip_xref(sc->sm)) + return; + + memset(&low, 0, sizeof(low)); + low.rm_startblock = rrc->next_unshared_rgbno; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = bno - 1; + + error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high, + xchk_rtrefcountbt_rmap_check_gap, &next_bno); + if (error == -ECANCELED) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + else + xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur); +} + +/* Scrub a rtrefcountbt record. */ +STATIC int +xchk_rtrefcountbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + struct xchk_rtrefcbt_records *rrc = bs->private; + struct xfs_refcount_irec irec; + u32 mod; + + xfs_refcount_btrec_to_irec(rec, &irec); + if (xfs_rtrefcount_check_irec(to_rtg(bs->cur->bc_group), &irec) != + NULL) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + /* We can only share full rt extents. */ + mod = xfs_rgbno_to_rtxoff(mp, irec.rc_startblock); + if (mod) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + mod = xfs_extlen_to_rtxmod(mp, irec.rc_blockcount); + if (mod) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (irec.rc_domain == XFS_REFC_DOMAIN_COW) + rrc->cow_blocks += irec.rc_blockcount; + + /* Shared records always come before CoW records. */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED && + rrc->prev_domain == XFS_REFC_DOMAIN_COW) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + rrc->prev_domain = irec.rc_domain; + + xchk_rtrefcountbt_check_mergeable(bs, rrc, &irec); + xchk_rtrefcountbt_xref(bs->sc, &irec); + + /* + * If this is a record for a shared extent, check that all blocks + * between the previous record and this one have at most one reverse + * mapping. + */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) { + xchk_rtrefcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock); + rrc->next_unshared_rgbno = irec.rc_startblock + + irec.rc_blockcount; + } + + return 0; +} + +/* Make sure we have as many refc blocks as the rmap says. */ +STATIC void +xchk_refcount_xref_rmap( + struct xfs_scrub *sc, + const struct xfs_owner_info *btree_oinfo, + xfs_extlen_t cow_blocks) +{ + xfs_filblks_t refcbt_blocks = 0; + xfs_filblks_t blocks; + int error; + + if (!sc->sr.rmap_cur || !sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + /* Check that we saw as many refcbt blocks as the rmap knows about. */ + error = xfs_btree_count_blocks(sc->sr.refc_cur, &refcbt_blocks); + if (!xchk_btree_process_error(sc, sc->sr.refc_cur, 0, &error)) + return; + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, btree_oinfo, + &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != refcbt_blocks) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + + /* Check that we saw as many cow blocks as the rmap knows about. */ + error = xchk_count_rmap_ownedby_ag(sc, sc->sr.rmap_cur, + &XFS_RMAP_OINFO_COW, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + return; + if (blocks != cow_blocks) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); +} + +/* Scrub the realtime refcount btree.
*/ +int +xchk_rtrefcountbt( + struct xfs_scrub *sc) +{ + struct xfs_owner_info btree_oinfo; + struct xchk_rtrefcbt_records rrc = { + .cow_blocks = 0, + .next_unshared_rgbno = 0, + .prev_domain = XFS_REFC_DOMAIN_SHARED, + }; + int error; + + error = xchk_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + xfs_rmap_ino_bmbt_owner(&btree_oinfo, rtg_refcount(sc->sr.rtg)->i_ino, + XFS_DATA_FORK); + error = xchk_btree(sc, sc->sr.refc_cur, xchk_rtrefcountbt_rec, + &btree_oinfo, &rrc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + /* + * Check that all blocks between the last refcount > 1 record and the + * end of the rt volume have at most one reverse mapping. + */ + xchk_rtrefcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_rblocks); + + xchk_refcount_xref_rmap(sc, &btree_oinfo, rrc.cow_blocks); + + return 0; +} + +/* xref check that a cow staging extent is marked in the rtrefcountbt. */ +void +xchk_xref_is_rt_cow_staging( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + struct xfs_refcount_irec rc; + int has_refcount; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + /* Find the CoW staging extent. */ + error = xfs_refcount_lookup_le(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW, + bno, &has_refcount); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (!has_refcount) { + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); + return; + } + + error = xfs_refcount_get_rec(sc->sr.refc_cur, &rc, &has_refcount); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (!has_refcount) { + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); + return; + } + + /* CoW lookup returned a shared extent record? */ + if (rc.rc_domain != XFS_REFC_DOMAIN_COW) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); + + /* Must be at least as long as what was passed in */ + if (rc.rc_blockcount < len) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} + +/* + * xref check that the extent is not shared. Only file data blocks + * can have multiple owners. + */ +void +xchk_xref_is_not_rt_shared( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_refcount_has_records(sc->sr.refc_cur, + XFS_REFC_DOMAIN_SHARED, bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} + +/* xref check that the extent is not being used for CoW staging. */ +void +xchk_xref_is_not_rt_cow_staging( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_refcount_has_records(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW, + bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c new file mode 100644 index 000000000000..983362447826 --- /dev/null +++ b/fs/xfs/scrub/rtrefcount_repair.c @@ -0,0 +1,761 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J.
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_rtalloc.h" +#include "xfs_ag.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" +#include "scrub/rcbag.h" + +/* + * Rebuilding the Reference Count Btree + * ==================================== + * + * This algorithm is "borrowed" from xfs_repair. Imagine the rmap + * entries as rectangles representing extents of physical blocks, and + * that the rectangles can be laid down to allow them to overlap each + * other; then we know that we must emit a refcnt btree entry wherever + * the amount of overlap changes, i.e. the emission stimulus is + * level-triggered: + * + * - --- + * -- ----- ---- --- ------ + * -- ---- ----------- ---- --------- + * -------------------------------- ----------- + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 + * + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). + * + * Note that in the actual refcnt btree we don't store the refcount < 2 + * cases because the bnobt tells us which blocks are free; single-use + * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt + * supports storing multiple entries covering a given block we could + * theoretically dispense with the refcntbt and simply count rmaps, but + * that's inefficient in the (hot) write path, so we'll take the cost of + * the extra tree to save time. Also there's no guarantee that rmap + * will be enabled. + * + * Given an array of rmaps sorted by physical block number, a starting + * physical block (sp), a bag to hold rmaps that cover sp, and the next + * physical block where the level changes (np), we can reconstruct the + * rt refcount btree as follows: + * + * While there are still unprocessed rmaps in the array, + * - Set sp to the physical block (pblk) of the next unprocessed rmap. + * - Add to the bag all rmaps in the array where startblock == sp. + * - Set np to the physical block where the bag size will change. This + * is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * - Record the bag size as old_bag_size. + * + * - While the bag isn't empty, + * - Remove from the bag all rmaps where startblock + len == np. + * - Add to the bag all rmaps in the array where startblock == np. + * - If the bag size isn't old_bag_size, store the refcount entry + * (sp, np - sp, bag_size) in the refcnt btree. + * - If the bag is empty, break out of the inner loop. + * - Set old_bag_size to the bag size + * - Set sp = np. + * - Set np to the physical block where the bag size will change. 
+ * This is the minimum of (the pblk of the next unprocessed rmap) + * and (startblock + len of each rmap in the bag). + * + * Like all the other repairers, we make a list of all the refcount + * records we need, then reinitialize the rt refcount btree root and + * insert all the records. + */ + +struct xrep_rtrefc { + /* refcount extents */ + struct xfarray *refcount_records; + + /* new refcountbt information */ + struct xrep_newbt new_btree; + + /* old refcountbt blocks */ + struct xfsb_bitmap old_rtrefcountbt_blocks; + + struct xfs_scrub *sc; + + /* get_records()'s position in the rt refcount record array. */ + xfarray_idx_t array_cur; + + /* # of refcountbt blocks */ + xfs_filblks_t btblocks; +}; + +/* Set us up to repair refcount btrees. */ +int +xrep_setup_rtrefcountbt( + struct xfs_scrub *sc) +{ + char *descr; + int error; + + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + return error; +} + +/* Check for any obvious conflicts with this shared/CoW staging extent. */ +STATIC int +xrep_rtrefc_check_ext( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *rec) +{ + xfs_rgblock_t last; + + if (xfs_rtrefcount_check_irec(sc->sr.rtg, rec) != NULL) + return -EFSCORRUPTED; + + if (xfs_rgbno_to_rtxoff(sc->mp, rec->rc_startblock) != 0) + return -EFSCORRUPTED; + + last = rec->rc_startblock + rec->rc_blockcount - 1; + if (xfs_rgbno_to_rtxoff(sc->mp, last) != sc->mp->m_sb.sb_rextsize - 1) + return -EFSCORRUPTED; + + /* Make sure this isn't free space or misaligned. */ + return xrep_require_rtext_inuse(sc, rec->rc_startblock, + rec->rc_blockcount); +} + +/* Record a reference count extent. */ +STATIC int +xrep_rtrefc_stash( + struct xrep_rtrefc *rr, + enum xfs_refc_domain domain, + xfs_rgblock_t bno, + xfs_extlen_t len, + uint64_t refcount) +{ + struct xfs_refcount_irec irec = { + .rc_startblock = bno, + .rc_blockcount = len, + .rc_refcount = refcount, + .rc_domain = domain, + }; + int error = 0; + + if (xchk_should_terminate(rr->sc, &error)) + return error; + + irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount); + + error = xrep_rtrefc_check_ext(rr->sc, &irec); + if (error) + return error; + + trace_xrep_refc_found(rtg_group(rr->sc->sr.rtg), &irec); + + return xfarray_append(rr->refcount_records, &irec); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_rtrefc_stash_cow( + struct xrep_rtrefc *rr, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + return xrep_rtrefc_stash(rr, XFS_REFC_DOMAIN_COW, bno, len, 1); +} + +/* Decide if an rmap could describe a shared extent. */ +static inline bool +xrep_rtrefc_rmap_shareable( + const struct xfs_rmap_irec *rmap) +{ + /* rt metadata are never sharable */ + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + + /* Unwritten file blocks are not shareable. */ + if (rmap->rm_flags & XFS_RMAP_UNWRITTEN) + return false; + + return true; +} + +/* Grab the next (abbreviated) rmap record from the rmapbt. */ +STATIC int +xrep_rtrefc_walk_rmaps( + struct xrep_rtrefc *rr, + struct xfs_rmap_irec *rmap, + bool *have_rec) +{ + struct xfs_btree_cur *cur = rr->sc->sr.rmap_cur; + struct xfs_mount *mp = cur->bc_mp; + int have_gt; + int error = 0; + + *have_rec = false; + + /* + * Loop through the remaining rmaps. Remember CoW staging + * extents and the refcountbt blocks from the old tree for later + * disposal. We can only share written data fork extents, so + * keep looping until we find an rmap for one. 
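+ *
+ * For example: an XFS_RMAP_OWN_COW rmap is stashed as a CoW staging
+ * record and skipped; an rmap with XFS_RMAP_UNWRITTEN set is skipped
+ * because unwritten extents cannot be shared; the first written data
+ * fork mapping terminates the loop and is handed back to the caller.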
+ */ + do { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + return error; + if (!have_gt) + return 0; + + error = xfs_rmap_get_rec(cur, rmap, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(mp, !have_gt)) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + + if (rmap->rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_rtrefc_stash_cow(rr, rmap->rm_startblock, + rmap->rm_blockcount); + if (error) + return error; + } else if (xfs_is_sb_inum(mp, rmap->rm_owner) || + (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | + XFS_RMAP_BMBT_BLOCK))) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + } while (!xrep_rtrefc_rmap_shareable(rmap)); + + *have_rec = true; + return 0; +} + +static inline uint32_t +xrep_rtrefc_encode_startblock( + const struct xfs_refcount_irec *irec) +{ + uint32_t start; + + start = irec->rc_startblock & ~XFS_REFC_COWFLAG; + if (irec->rc_domain == XFS_REFC_DOMAIN_COW) + start |= XFS_REFC_COWFLAG; + + return start; +} + +/* + * Compare two refcount records. We want to sort in order of increasing block + * number. + */ +static int +xrep_rtrefc_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_refcount_irec *ap = a; + const struct xfs_refcount_irec *bp = b; + uint32_t sa, sb; + + sa = xrep_rtrefc_encode_startblock(ap); + sb = xrep_rtrefc_encode_startblock(bp); + + if (sa > sb) + return 1; + if (sa < sb) + return -1; + return 0; +} + +/* + * Sort the refcount extents by startblock or else the btree records will be in + * the wrong order. Make sure the records do not overlap in physical space. + */ +STATIC int +xrep_rtrefc_sort_records( + struct xrep_rtrefc *rr) +{ + struct xfs_refcount_irec irec; + xfarray_idx_t cur; + enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED; + xfs_rgblock_t next_rgbno = 0; + int error; + + error = xfarray_sort(rr->refcount_records, xrep_rtrefc_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rr->refcount_records, cur) { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfarray_load(rr->refcount_records, cur, &irec); + if (error) + return error; + + if (dom == XFS_REFC_DOMAIN_SHARED && + irec.rc_domain == XFS_REFC_DOMAIN_COW) { + dom = irec.rc_domain; + next_rgbno = 0; + } + + if (dom != irec.rc_domain) + return -EFSCORRUPTED; + if (irec.rc_startblock < next_rgbno) + return -EFSCORRUPTED; + + next_rgbno = irec.rc_startblock + irec.rc_blockcount; + } + + return error; +} + +/* Record extents that belong to the realtime refcount inode. */ +STATIC int +xrep_rtrefc_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rtrefc *rr = priv; + int error = 0; + + if (xchk_should_terminate(rr->sc, &error)) + return error; + + /* Skip extents which are not owned by this inode and fork. */ + if (rec->rm_owner != rr->sc->ip->i_ino) + return 0; + + error = xrep_check_ino_btree_mapping(rr->sc, rec); + if (error) + return error; + + return xfsb_bitmap_set(&rr->old_rtrefcountbt_blocks, + xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock), + rec->rm_blockcount); +} + +/* + * Walk forward through the rmap btree to collect all rmaps starting at + * @bno in @rmap_bag. These represent the file(s) that share ownership of + * the current block. Upon return, the rmap cursor points to the last record + * satisfying the startblock constraint. 
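+ *
+ * For example (hypothetical numbers): if three rmaps start at rgbno
+ * 12, three rcbag_add calls push them all, the walk stops on the
+ * first rmap starting beyond 12, and the decrement below steps the
+ * cursor back onto the last of the rgbno-12 records.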
+ */ +static int +xrep_rtrefc_push_rmaps_at( + struct xrep_rtrefc *rr, + struct rcbag *rcstack, + xfs_rgblock_t bno, + struct xfs_rmap_irec *rmap, + bool *have) +{ + struct xfs_scrub *sc = rr->sc; + int have_gt; + int error; + + while (*have && rmap->rm_startblock == bno) { + error = rcbag_add(rcstack, rr->sc->tp, rmap); + if (error) + return error; + + error = xrep_rtrefc_walk_rmaps(rr, rmap, have); + if (error) + return error; + } + + error = xfs_btree_decrement(sc->sr.rmap_cur, 0, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) { + xfs_btree_mark_sick(sc->sr.rmap_cur); + return -EFSCORRUPTED; + } + + return 0; +} + +/* Scan one AG for reverse mappings for the realtime refcount btree. */ +STATIC int +xrep_rtrefc_scan_ag( + struct xrep_rtrefc *rr, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rr->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrefc_walk_rmap, rr); + xchk_ag_free(sc, &sc->sa); + return error; +} + +/* Iterate all the rmap records to generate reference count data. */ +STATIC int +xrep_rtrefc_find_refcounts( + struct xrep_rtrefc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct rcbag *rcstack; + struct xfs_perag *pag = NULL; + uint64_t old_stack_height; + xfs_rgblock_t sbno; + xfs_rgblock_t cbno; + xfs_rgblock_t nbno; + bool have; + int error; + + /* Scan for old rtrefc btree blocks. */ + while ((pag = xfs_perag_next(sc->mp, pag))) { + error = xrep_rtrefc_scan_ag(rr, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + xrep_rtgroup_btcur_init(sc, &sc->sr); + + /* + * Set up a bag to store all the rmap records that we're tracking to + * generate a reference count record. If this exceeds + * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount. + */ + error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); + if (error) + goto out_cur; + + /* Start the rtrmapbt cursor to the left of all records. */ + error = xfs_btree_goto_left_edge(sc->sr.rmap_cur); + if (error) + goto out_bag; + + /* Process reverse mappings into refcount data. */ + while (xfs_btree_has_more_records(sc->sr.rmap_cur)) { + struct xfs_rmap_irec rmap; + + /* Push all rmaps with pblk == sbno onto the stack */ + error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have); + if (error) + goto out_bag; + if (!have) + break; + sbno = cbno = rmap.rm_startblock; + error = xrep_rtrefc_push_rmaps_at(rr, rcstack, sbno, &rmap, + &have); + if (error) + goto out_bag; + + /* Set nbno to the bno of the next refcount change */ + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + old_stack_height = rcbag_count(rcstack); + + /* While stack isn't empty... 
*/ + while (rcbag_count(rcstack) > 0) { + /* Pop all rmaps that end at nbno */ + error = rcbag_remove_ending_at(rcstack, sc->tp, nbno); + if (error) + goto out_bag; + + /* Push array items that start at nbno */ + error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have); + if (error) + goto out_bag; + if (have) { + error = xrep_rtrefc_push_rmaps_at(rr, rcstack, + nbno, &rmap, &have); + if (error) + goto out_bag; + } + + /* Emit refcount if necessary */ + ASSERT(nbno > cbno); + if (rcbag_count(rcstack) != old_stack_height) { + if (old_stack_height > 1) { + error = xrep_rtrefc_stash(rr, + XFS_REFC_DOMAIN_SHARED, + cbno, nbno - cbno, + old_stack_height); + if (error) + goto out_bag; + } + cbno = nbno; + } + + /* Stack empty, go find the next rmap */ + if (rcbag_count(rcstack) == 0) + break; + old_stack_height = rcbag_count(rcstack); + sbno = nbno; + + /* Set nbno to the bno of the next refcount change */ + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, + &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + } + } + + ASSERT(rcbag_count(rcstack) == 0); +out_bag: + rcbag_free(&rcstack); +out_cur: + xchk_rtgroup_btcur_free(&sc->sr); + return error; +} + +/* Retrieve refcountbt data for bulk load. */ +STATIC int +xrep_rtrefc_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xrep_rtrefc *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(rr->refcount_records, rr->array_cur++, + &cur->bc_rec.rc); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_rtrefc_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_rtrefc *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_rtrefc_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + return xfs_rtrefcount_broot_space_calc(cur->bc_mp, level, + nr_this_level); +} + +/* + * Use the collected refcount information to stage a new rt refcount btree. If + * this is successful we'll return with the new btree root information logged + * to the repair transaction but not yet committed. + */ +STATIC int +xrep_rtrefc_build_new_tree( + struct xrep_rtrefc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_btree_cur *refc_cur; + int error; + + error = xrep_rtrefc_sort_records(rr); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the realtime refcount inode. + */ + error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc); + if (error) + return error; + + rr->new_btree.bload.get_records = xrep_rtrefc_get_records; + rr->new_btree.bload.claim_block = xrep_rtrefc_claim_block; + rr->new_btree.bload.iroot_size = xrep_rtrefc_iroot_size; + + refc_cur = xfs_rtrefcountbt_init_cursor(NULL, rtg); + xfs_btree_stage_ifakeroot(refc_cur, &rr->new_btree.ifake); + + /* Compute how many blocks we'll need. 
*/ + error = xfs_btree_bload_compute_geometry(refc_cur, &rr->new_btree.bload, + xfarray_length(rr->refcount_records)); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* + * Guess how many blocks we're going to need to rebuild an entire + * rtrefcountbt from the number of extents we found, and pump up our + * transaction to have sufficient block reservation. We're allowed + * to exceed quota to repair inconsistent metadata, though this is + * unlikely. + */ + error = xfs_trans_reserve_more_inode(sc->tp, rtg_refcount(rtg), + rr->new_btree.bload.nr_blocks, 0, true); + if (error) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* Add all observed refcount records. */ + rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE; + rr->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr); + if (error) + goto err_cur; + + /* + * Install the new rtrefc btree in the inode. After this point the old + * btree is no longer accessible, the new tree is live, and we can + * delete the cursor. + */ + xfs_rtrefcountbt_commit_staged_btree(refc_cur, sc->tp); + xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks); + xfs_btree_del_cursor(refc_cur, 0); + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_trans(sc); +err_cur: + xfs_btree_del_cursor(refc_cur, error); + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* Rebuild the rt refcount btree. */ +int +xrep_rtrefcountbt( + struct xfs_scrub *sc) +{ + struct xrep_rtrefc *rr; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rtrmapbt(mp)) + return -EOPNOTSUPP; + + /* Make sure any problems with the fork are fixed. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + rr = kzalloc(sizeof(struct xrep_rtrefc), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + rr->sc = sc; + + /* Set up enough storage to handle one refcount record per rt extent. */ + descr = xchk_xfile_ag_descr(sc, "reference count records"); + error = xfarray_create(descr, mp->m_sb.sb_rextents, + sizeof(struct xfs_refcount_irec), + &rr->refcount_records); + kfree(descr); + if (error) + goto out_rr; + + /* Collect all reference counts. */ + xfsb_bitmap_init(&rr->old_rtrefcountbt_blocks); + error = xrep_rtrefc_find_refcounts(rr); + if (error) + goto out_bitmap; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the refcount information. */ + error = xrep_rtrefc_build_new_tree(rr); + if (error) + goto out_bitmap; + + /* + * Free all the extents that were allocated to the former rtrefcountbt + * and aren't cross-linked with something else. + */ + error = xrep_reap_metadir_fsblocks(rr->sc, + &rr->old_rtrefcountbt_blocks); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&rr->old_rtrefcountbt_blocks); + xfarray_destroy(rr->refcount_records); +out_rr: + kfree(rr); + return error; +} diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c new file mode 100644 index 000000000000..12989fe80e8b --- /dev/null +++ b/fs/xfs/scrub/rtrmap.c @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. 
+ * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_inode.h" +#include "xfs_rtalloc.h" +#include "xfs_rtgroup.h" +#include "xfs_metafile.h" +#include "xfs_refcount.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* Set us up with the realtime metadata locked. */ +int +xchk_setup_rtrmapbt( + struct xfs_scrub *sc) +{ + int error; + + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + if (xchk_could_repair(sc)) { + error = xrep_setup_rtrmapbt(sc); + if (error) + return error; + } + + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); + if (error) + return error; + + error = xchk_setup_rt(sc); + if (error) + return error; + + error = xchk_install_live_inode(sc, rtg_rmap(sc->sr.rtg)); + if (error) + return error; + + return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); +} + +/* Realtime reverse mapping. */ + +struct xchk_rtrmap { + /* + * The furthest-reaching of the rmapbt records that we've already + * processed. This enables us to detect overlapping records for space + * allocations that cannot be shared. + */ + struct xfs_rmap_irec overlap_rec; + + /* + * The previous rmapbt record, so that we can check for two records + * that could be one. + */ + struct xfs_rmap_irec prev_rec; +}; + +static inline bool +xchk_rtrmapbt_is_shareable( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *irec) +{ + if (!xfs_has_rtreflink(sc->mp)) + return false; + if (irec->rm_flags & XFS_RMAP_UNWRITTEN) + return false; + return true; +} + +/* Flag failures for records that overlap but cannot. */ +STATIC void +xchk_rtrmapbt_check_overlapping( + struct xchk_btree *bs, + struct xchk_rtrmap *cr, + const struct xfs_rmap_irec *irec) +{ + xfs_rtblock_t pnext, inext; + + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + /* No previous record? */ + if (cr->overlap_rec.rm_blockcount == 0) + goto set_prev; + + /* Do overlap_rec and irec overlap? */ + pnext = cr->overlap_rec.rm_startblock + cr->overlap_rec.rm_blockcount; + if (pnext <= irec->rm_startblock) + goto set_prev; + + /* Overlap is only allowed if both records are data fork mappings. */ + if (!xchk_rtrmapbt_is_shareable(bs->sc, &cr->overlap_rec) || + !xchk_rtrmapbt_is_shareable(bs->sc, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + /* Save whichever rmap record extends furthest. */ + inext = irec->rm_startblock + irec->rm_blockcount; + if (pnext > inext) + return; + +set_prev: + memcpy(&cr->overlap_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Decide if two reverse-mapping records can be merged. */ +static inline bool +xchk_rtrmap_mergeable( + struct xchk_rtrmap *cr, + const struct xfs_rmap_irec *r2) +{ + const struct xfs_rmap_irec *r1 = &cr->prev_rec; + + /* Ignore if prev_rec is not yet initialized. 
*/ + if (cr->prev_rec.rm_blockcount == 0) + return false; + + if (r1->rm_owner != r2->rm_owner) + return false; + if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock) + return false; + if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount > + XFS_RMAP_LEN_MAX) + return false; + if (r1->rm_flags != r2->rm_flags) + return false; + return r1->rm_offset + r1->rm_blockcount == r2->rm_offset; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_rtrmapbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_rtrmap *cr, + const struct xfs_rmap_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_rtrmap_mergeable(cr, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Cross-reference a rmap against the refcount btree. */ +STATIC void +xchk_rtrmapbt_xref_rtrefc( + struct xfs_scrub *sc, + struct xfs_rmap_irec *irec) +{ + xfs_rgblock_t fbno; + xfs_extlen_t flen; + bool is_inode; + bool is_bmbt; + bool is_attr; + bool is_unwritten; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); + is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK; + is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK; + is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN; + + /* If this is shared, must be a data fork extent. */ + error = xfs_refcount_find_shared(sc->sr.refc_cur, irec->rm_startblock, + irec->rm_blockcount, &fbno, &flen, false); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (flen != 0 && (!is_inode || is_attr || is_bmbt || is_unwritten)) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} + +/* Cross-reference with other metadata. */ +STATIC void +xchk_rtrmapbt_xref( + struct xfs_scrub *sc, + struct xfs_rmap_irec *irec) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_xref_is_used_rt_space(sc, + xfs_rgbno_to_rtb(sc->sr.rtg, irec->rm_startblock), + irec->rm_blockcount); + if (irec->rm_owner == XFS_RMAP_OWN_COW) + xchk_xref_is_cow_staging(sc, irec->rm_startblock, + irec->rm_blockcount); + else + xchk_rtrmapbt_xref_rtrefc(sc, irec); +} + +/* Scrub a realtime rmapbt record. */ +STATIC int +xchk_rtrmapbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xchk_rtrmap *cr = bs->private; + struct xfs_rmap_irec irec; + + if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || + xfs_rtrmap_check_irec(to_rtg(bs->cur->bc_group), &irec) != NULL) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + xchk_rtrmapbt_check_mergeable(bs, cr, &irec); + xchk_rtrmapbt_check_overlapping(bs, cr, &irec); + xchk_rtrmapbt_xref(bs->sc, &irec); + return 0; +} + +/* Scrub the realtime rmap btree. 
*/ +int +xchk_rtrmapbt( + struct xfs_scrub *sc) +{ + struct xfs_inode *ip = rtg_rmap(sc->sr.rtg); + struct xfs_owner_info oinfo; + struct xchk_rtrmap cr = { }; + int error; + + error = xchk_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, XFS_DATA_FORK); + return xchk_btree(sc, sc->sr.rmap_cur, xchk_rtrmapbt_rec, &oinfo, &cr); +} + +/* xref check that the extent has no realtime reverse mapping at all */ +void +xchk_xref_has_no_rt_owner( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); +} + +/* xref check that the extent is completely mapped */ +void +xchk_xref_has_rt_owner( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + return; + if (outcome != XBTREE_RECPACKING_FULL) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); +} + +/* xref check that the extent is only owned by a given owner */ +void +xchk_xref_is_only_rt_owned_by( + struct xfs_scrub *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo) +{ + struct xfs_rmap_matches res; + int error; + + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_count_owners(sc->sr.rmap_cur, bno, len, oinfo, &res); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + return; + if (res.matches != 1) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + if (res.bad_non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + if (res.non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); +} diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c new file mode 100644 index 000000000000..fc2592c53af5 --- /dev/null +++ b/fs/xfs/scrub/rtrmap_repair.c @@ -0,0 +1,987 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_quota.h" +#include "xfs_rtalloc.h" +#include "xfs_ag.h" +#include "xfs_rtgroup.h" +#include "xfs_refcount.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/rgb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Realtime Reverse Mapping Btree Repair + * ===================================== + * + * This isn't quite as difficult as repairing the rmap btree on the data + * device, since we only store the data fork extents of realtime files on the + * realtime device. We still have to freeze the filesystem and stop the + * background threads like we do for the rmap repair, but we only have to scan + * realtime inodes. + * + * Collecting entries for the new realtime rmap btree is easy -- all we have + * to do is generate rtrmap entries from the data fork mappings of all realtime + * files in the filesystem. We then scan the rmap btrees of the data device + * looking for extents belonging to the old btree and note them in a bitmap. + * + * To rebuild the realtime rmap btree, we bulk-load the collected mappings into + * a new btree cursor and atomically swap that into the realtime inode. Then + * we can free the blocks from the old btree. + * + * We use the 'xrep_rtrmap' prefix for all the rmap functions. + */ + +/* Context for collecting rmaps */ +struct xrep_rtrmap { + /* new rtrmapbt information */ + struct xrep_newbt new_btree; + + /* lock for the xfbtree and xfile */ + struct mutex lock; + + /* rmap records generated from primary metadata */ + struct xfbtree rtrmap_btree; + + struct xfs_scrub *sc; + + /* bitmap of old rtrmapbt blocks */ + struct xfsb_bitmap old_rtrmapbt_blocks; + + /* Hooks into rtrmap update code. */ + struct xfs_rmap_hook rhook; + + /* inode scan cursor */ + struct xchk_iscan iscan; + + /* in-memory btree cursor for the ->get_blocks walk */ + struct xfs_btree_cur *mcur; + + /* Number of records we're staging in the new btree. */ + uint64_t nr_records; +}; + +/* Set us up to repair rt reverse mapping btrees. */ +int +xrep_setup_rtrmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rtrmap *rr; + char *descr; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP); + + descr = xchk_xfile_rtgroup_descr(sc, "reverse mapping records"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + if (error) + return error; + + rr = kzalloc(sizeof(struct xrep_rtrmap), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + + rr->sc = sc; + sc->buf = rr; + return 0; +} + +/* Make sure there's nothing funny about this mapping. 
*/ +STATIC int +xrep_rtrmap_check_mapping( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *rec) +{ + if (xfs_rtrmap_check_irec(sc->sr.rtg, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + return xrep_require_rtext_inuse(sc, rec->rm_startblock, + rec->rm_blockcount); +} + +/* Store a reverse-mapping record. */ +static inline int +xrep_rtrmap_stash( + struct xrep_rtrmap *rr, + xfs_rgblock_t startblock, + xfs_extlen_t blockcount, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + struct xfs_rmap_irec rmap = { + .rm_startblock = startblock, + .rm_blockcount = blockcount, + .rm_owner = owner, + .rm_offset = offset, + .rm_flags = flags, + }; + struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *mcur; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + if (xchk_iscan_aborted(&rr->iscan)) + return -EFSCORRUPTED; + + trace_xrep_rtrmap_found(sc->mp, &rmap); + + /* Add entry to in-memory btree. */ + mutex_lock(&rr->lock); + mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, sc->tp, &rr->rtrmap_btree); + error = xfs_rmap_map_raw(mcur, &rmap); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_cancel; + + error = xfbtree_trans_commit(&rr->rtrmap_btree, sc->tp); + if (error) + goto out_abort; + + mutex_unlock(&rr->lock); + return 0; + +out_cancel: + xfbtree_trans_cancel(&rr->rtrmap_btree, sc->tp); +out_abort: + xchk_iscan_abort(&rr->iscan); + mutex_unlock(&rr->lock); + return error; +} + +/* Finding all file and bmbt extents. */ + +/* Context for accumulating rmaps for an inode fork. */ +struct xrep_rtrmap_ifork { + /* + * Accumulate rmap data here to turn multiple adjacent bmaps into a + * single rmap. + */ + struct xfs_rmap_irec accum; + + struct xrep_rtrmap *rr; +}; + +/* Stash an rmap that we accumulated while walking an inode fork. */ +STATIC int +xrep_rtrmap_stash_accumulated( + struct xrep_rtrmap_ifork *rf) +{ + if (rf->accum.rm_blockcount == 0) + return 0; + + return xrep_rtrmap_stash(rf->rr, rf->accum.rm_startblock, + rf->accum.rm_blockcount, rf->accum.rm_owner, + rf->accum.rm_offset, rf->accum.rm_flags); +} + +/* Accumulate a bmbt record. */ +STATIC int +xrep_rtrmap_visit_bmbt( + struct xfs_btree_cur *cur, + struct xfs_bmbt_irec *rec, + void *priv) +{ + struct xrep_rtrmap_ifork *rf = priv; + struct xfs_rmap_irec *accum = &rf->accum; + struct xfs_mount *mp = rf->rr->sc->mp; + xfs_rgblock_t rgbno; + unsigned int rmap_flags = 0; + int error; + + if (xfs_rtb_to_rgno(mp, rec->br_startblock) != + rtg_rgno(rf->rr->sc->sr.rtg)) + return 0; + + if (rec->br_state == XFS_EXT_UNWRITTEN) + rmap_flags |= XFS_RMAP_UNWRITTEN; + + /* If this bmap is adjacent to the previous one, just add it. */ + rgbno = xfs_rtb_to_rgbno(mp, rec->br_startblock); + if (accum->rm_blockcount > 0 && + rec->br_startoff == accum->rm_offset + accum->rm_blockcount && + rgbno == accum->rm_startblock + accum->rm_blockcount && + rmap_flags == accum->rm_flags) { + accum->rm_blockcount += rec->br_blockcount; + return 0; + } + + /* Otherwise stash the old rmap and start accumulating a new one. */ + error = xrep_rtrmap_stash_accumulated(rf); + if (error) + return error; + + accum->rm_startblock = rgbno; + accum->rm_blockcount = rec->br_blockcount; + accum->rm_offset = rec->br_startoff; + accum->rm_flags = rmap_flags; + return 0; +} + +/* + * Iterate the block mapping btree to collect rmap records for anything in this + * fork that maps to the rt volume. Sets @mappings_done to true if we've + * scanned the block mappings in this fork. 
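+ *
+ * For example: a btree-format data fork whose extent cache has not
+ * been loaded is walked on disk via xfs_bmap_query_all and sets
+ * @mappings_done; if the cache is already loaded, we return without
+ * doing anything so that the caller can use the faster incore scan.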
+ */ +STATIC int +xrep_rtrmap_scan_bmbt( + struct xrep_rtrmap_ifork *rf, + struct xfs_inode *ip, + bool *mappings_done) +{ + struct xrep_rtrmap *rr = rf->rr; + struct xfs_btree_cur *cur; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + int error = 0; + + *mappings_done = false; + + /* + * If the incore extent cache is already loaded, we'll just use the + * incore extent scanner to record mappings. Don't bother walking the + * ondisk extent tree. + */ + if (!xfs_need_iread_extents(ifp)) + return 0; + + /* Accumulate all the mappings in the bmap btree. */ + cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, XFS_DATA_FORK); + error = xfs_bmap_query_all(cur, xrep_rtrmap_visit_bmbt, rf); + xfs_btree_del_cursor(cur, error); + if (error) + return error; + + /* Stash any remaining accumulated rmaps and exit. */ + *mappings_done = true; + return xrep_rtrmap_stash_accumulated(rf); +} + +/* + * Iterate the in-core extent cache to collect rmap records for anything in + * this fork that matches the AG. + */ +STATIC int +xrep_rtrmap_scan_iext( + struct xrep_rtrmap_ifork *rf, + struct xfs_ifork *ifp) +{ + struct xfs_bmbt_irec rec; + struct xfs_iext_cursor icur; + int error; + + for_each_xfs_iext(ifp, &icur, &rec) { + if (isnullstartblock(rec.br_startblock)) + continue; + error = xrep_rtrmap_visit_bmbt(NULL, &rec, rf); + if (error) + return error; + } + + return xrep_rtrmap_stash_accumulated(rf); +} + +/* Find all the extents on the realtime device mapped by an inode fork. */ +STATIC int +xrep_rtrmap_scan_dfork( + struct xrep_rtrmap *rr, + struct xfs_inode *ip) +{ + struct xrep_rtrmap_ifork rf = { + .accum = { .rm_owner = ip->i_ino, }, + .rr = rr, + }; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + int error = 0; + + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + bool mappings_done; + + /* + * Scan the bmbt for mappings. If the incore extent tree is + * loaded, we want to scan the cached mappings since that's + * faster when the extent counts are very high. + */ + error = xrep_rtrmap_scan_bmbt(&rf, ip, &mappings_done); + if (error || mappings_done) + return error; + } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { + /* realtime data forks should only be extents or btree */ + return -EFSCORRUPTED; + } + + /* Scan incore extent cache. */ + return xrep_rtrmap_scan_iext(&rf, ifp); +} + +/* Record reverse mappings for a file. */ +STATIC int +xrep_rtrmap_scan_inode( + struct xrep_rtrmap *rr, + struct xfs_inode *ip) +{ + unsigned int lock_mode; + int error = 0; + + /* Skip the rt rmap btree inode. */ + if (rr->sc->ip == ip) + return 0; + + lock_mode = xfs_ilock_data_map_shared(ip); + + /* Check the data fork if it's on the realtime device. */ + if (XFS_IS_REALTIME_INODE(ip)) { + error = xrep_rtrmap_scan_dfork(rr, ip); + if (error) + goto out_unlock; + } + + xchk_iscan_mark_visited(&rr->iscan, ip); +out_unlock: + xfs_iunlock(ip, lock_mode); + return error; +} + +/* Record extents that belong to the realtime rmap inode. */ +STATIC int +xrep_rtrmap_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rtrmap *rr = priv; + int error = 0; + + if (xchk_should_terminate(rr->sc, &error)) + return error; + + /* Skip extents which are not owned by this inode and fork. 
*/ + if (rec->rm_owner != rr->sc->ip->i_ino) + return 0; + + error = xrep_check_ino_btree_mapping(rr->sc, rec); + if (error) + return error; + + return xfsb_bitmap_set(&rr->old_rtrmapbt_blocks, + xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock), + rec->rm_blockcount); +} + +/* Scan one AG for reverse mappings for the realtime rmap btree. */ +STATIC int +xrep_rtrmap_scan_ag( + struct xrep_rtrmap *rr, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rr->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrmap_walk_rmap, rr); + xchk_ag_free(sc, &sc->sa); + return error; +} + +struct xrep_rtrmap_stash_run { + struct xrep_rtrmap *rr; + uint64_t owner; +}; + +static int +xrep_rtrmap_stash_run( + uint32_t start, + uint32_t len, + void *priv) +{ + struct xrep_rtrmap_stash_run *rsr = priv; + struct xrep_rtrmap *rr = rsr->rr; + xfs_rgblock_t rgbno = start; + + return xrep_rtrmap_stash(rr, rgbno, len, rsr->owner, 0, 0); +} + +/* + * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure + * that the ranges are in units of FS blocks. + */ +STATIC int +xrep_rtrmap_stash_bitmap( + struct xrep_rtrmap *rr, + struct xrgb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xrep_rtrmap_stash_run rsr = { + .rr = rr, + .owner = oinfo->oi_owner, + }; + + return xrgb_bitmap_walk(bitmap, xrep_rtrmap_stash_run, &rsr); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_rtrmap_walk_cowblocks( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec, + void *priv) +{ + struct xrgb_bitmap *bitmap = priv; + + if (!xfs_refcount_check_domain(irec) || + irec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + return xrgb_bitmap_set(bitmap, irec->rc_startblock, + irec->rc_blockcount); +} + +/* + * Collect rmaps for the blocks containing the refcount btree, and all CoW + * staging extents. + */ +STATIC int +xrep_rtrmap_find_refcount_rmaps( + struct xrep_rtrmap *rr) +{ + struct xrgb_bitmap cow_blocks; /* COWBIT */ + struct xfs_refcount_irec low = { + .rc_startblock = 0, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_refcount_irec high = { + .rc_startblock = -1U, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_scrub *sc = rr->sc; + int error; + + if (!xfs_has_rtreflink(sc->mp)) + return 0; + + xrgb_bitmap_init(&cow_blocks); + + /* Collect rmaps for CoW staging extents. */ + error = xfs_refcount_query_range(sc->sr.refc_cur, &low, &high, + xrep_rtrmap_walk_cowblocks, &cow_blocks); + if (error) + goto out_bitmap; + + /* Generate rmaps for everything. */ + error = xrep_rtrmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW); + if (error) + goto out_bitmap; + +out_bitmap: + xrgb_bitmap_destroy(&cow_blocks); + return error; +} + +/* Count and check all collected records. */ +STATIC int +xrep_rtrmap_check_record( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rtrmap *rr = priv; + int error; + + error = xrep_rtrmap_check_mapping(rr->sc, rec); + if (error) + return error; + + rr->nr_records++; + return 0; +} + +/* Generate all the reverse-mappings for the realtime device. 
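+ *
+ * This proceeds in phases (summarizing the steps below): stash an
+ * rmap for the rt superblock if this is rtgroup 0; stash CoW staging
+ * extents found in the rt refcount btree; scan every realtime file's
+ * data fork; note the old rtrmapbt's blocks for later reaping; and
+ * finally count and re-check every record stashed in the in-memory
+ * btree.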
*/ +STATIC int +xrep_rtrmap_find_rmaps( + struct xrep_rtrmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = NULL; + struct xfs_inode *ip; + struct xfs_btree_cur *mcur; + int error; + + /* Generate rmaps for the realtime superblock */ + if (xfs_has_rtsb(sc->mp) && rtg_rgno(rr->sc->sr.rtg) == 0) { + error = xrep_rtrmap_stash(rr, 0, sc->mp->m_sb.sb_rextsize, + XFS_RMAP_OWN_FS, 0, 0); + if (error) + return error; + } + + /* Find CoW staging extents. */ + xrep_rtgroup_btcur_init(sc, &sc->sr); + error = xrep_rtrmap_find_refcount_rmaps(rr); + xchk_rtgroup_btcur_free(&sc->sr); + if (error) + return error; + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Unlock the realtime metadata inodes and cancel the transaction to + * release the log grant space while we scan the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. + * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done even though + * we take the IOLOCK to quiesce the file because empty transactions + * do not take sb_internal. + */ + xchk_trans_cancel(sc); + xchk_rtgroup_unlock(&sc->sr); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) { + error = xrep_rtrmap_scan_inode(rr, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&rr->iscan); + if (error) + return error; + + /* + * Switch out for a real transaction and lock the RT metadata in + * preparation for building a new tree. + */ + xchk_trans_cancel(sc); + error = xchk_setup_rt(sc); + if (error) + return error; + error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); + if (error) + return error; + + /* + * If a hook failed to update the in-memory btree, we lack the data to + * continue the repair. + */ + if (xchk_iscan_aborted(&rr->iscan)) + return -EFSCORRUPTED; + + /* Scan for old rtrmap blocks. */ + while ((pag = xfs_perag_next(sc->mp, pag))) { + error = xrep_rtrmap_scan_ag(rr, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + /* + * Now that we have everything locked again, we need to count the + * number of rmap records stashed in the btree. This should reflect + * all actively-owned rt files in the filesystem. At the same time, + * check all our records before we start building a new btree, which + * requires the rtbitmap lock. + */ + mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, NULL, &rr->rtrmap_btree); + rr->nr_records = 0; + error = xfs_rmap_query_all(mcur, xrep_rtrmap_check_record, rr); + xfs_btree_del_cursor(mcur, error); + + return error; +} + +/* Building the new rtrmap btree. */ + +/* Retrieve rtrmapbt data for bulk load. 
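+ *
+ * For example, a bulk-load request for 16 records advances the
+ * in-memory btree cursor 16 times, copying each record into the new
+ * btree block via init_rec_from_cur; running out of records early is
+ * -EFSCORRUPTED, since nr_records was counted from this same tree.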
*/ +STATIC int +xrep_rtrmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xrep_rtrmap *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + int stat = 0; + + error = xfs_btree_increment(rr->mcur, 0, &stat); + if (error) + return error; + if (!stat) + return -EFSCORRUPTED; + + error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat); + if (error) + return error; + if (!stat) + return -EFSCORRUPTED; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_rtrmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_rtrmap *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_rtrmap_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + return xfs_rtrmap_broot_space_calc(cur->bc_mp, level, nr_this_level); +} + +/* + * Use the collected rmap information to stage a new rmap btree. If this is + * successful we'll return with the new btree root information logged to the + * repair transaction but not yet committed. This implements section (III) + * above. + */ +STATIC int +xrep_rtrmap_build_new_tree( + struct xrep_rtrmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_btree_cur *rmap_cur; + int error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the realtime rmapbt inode. + */ + error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc); + if (error) + return error; + + rr->new_btree.bload.get_records = xrep_rtrmap_get_records; + rr->new_btree.bload.claim_block = xrep_rtrmap_claim_block; + rr->new_btree.bload.iroot_size = xrep_rtrmap_iroot_size; + + rmap_cur = xfs_rtrmapbt_init_cursor(NULL, rtg); + xfs_btree_stage_ifakeroot(rmap_cur, &rr->new_btree.ifake); + + /* Compute how many blocks we'll need for the rmaps collected. */ + error = xfs_btree_bload_compute_geometry(rmap_cur, + &rr->new_btree.bload, rr->nr_records); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* + * Guess how many blocks we're going to need to rebuild an entire + * rtrmapbt from the number of extents we found, and pump up our + * transaction to have sufficient block reservation. We're allowed + * to exceed quota to repair inconsistent metadata, though this is + * unlikely. + */ + error = xfs_trans_reserve_more_inode(sc->tp, rtg_rmap(rtg), + rr->new_btree.bload.nr_blocks, 0, true); + if (error) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* + * Create a cursor to the in-memory btree so that we can bulk load the + * new btree. 
+ */ + rr->mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, NULL, &rr->rtrmap_btree); + error = xfs_btree_goto_left_edge(rr->mcur); + if (error) + goto err_mcur; + + /* Add all observed rmap records. */ + rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE; + error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr); + if (error) + goto err_mcur; + + /* + * Install the new rtrmap btree in the inode. After this point the old + * btree is no longer accessible, the new tree is live, and we can + * delete the cursor. + */ + xfs_rtrmapbt_commit_staged_btree(rmap_cur, sc->tp); + xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks); + xfs_btree_del_cursor(rmap_cur, 0); + xfs_btree_del_cursor(rr->mcur, 0); + rr->mcur = NULL; + + /* + * Now that we've written the new btree to disk, we don't need to keep + * updating the in-memory btree. Abort the scan to stop live updates. + */ + xchk_iscan_abort(&rr->iscan); + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_trans(sc); + +err_mcur: + xfs_btree_del_cursor(rr->mcur, error); +err_cur: + xfs_btree_del_cursor(rmap_cur, error); + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* Reaping the old btree. */ + +static inline bool +xrep_rtrmapbt_want_live_update( + struct xchk_iscan *iscan, + const struct xfs_owner_info *oi) +{ + if (xchk_iscan_aborted(iscan)) + return false; + + /* + * We scanned the CoW staging extents before we started the iscan, so + * we need all the updates. + */ + if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner)) + return true; + + /* Ignore updates to files that the scanner hasn't visited yet. */ + return xchk_iscan_want_live_update(iscan, oi->oi_owner); +} + +/* + * Apply a rtrmapbt update from the regular filesystem into our shadow btree. + * We're running from the thread that owns the rtrmap ILOCK and is generating + * the update, so we must be careful about which parts of the struct + * xrep_rtrmap that we change. + */ +static int +xrep_rtrmapbt_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_rmap_update_params *p = data; + struct xrep_rtrmap *rr; + struct xfs_mount *mp; + struct xfs_btree_cur *mcur; + struct xfs_trans *tp; + void *txcookie; + int error; + + rr = container_of(nb, struct xrep_rtrmap, rhook.rmap_hook.nb); + mp = rr->sc->mp; + + if (!xrep_rtrmapbt_want_live_update(&rr->iscan, &p->oinfo)) + goto out_unlock; + + trace_xrep_rmap_live_update(rtg_group(rr->sc->sr.rtg), action, p); + + error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); + if (error) + goto out_abort; + + mutex_lock(&rr->lock); + mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, tp, &rr->rtrmap_btree); + error = __xfs_rmap_finish_intent(mcur, action, p->startblock, + p->blockcount, &p->oinfo, p->unwritten); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_cancel; + + error = xfbtree_trans_commit(&rr->rtrmap_btree, tp); + if (error) + goto out_cancel; + + xrep_trans_cancel_hook_dummy(&txcookie, tp); + mutex_unlock(&rr->lock); + return NOTIFY_DONE; + +out_cancel: + xfbtree_trans_cancel(&rr->rtrmap_btree, tp); + xrep_trans_cancel_hook_dummy(&txcookie, tp); +out_abort: + xchk_iscan_abort(&rr->iscan); + mutex_unlock(&rr->lock); +out_unlock: + return NOTIFY_DONE; +} + +/* Set up the filesystem scan components. 
*/ +STATIC int +xrep_rtrmap_setup_scan( + struct xrep_rtrmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + int error; + + mutex_init(&rr->lock); + xfsb_bitmap_init(&rr->old_rtrmapbt_blocks); + + /* Set up some storage */ + error = xfs_rtrmapbt_mem_init(sc->mp, &rr->rtrmap_btree, sc->xmbtp, + rtg_rgno(sc->sr.rtg)); + if (error) + goto out_bitmap; + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &rr->iscan); + + /* + * Hook into live rtrmap operations so that we can update our in-memory + * btree to reflect live changes on the filesystem. Since we drop the + * rtrmap ILOCK to scan all the inodes, we need this piece to avoid + * installing a stale btree. + */ + ASSERT(sc->flags & XCHK_FSGATES_RMAP); + xfs_rmap_hook_setup(&rr->rhook, xrep_rtrmapbt_live_update); + error = xfs_rmap_hook_add(rtg_group(sc->sr.rtg), &rr->rhook); + if (error) + goto out_iscan; + return 0; + +out_iscan: + xchk_iscan_teardown(&rr->iscan); + xfbtree_destroy(&rr->rtrmap_btree); +out_bitmap: + xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks); + mutex_destroy(&rr->lock); + return error; +} + +/* Tear down scan components. */ +STATIC void +xrep_rtrmap_teardown( + struct xrep_rtrmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + + xchk_iscan_abort(&rr->iscan); + xfs_rmap_hook_del(rtg_group(sc->sr.rtg), &rr->rhook); + xchk_iscan_teardown(&rr->iscan); + xfbtree_destroy(&rr->rtrmap_btree); + xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks); + mutex_destroy(&rr->lock); +} + +/* Repair the realtime rmap btree. */ +int +xrep_rtrmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rtrmap *rr = sc->buf; + int error; + + /* Make sure any problems with the fork are fixed. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + error = xrep_rtrmap_setup_scan(rr); + if (error) + return error; + + /* Collect rmaps for realtime files. */ + error = xrep_rtrmap_find_rmaps(rr); + if (error) + goto out_records; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the rtrmap information. */ + error = xrep_rtrmap_build_new_tree(rr); + if (error) + goto out_records; + + /* + * Free all the extents that were allocated to the former rtrmapbt and + * aren't cross-linked with something else. + */ + error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); + if (error) + goto out_records; + +out_records: + xrep_rtrmap_teardown(rr); + return error; +} diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 7c7366c98338..4ac679c1bd29 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -18,6 +18,7 @@ #include "xfs_bmap.h" #include "xfs_sb.h" #include "xfs_exchmaps.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -46,12 +47,19 @@ xchk_setup_rtsummary( struct xchk_rtsummary *rts; int error; + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize), XCHK_GFP_FLAGS); if (!rts) return -ENOMEM; sc->buf = rts; + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); + if (error) + return error; + if (xchk_could_repair(sc)) { error = xrep_setup_rtsummary(sc, rts); if (error) @@ -73,7 +81,7 @@ xchk_setup_rtsummary( if (error) return error; - error = xchk_install_live_inode(sc, mp->m_rsumip); + error = xchk_install_live_inode(sc, rtg_summary(sc->sr.rtg)); if (error) return error; @@ -81,30 +89,27 @@ xchk_setup_rtsummary( if (error) return error; - /* - * Locking order requires us to take the rtbitmap first. 
We must be - * careful to unlock it ourselves when we are done with the rtbitmap - * file since the scrub infrastructure won't do that for us. Only - * then we can lock the rtsummary inode. - */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP); + if (error) + return error; /* * Now that we've locked the rtbitmap and rtsummary, we can't race with * growfsrt trying to expand the summary or change the size of the rt * volume. Hence it is safe to compute and check the geometry values. + * + * Note that there is no strict requirement for an exclusive lock on the + * summary here, but to keep the locking APIs simple we lock both inodes + * exclusively here. If we ever start caring about running concurrent + * fsmap with scrub this could be changed. */ if (mp->m_sb.sb_rblocks) { - int rextslog; - - rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); - rextslog = xfs_compute_rextslog(rts->rextents); - rts->rsumlevels = rextslog + 1; - rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents); - rts->rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels, - rts->rbmblocks); + rts->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks); + rts->rbmblocks = xfs_rtbitmap_blockcount(mp); + rts->rsumblocks = + xfs_rtsummary_blockcount(mp, &rts->rsumlevels); } + return 0; } @@ -148,6 +153,11 @@ xchk_rtsum_inc( struct xfs_mount *mp, union xfs_suminfo_raw *v) { + if (xfs_has_rtgroups(mp)) { + be32_add_cpu(&v->rtg, 1); + return be32_to_cpu(v->rtg); + } + v->old += 1; return v->old; } @@ -155,11 +165,12 @@ xchk_rtsum_inc( /* Update the summary file to reflect the free extent that we've accumulated. */ STATIC int xchk_rtsum_record_free( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) { + struct xfs_mount *mp = rtg_mount(rtg); struct xfs_scrub *sc = priv; xfs_fileoff_t rbmoff; xfs_rtblock_t rtbno; @@ -178,11 +189,11 @@ xchk_rtsum_record_free( lenlog = xfs_highbit64(rec->ar_extcount); offs = xfs_rtsumoffs(mp, lenlog, rbmoff); - rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); - rtlen = xfs_rtx_to_rtb(mp, rec->ar_extcount); + rtbno = xfs_rtx_to_rtb(rtg, rec->ar_startext); + rtlen = xfs_rtxlen_to_extlen(mp, rec->ar_extcount); if (!xfs_verify_rtbext(mp, rtbno, rtlen)) { - xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino); return -EFSCORRUPTED; } @@ -204,15 +215,14 @@ xchk_rtsum_compute( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; - unsigned long long rtbmp_blocks; + struct xfs_rtgroup *rtg = sc->sr.rtg; /* If the bitmap size doesn't match the computed size, bail. */ - rtbmp_blocks = xfs_rtbitmap_blockcount(mp, mp->m_sb.sb_rextents); - if (XFS_FSB_TO_B(mp, rtbmp_blocks) != mp->m_rbmip->i_disk_size) + if (XFS_FSB_TO_B(mp, xfs_rtbitmap_blockcount(mp)) != + rtg_bitmap(rtg)->i_disk_size) return -EFSCORRUPTED; - return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free, - sc); + return xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtsum_record_free, sc); } /* Compare the rtsummary file against the one we computed. */ @@ -231,8 +241,9 @@ xchk_rtsum_compare( xfs_rtsumoff_t sumoff = 0; int error = 0; - rts->args.mp = sc->mp; + rts->args.mp = mp; rts->args.tp = sc->tp; + rts->args.rtg = sc->sr.rtg; /* Mappings may not cross or lie beyond EOF. 
*/ endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); @@ -299,31 +310,34 @@ xchk_rtsummary( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_inode *rbmip = rtg_bitmap(rtg); + struct xfs_inode *rsumip = rtg_summary(rtg); struct xchk_rtsummary *rts = sc->buf; - int error = 0; + int error; /* Is sb_rextents correct? */ if (mp->m_sb.sb_rextents != rts->rextents) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); - goto out_rbm; + xchk_ino_set_corrupt(sc, rbmip->i_ino); + return 0; } /* Is m_rsumlevels correct? */ if (mp->m_rsumlevels != rts->rsumlevels) { - xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); - goto out_rbm; + xchk_ino_set_corrupt(sc, rsumip->i_ino); + return 0; } /* Is m_rsumsize correct? */ if (mp->m_rsumblocks != rts->rsumblocks) { - xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); - goto out_rbm; + xchk_ino_set_corrupt(sc, rsumip->i_ino); + return 0; } /* The summary file length must be aligned to an fsblock. */ - if (mp->m_rsumip->i_disk_size & mp->m_blockmask) { - xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); - goto out_rbm; + if (rsumip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, rsumip->i_ino); + return 0; } /* @@ -331,15 +345,15 @@ xchk_rtsummary( * growfsrt expands the summary file before updating sb_rextents, so * the file can be larger than rsumsize. */ - if (mp->m_rsumip->i_disk_size < XFS_FSB_TO_B(mp, rts->rsumblocks)) { - xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); - goto out_rbm; + if (rsumip->i_disk_size < XFS_FSB_TO_B(mp, rts->rsumblocks)) { + xchk_ino_set_corrupt(sc, rsumip->i_ino); + return 0; } /* Invoke the fork scrubber. */ error = xchk_metadata_inode_forks(sc); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - goto out_rbm; + return error; /* Construct the new summary file from the rtbitmap. */ error = xchk_rtsum_compute(sc); @@ -348,23 +362,12 @@ xchk_rtsummary( * EFSCORRUPTED means the rtbitmap is corrupt, which is an xref * error since we're checking the summary file. */ - xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); - error = 0; - goto out_rbm; + xchk_ino_xref_set_corrupt(sc, rbmip->i_ino); + return 0; } if (error) - goto out_rbm; + return error; /* Does the computed summary file match the actual rtsummary file? */ - error = xchk_rtsum_compare(sc); - -out_rbm: - /* - * Unlock the rtbitmap since we're done with it. All other writers of - * the rt free space metadata grab the bitmap and summary ILOCKs in - * that order, so we're still protected against allocation activities - * even if we continue on to the repair function. 
- */ - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - return error; + return xchk_rtsum_compare(sc); } diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c index 7deeb948cb70..d593977d70df 100644 --- a/fs/xfs/scrub/rtsummary_repair.c +++ b/fs/xfs/scrub/rtsummary_repair.c @@ -76,18 +76,30 @@ xrep_rtsummary_prep_buf( union xfs_suminfo_raw *ondisk; int error; - rts->args.mp = sc->mp; + rts->args.mp = mp; rts->args.tp = sc->tp; + rts->args.rtg = sc->sr.rtg; rts->args.sumbp = bp; ondisk = xfs_rsumblock_infoptr(&rts->args, 0); rts->args.sumbp = NULL; - bp->b_ops = &xfs_rtbuf_ops; - error = xfsum_copyout(sc, rts->prep_wordoff, ondisk, mp->m_blockwsize); if (error) return error; + if (xfs_has_rtgroups(sc->mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC); + hdr->rt_owner = cpu_to_be64(sc->ip->i_ino); + hdr->rt_blkno = cpu_to_be64(xfs_buf_daddr(bp)); + hdr->rt_lsn = 0; + uuid_copy(&hdr->rt_uuid, &sc->mp->m_sb.sb_meta_uuid); + bp->b_ops = &xfs_rtsummary_buf_ops; + } else { + bp->b_ops = &xfs_rtbuf_ops; + } + rts->prep_wordoff += mp->m_blockwsize; xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTSUMMARY_BUF); return 0; @@ -153,7 +165,8 @@ xrep_rtsummary( * Now exchange the contents. Nothing in repair uses the temporary * buffer, so we can reuse it for the tempfile exchrange information. */ - error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, &rts->tempexch); + error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, 0, + rts->rsumblocks, &rts->tempexch); if (error) return error; @@ -162,8 +175,8 @@ xrep_rtsummary( return error; /* Reset incore state and blow out the summary cache. */ - if (mp->m_rsum_cache) - memset(mp->m_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks); + if (sc->sr.rtg->rtg_rsum_cache) + memset(sc->sr.rtg->rtg_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks); mp->m_rsumlevels = rts->rsumlevels; mp->m_rsumblocks = rts->rsumblocks; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 4cbcf7a86dbe..76e24032e99a 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -149,6 +149,18 @@ xchk_probe( if (xchk_should_terminate(sc, &error)) return error; + /* + * If the caller is probing to see if repair works but repair isn't + * built into the kernel, return EOPNOTSUPP because that's the signal + * that userspace expects. If online repair is built in, set the + * CORRUPT flag (without any of the usual tracing/logging) to force us + * into xrep_probe. 
+ */ + if (xchk_could_repair(sc)) { + if (!IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)) + return -EOPNOTSUPP; + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + } return 0; } @@ -164,7 +176,7 @@ xchk_fsgates_disable( trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL); if (sc->flags & XCHK_FSGATES_DRAIN) - xfs_drain_wait_disable(); + xfs_defer_drain_wait_disable(); if (sc->flags & XCHK_FSGATES_QUOTA) xfs_dqtrx_hook_disable(); @@ -218,6 +230,8 @@ xchk_teardown( int error) { xchk_ag_free(sc, &sc->sa); + xchk_rtgroup_btcur_free(&sc->sr); + if (sc->tp) { if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) error = xfs_trans_commit(sc->tp); @@ -225,6 +239,8 @@ xchk_teardown( xfs_trans_cancel(sc->tp); sc->tp = NULL; } + if (sc->sr.rtg) + xchk_rtgroup_free(sc, &sc->sr); if (sc->ip) { if (sc->ilock_flags) xchk_iunlock(sc, sc->ilock_flags); @@ -382,13 +398,15 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .repair = xrep_parent, }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ - .type = ST_FS, + .type = ST_RTGROUP, + .has = xfs_has_nonzoned, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ - .type = ST_FS, + .type = ST_RTGROUP, + .has = xfs_has_nonzoned, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, .repair = xrep_rtsummary, @@ -442,6 +460,34 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .has = xfs_has_parent, .repair = xrep_dirtree, }, + [XFS_SCRUB_TYPE_METAPATH] = { /* metadata directory tree path */ + .type = ST_GENERIC, + .setup = xchk_setup_metapath, + .scrub = xchk_metapath, + .has = xfs_has_metadir, + .repair = xrep_metapath, + }, + [XFS_SCRUB_TYPE_RGSUPER] = { /* realtime group superblock */ + .type = ST_RTGROUP, + .setup = xchk_setup_rgsuperblock, + .scrub = xchk_rgsuperblock, + .has = xfs_has_rtsb, + .repair = xrep_rgsuperblock, + }, + [XFS_SCRUB_TYPE_RTRMAPBT] = { /* realtime group rmapbt */ + .type = ST_RTGROUP, + .setup = xchk_setup_rtrmapbt, + .scrub = xchk_rtrmapbt, + .has = xfs_has_rtrmapbt, + .repair = xrep_rtrmapbt, + }, + [XFS_SCRUB_TYPE_RTREFCBT] = { /* realtime refcountbt */ + .type = ST_RTGROUP, + .setup = xchk_setup_rtrefcountbt, + .scrub = xchk_rtrefcountbt, + .has = xfs_has_rtreflink, + .repair = xrep_rtrefcountbt, + }, }; static int @@ -489,6 +535,35 @@ xchk_validate_inputs( if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino)) goto out; break; + case ST_GENERIC: + break; + case ST_RTGROUP: + if (sm->sm_ino || sm->sm_gen) + goto out; + if (xfs_has_rtgroups(mp)) { + /* + * On a rtgroups filesystem, there won't be an rtbitmap + * or rtsummary file for group 0 unless there's + * actually a realtime volume attached. However, older + * xfs_scrub always calls the rtbitmap/rtsummary + * scrubbers with sm_agno==0 so transform the error + * code to ENOENT. + */ + if (sm->sm_agno >= mp->m_sb.sb_rgcount) { + if (sm->sm_agno == 0) + error = -ENOENT; + goto out; + } + } else { + /* + * Prior to rtgroups, the rtbitmap/rtsummary scrubbers + * accepted sm_agno==0, so we still accept that for + * scrubbing pre-rtgroups filesystems. + */ + if (sm->sm_agno != 0) + goto out; + } + break; default: goto out; } @@ -605,9 +680,6 @@ xfs_scrub_metadata( if (error) goto out; - xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, - "EXPERIMENTAL online scrub feature in use. 
Use at your own risk!"); - sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); if (!sc) { error = -ENOMEM; diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 5993fcaffb2c..a3f1abc91390 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -73,6 +73,8 @@ enum xchk_type { ST_PERAG, /* per-AG metadata */ ST_FS, /* per-FS metadata */ ST_INODE, /* per-inode metadata */ + ST_GENERIC, /* determined by the scrubber */ + ST_RTGROUP, /* rtgroup metadata */ }; struct xchk_meta_ops { @@ -94,7 +96,7 @@ struct xchk_meta_ops { int (*repair_eval)(struct xfs_scrub *sc); /* Decide if we even have this piece of metadata. */ - bool (*has)(struct xfs_mount *); + bool (*has)(const struct xfs_mount *); /* type describing required/allowed inputs */ enum xchk_type type; @@ -117,6 +119,19 @@ struct xchk_ag { struct xfs_btree_cur *refc_cur; }; +/* Inode lock state for the RT volume. */ +struct xchk_rt { + /* incore rtgroup, if applicable */ + struct xfs_rtgroup *rtg; + + /* XFS_RTGLOCK_* lock state if locked */ + unsigned int rtlock_flags; + + /* rtgroup btrees */ + struct xfs_btree_cur *rmap_cur; + struct xfs_btree_cur *refc_cur; +}; + struct xfs_scrub { /* General scrub state. */ struct xfs_mount *mp; @@ -173,11 +188,20 @@ struct xfs_scrub { */ unsigned int sick_mask; + /* + * Clear these XFS_SICK_* flags but only if the scan is ok. Useful for + * removing ZAPPED flags after a repair. + */ + unsigned int healthy_mask; + /* next time we want to cond_resched() */ struct xchk_relax relax; /* State tracking for single-AG operations. */ struct xchk_ag sa; + + /* State tracking for realtime operations. */ + struct xchk_rt sr; }; /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ @@ -255,12 +279,19 @@ int xchk_xattr(struct xfs_scrub *sc); int xchk_symlink(struct xfs_scrub *sc); int xchk_parent(struct xfs_scrub *sc); int xchk_dirtree(struct xfs_scrub *sc); +int xchk_metapath(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xchk_rtbitmap(struct xfs_scrub *sc); int xchk_rtsummary(struct xfs_scrub *sc); +int xchk_rgsuperblock(struct xfs_scrub *sc); +int xchk_rtrmapbt(struct xfs_scrub *sc); +int xchk_rtrefcountbt(struct xfs_scrub *sc); #else # define xchk_rtbitmap xchk_nothing # define xchk_rtsummary xchk_nothing +# define xchk_rgsuperblock xchk_nothing +# define xchk_rtrmapbt xchk_nothing +# define xchk_rtrefcountbt xchk_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_quota(struct xfs_scrub *sc); @@ -294,8 +325,26 @@ void xchk_xref_is_not_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno, #ifdef CONFIG_XFS_RT void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, xfs_extlen_t len); +void xchk_xref_has_no_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_has_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_is_only_rt_owned_by(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len, const struct xfs_owner_info *oinfo); +void xchk_xref_is_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_is_not_rt_shared(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_is_not_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); #else # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) +# define xchk_xref_has_no_rt_owner(sc, rtbno, len) do { } while (0) +# define xchk_xref_has_rt_owner(sc, rtbno, len) do { } while (0) +# define xchk_xref_is_only_rt_owned_by(sc, bno, len, 
oinfo) do { } while (0) +# define xchk_xref_is_rt_cow_staging(sc, bno, len) do { } while (0) +# define xchk_xref_is_not_rt_shared(sc, bno, len) do { } while (0) +# define xchk_xref_is_not_rt_cow_staging(sc, bno, len) do { } while (0) #endif #endif /* __XFS_SCRUB_SCRUB_H__ */ diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index 7996c2335476..f8a37ea97791 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -80,6 +80,10 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck", [XFS_SCRUB_TYPE_NLINKS] = "nlinks", [XFS_SCRUB_TYPE_DIRTREE] = "dirtree", + [XFS_SCRUB_TYPE_METAPATH] = "metapath", + [XFS_SCRUB_TYPE_RGSUPER] = "rgsuper", + [XFS_SCRUB_TYPE_RTRMAPBT] = "rtrmapbt", + [XFS_SCRUB_TYPE_RTREFCBT] = "rtrefcountbt", }; /* Format the scrub stats into a text buffer, similar to pcp style. */ diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index d015a86ef460..953ce7be78dc 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -36,6 +36,7 @@ #include "scrub/tempfile.h" #include "scrub/tempexch.h" #include "scrub/reap.h" +#include "scrub/health.h" /* * Symbolic Link Repair @@ -233,7 +234,7 @@ xrep_symlink_salvage( * target zapped flag. */ if (buflen == 0) { - sc->sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); sprintf(target_buf, DUMMY_TARGET); } diff --git a/fs/xfs/scrub/tempexch.h b/fs/xfs/scrub/tempexch.h index 995ba187c5aa..eccda720c2ca 100644 --- a/fs/xfs/scrub/tempexch.h +++ b/fs/xfs/scrub/tempexch.h @@ -12,7 +12,7 @@ struct xrep_tempexch { }; int xrep_tempexch_trans_reserve(struct xfs_scrub *sc, int whichfork, - struct xrep_tempexch *ti); + xfs_fileoff_t off, xfs_filblks_t len, struct xrep_tempexch *ti); int xrep_tempexch_trans_alloc(struct xfs_scrub *sc, int whichfork, struct xrep_tempexch *ti); diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index 177f922acfaf..cf99e0ca51b0 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -22,6 +22,7 @@ #include "xfs_exchmaps.h" #include "xfs_defer.h" #include "xfs_symlink_remote.h" +#include "xfs_metafile.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" @@ -182,6 +183,114 @@ out_release_dquots: return error; } +/* + * Move sc->tempip from the regular directory tree to the metadata directory + * tree if sc->ip is part of the metadata directory tree and tempip has an + * eligible file mode. + * + * Temporary files have to be created before we even know which inode we're + * going to scrub, so we assume that they will be part of the regular directory + * tree. If it turns out that we're actually scrubbing a file from the + * metadata directory tree, we have to subtract the temp file from the root + * dquots and detach the dquots prior to setting the METADATA iflag. However, + * the scrub setup functions grab sc->ip and create sc->tempip before we + * actually get around to checking if the file mode is the right type for the + * scrubber. 
+ */ +int +xrep_tempfile_adjust_directory_tree( + struct xfs_scrub *sc) +{ + int error; + + if (!sc->tempip) + return 0; + + ASSERT(sc->tp == NULL); + ASSERT(!xfs_is_metadir_inode(sc->tempip)); + + if (!sc->ip || !xfs_is_metadir_inode(sc->ip)) + return 0; + if (!S_ISDIR(VFS_I(sc->tempip)->i_mode) && + !S_ISREG(VFS_I(sc->tempip)->i_mode)) + return 0; + + xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL); + sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; + + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_iolock; + + xrep_tempfile_ilock(sc); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + /* Metadir files are not accounted in quota, so drop icount */ + xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, -1L); + xfs_metafile_set_iflag(sc->tp, sc->tempip, XFS_METAFILE_UNKNOWN); + + error = xrep_trans_commit(sc); + if (error) + goto out_ilock; + + xfs_iflags_set(sc->tempip, XFS_IRECOVERY); + xfs_qm_dqdetach(sc->tempip); +out_ilock: + xrep_tempfile_iunlock(sc); +out_iolock: + xrep_tempfile_iounlock(sc); + return error; +} + +/* + * Remove this temporary file from the metadata directory tree so that it can + * be inactivated the normal way. + */ +STATIC int +xrep_tempfile_remove_metadir( + struct xfs_scrub *sc) +{ + int error; + + if (!sc->tempip || !xfs_is_metadir_inode(sc->tempip)) + return 0; + + ASSERT(sc->tp == NULL); + + xfs_iflags_clear(sc->tempip, XFS_IRECOVERY); + + xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL); + sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; + + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_iolock; + + xrep_tempfile_ilock(sc); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + xfs_metafile_clear_iflag(sc->tp, sc->tempip); + + /* Non-metadir files are accounted in quota, so bump bcount/icount */ + error = xfs_qm_dqattach_locked(sc->tempip, false); + if (error) + goto out_cancel; + + xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, 1L); + xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_BCOUNT, + sc->tempip->i_nblocks); + error = xrep_trans_commit(sc); + goto out_ilock; + +out_cancel: + xchk_trans_cancel(sc); +out_ilock: + xrep_tempfile_iunlock(sc); +out_iolock: + xrep_tempfile_iounlock(sc); + return error; +} + /* Take IOLOCK_EXCL on the temporary file, maybe. */ bool xrep_tempfile_iolock_nowait( @@ -290,6 +399,7 @@ xrep_tempfile_rele( sc->temp_ilock_flags = 0; } + xrep_tempfile_remove_metadir(sc); xchk_irele(sc, sc->tempip); sc->tempip = NULL; } @@ -496,6 +606,8 @@ STATIC int xrep_tempexch_prep_request( struct xfs_scrub *sc, int whichfork, + xfs_fileoff_t off, + xfs_filblks_t len, struct xrep_tempexch *tx) { struct xfs_exchmaps_req *req = &tx->req; @@ -519,18 +631,19 @@ xrep_tempexch_prep_request( /* Exchange all mappings in both forks. */ req->ip1 = sc->tempip; req->ip2 = sc->ip; - req->startoff1 = 0; - req->startoff2 = 0; + req->startoff1 = off; + req->startoff2 = off; switch (whichfork) { case XFS_ATTR_FORK: req->flags |= XFS_EXCHMAPS_ATTR_FORK; break; case XFS_DATA_FORK: - /* Always exchange sizes when exchanging data fork mappings. */ - req->flags |= XFS_EXCHMAPS_SET_SIZES; + /* Exchange sizes when exchanging all data fork mappings. */ + if (off == 0 && len == XFS_MAX_FILEOFF) + req->flags |= XFS_EXCHMAPS_SET_SIZES; break; } - req->blockcount = XFS_MAX_FILEOFF; + req->blockcount = len; return 0; } @@ -639,6 +752,7 @@ xrep_tempexch_reserve_quota( * or the two inodes have the same dquots. 
*/ if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || + xfs_is_metadir_inode(req->ip1) || (req->ip1->i_udquot == req->ip2->i_udquot && req->ip1->i_gdquot == req->ip2->i_gdquot && req->ip1->i_pdquot == req->ip2->i_pdquot)) @@ -685,6 +799,8 @@ int xrep_tempexch_trans_reserve( struct xfs_scrub *sc, int whichfork, + xfs_fileoff_t off, + xfs_filblks_t len, struct xrep_tempexch *tx) { int error; @@ -693,7 +809,7 @@ xrep_tempexch_trans_reserve( xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL); xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL); - error = xrep_tempexch_prep_request(sc, whichfork, tx); + error = xrep_tempexch_prep_request(sc, whichfork, off, len, tx); if (error) return error; @@ -731,7 +847,8 @@ xrep_tempexch_trans_alloc( ASSERT(sc->tp == NULL); ASSERT(xfs_has_exchange_range(sc->mp)); - error = xrep_tempexch_prep_request(sc, whichfork, tx); + error = xrep_tempexch_prep_request(sc, whichfork, 0, XFS_MAX_FILEOFF, + tx); if (error) return error; @@ -844,6 +961,17 @@ xrep_is_tempfile( const struct xfs_inode *ip) { const struct inode *inode = &ip->i_vnode; + struct xfs_mount *mp = ip->i_mount; + + /* + * Files in the metadata directory tree also have S_PRIVATE set and + * IOP_XATTR unset, so we must distinguish them separately. We (ab)use + * the IRECOVERY flag to mark temporary metadir inodes knowing that the + * end of log recovery clears IRECOVERY, so the only ones that can + * exist during online repair are the ones we create. + */ + if (xfs_has_metadir(mp) && (ip->i_diflags2 & XFS_DIFLAG2_METADATA)) + return __xfs_iflags_test(ip, XFS_IRECOVERY); if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR)) return true; diff --git a/fs/xfs/scrub/tempfile.h b/fs/xfs/scrub/tempfile.h index e51399f595fe..71c1b54599c3 100644 --- a/fs/xfs/scrub/tempfile.h +++ b/fs/xfs/scrub/tempfile.h @@ -10,6 +10,8 @@ int xrep_tempfile_create(struct xfs_scrub *sc, uint16_t mode); void xrep_tempfile_rele(struct xfs_scrub *sc); +int xrep_tempfile_adjust_directory_tree(struct xfs_scrub *sc); + bool xrep_tempfile_iolock_nowait(struct xfs_scrub *sc); int xrep_tempfile_iolock_polled(struct xfs_scrub *sc); void xrep_tempfile_iounlock(struct xfs_scrub *sc); @@ -42,6 +44,7 @@ static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc) xchk_ilock(sc, XFS_IOLOCK_EXCL); } # define xrep_is_tempfile(ip) (false) +# define xrep_tempfile_adjust_directory_tree(sc) (0) # define xrep_tempfile_rele(sc) #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 4470ad0533b8..2450e214103f 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -20,6 +20,8 @@ #include "xfs_dir2.h" #include "xfs_rmap.h" #include "xfs_parent.h" +#include "xfs_metafile.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index c886d5d0eb02..d7c4ced47c15 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -17,6 +17,7 @@ #include "xfs_bit.h" #include "xfs_quota_defs.h" +struct xfs_rtgroup; struct xfs_scrub; struct xfile; struct xfarray; @@ -40,6 +41,9 @@ struct xchk_dirtree_outcomes; TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); +TRACE_DEFINE_ENUM(XG_TYPE_AG); +TRACE_DEFINE_ENUM(XG_TYPE_RTG); + TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF); @@ -70,6 +74,10 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); 
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTREFCBT); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -101,7 +109,11 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \ { XFS_SCRUB_TYPE_HEALTHY, "healthy" }, \ { XFS_SCRUB_TYPE_DIRTREE, "dirtree" }, \ - { XFS_SCRUB_TYPE_BARRIER, "barrier" } + { XFS_SCRUB_TYPE_BARRIER, "barrier" }, \ + { XFS_SCRUB_TYPE_METAPATH, "metapath" }, \ + { XFS_SCRUB_TYPE_RGSUPER, "rgsuper" }, \ + { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" }, \ + { XFS_SCRUB_TYPE_RTREFCBT, "rtrefcountbt" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ @@ -601,7 +613,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, TP_fast_assign( xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; - __entry->ino = sc->ip->i_ino; + __entry->ino = cur->bc_ino.ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; __assign_str(name); @@ -772,12 +784,12 @@ TRACE_EVENT(xchk_xref_error, ); TRACE_EVENT(xchk_iallocbt_check_cluster, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t startino, xfs_daddr_t map_daddr, - unsigned short map_len, unsigned int chunk_ino, - unsigned int nr_inodes, uint16_t cluster_mask, - uint16_t holemask, unsigned int cluster_ino), - TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes, + TP_PROTO(const struct xfs_perag *pag, xfs_agino_t startino, + xfs_daddr_t map_daddr, unsigned short map_len, + unsigned int chunk_ino, unsigned int nr_inodes, + uint16_t cluster_mask, uint16_t holemask, + unsigned int cluster_ino), + TP_ARGS(pag, startino, map_daddr, map_len, chunk_ino, nr_inodes, cluster_mask, holemask, cluster_ino), TP_STRUCT__entry( __field(dev_t, dev) @@ -792,8 +804,8 @@ TRACE_EVENT(xchk_iallocbt_check_cluster, __field(uint16_t, holemask) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->startino = startino; __entry->map_daddr = map_daddr; __entry->map_len = map_len; @@ -922,7 +934,8 @@ DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze); DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw); TRACE_EVENT(xchk_refcount_incorrect, - TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec, + TP_PROTO(const struct xfs_perag *pag, + const struct xfs_refcount_irec *irec, xfs_nlink_t seen), TP_ARGS(pag, irec, seen), TP_STRUCT__entry( @@ -935,8 +948,8 @@ TRACE_EVENT(xchk_refcount_incorrect, __field(xfs_nlink_t, seen) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; @@ -1752,6 +1765,7 @@ DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_badgen); DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_nondir_parent); DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_unlinked_parent); DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_found_next_step); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_crosses_tree); TRACE_DEFINE_ENUM(XCHK_DIRPATH_SCANNING); TRACE_DEFINE_ENUM(XCHK_DIRPATH_DELETE); @@ -1914,34 +1928,72 @@ TRACE_EVENT(xchk_dirtree_live_update, __get_str(name)) ); 
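Aside: xrep_rtrmapbt_live_update() earlier in this patch is registered through xfs_rmap_hook_add(), and the hook's embedded notifier_block (see the container_of() on rhook.rmap_hook.nb) shows that the XFS hook types are built on the kernel's generic notifier-chain machinery, so every rmap update against the group is replayed into the repair code's in-memory shadow btree while the inode scan runs unlocked. Below is a minimal, self-contained sketch of that publish/subscribe pattern only — the demo_* names are invented, and it uses a plain blocking notifier chain rather than the XFS hook wrappers:

#include <linux/module.h>
#include <linux/notifier.h>

/* Chain that the publishing (filesystem) side broadcasts updates to. */
static BLOCKING_NOTIFIER_HEAD(demo_update_chain);

struct demo_update_params {
	unsigned long	startblock;
	unsigned long	blockcount;
};

/* Subscriber: mirror each published update into private repair state. */
static int
demo_live_update(
	struct notifier_block	*nb,
	unsigned long		action,
	void			*data)
{
	struct demo_update_params *p = data;

	pr_info("demo: action %lu startblock %lu blockcount %lu\n",
			action, p->startblock, p->blockcount);
	return NOTIFY_DONE;
}

static struct notifier_block demo_nb = {
	.notifier_call	= demo_live_update,
};

static int __init
demo_init(void)
{
	struct demo_update_params p = { .startblock = 1, .blockcount = 8 };

	/* Repair side: start listening before dropping the lock. */
	blocking_notifier_chain_register(&demo_update_chain, &demo_nb);

	/* Publisher side: broadcast one update to every subscriber. */
	blocking_notifier_call_chain(&demo_update_chain, 0, &p);

	blocking_notifier_chain_unregister(&demo_update_chain, &demo_nb);
	return 0;
}

static void __exit
demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The real hooks additionally gate the notifier call so it costs nothing when no repair is running (they are only armed via xchk_fsgates_enable(), as seen in the XCHK_FSGATES_RMAP assert above); the sketch omits that detail.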
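Similarly, the tracepoint classes below decode the new enum xfs_group_type discriminator with __print_symbolic(), and the TRACE_DEFINE_ENUM(XG_TYPE_AG)/TRACE_DEFINE_ENUM(XG_TYPE_RTG) lines added near the top of trace.h exist so that trace-format parsers can resolve the enum constants appearing in the symbolic table. A self-contained sketch of the idiom follows — the header scaffolding is the standard ftrace boilerplate, while xfs_demo_group_visit and DEMO_TYPE_STRINGS are invented for illustration:

#undef TRACE_SYSTEM
#define TRACE_SYSTEM xfs_demo

#if !defined(_DEMO_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _DEMO_TRACE_H

#include <linux/tracepoint.h>
#include "xfs_types.h"	/* assumed to provide enum xfs_group_type */

/* Export the enum values so __print_symbolic() output can be decoded. */
TRACE_DEFINE_ENUM(XG_TYPE_AG);
TRACE_DEFINE_ENUM(XG_TYPE_RTG);

#define DEMO_TYPE_STRINGS \
	{ XG_TYPE_AG,	"ag" }, \
	{ XG_TYPE_RTG,	"rtg" }

TRACE_EVENT(xfs_demo_group_visit,
	TP_PROTO(enum xfs_group_type type, uint32_t gno),
	TP_ARGS(type, gno),
	TP_STRUCT__entry(
		__field(enum xfs_group_type, type)
		__field(uint32_t, gno)
	),
	TP_fast_assign(
		__entry->type = type;
		__entry->gno = gno;
	),
	/* "%sno" renders as "agno" or "rtgno" depending on the group type. */
	TP_printk("%sno 0x%x",
		  __print_symbolic(__entry->type, DEMO_TYPE_STRINGS),
		  __entry->gno)
);

#endif /* _DEMO_TRACE_H */

/* This boilerplate must stay outside the multi-read guard. */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE demo_trace
#include <trace/define_trace.h>

This is the same pairing the patch applies when it converts, for example, xrep_extent_class to take a struct xfs_group and print "%sno 0x%x" through XG_TYPE_STRINGS.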
+DECLARE_EVENT_CLASS(xchk_metapath_class, + TP_PROTO(struct xfs_scrub *sc, const char *path, + struct xfs_inode *dp, xfs_ino_t ino), + TP_ARGS(sc, path, dp, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, scrub_ino) + __field(xfs_ino_t, parent_ino) + __field(xfs_ino_t, ino) + __string(name, path) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->scrub_ino = sc->ip ? sc->ip->i_ino : NULLFSINO; + __entry->parent_ino = dp ? dp->i_ino : NULLFSINO; + __entry->ino = ino; + __assign_str(name); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx name '%s' ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->scrub_ino, + __entry->parent_ino, + __get_str(name), + __entry->ino) +); +#define DEFINE_XCHK_METAPATH_EVENT(name) \ +DEFINE_EVENT(xchk_metapath_class, name, \ + TP_PROTO(struct xfs_scrub *sc, const char *path, \ + struct xfs_inode *dp, xfs_ino_t ino), \ + TP_ARGS(sc, path, dp, ino)) +DEFINE_XCHK_METAPATH_EVENT(xchk_metapath_lookup); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) DECLARE_EVENT_CLASS(xrep_extent_class, - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), - TP_ARGS(pag, agbno, len), + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, + xfs_extlen_t len), + TP_ARGS(xg, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len) ); #define DEFINE_REPAIR_EXTENT_EVENT(name) \ DEFINE_EVENT(xrep_extent_class, name, \ - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), \ - TP_ARGS(pag, agbno, len)) + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ + xfs_extlen_t len), \ + TP_ARGS(xg, agbno, len)) DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); @@ -1949,43 +2001,45 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); DECLARE_EVENT_CLASS(xrep_reap_find_class, - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, - bool crosslinked), - TP_ARGS(pag, agbno, len, crosslinked), + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, + xfs_extlen_t len, bool crosslinked), + TP_ARGS(xg, agbno, len, crosslinked), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) __field(bool, crosslinked) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; __entry->crosslinked = crosslinked; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d", + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x 
fsbcount 0x%x crosslinked %d", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len, __entry->crosslinked ? 1 : 0) ); #define DEFINE_REPAIR_REAP_FIND_EVENT(name) \ DEFINE_EVENT(xrep_reap_find_class, name, \ - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, \ - bool crosslinked), \ - TP_ARGS(pag, agbno, len, crosslinked)) + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ + xfs_extlen_t len, bool crosslinked), \ + TP_ARGS(xg, agbno, len, crosslinked)) DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select); DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select); -DECLARE_EVENT_CLASS(xrep_rmap_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len, - uint64_t owner, uint64_t offset, unsigned int flags), - TP_ARGS(mp, agno, agbno, len, owner, offset, flags), +TRACE_EVENT(xrep_ibt_walk_rmap, + TP_PROTO(const struct xfs_perag *pag, const struct xfs_rmap_irec *rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -1996,13 +2050,13 @@ DECLARE_EVENT_CLASS(xrep_rmap_class, __field(unsigned int, flags) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->agbno = agbno; - __entry->len = len; - __entry->owner = owner; - __entry->offset = offset; - __entry->flags = flags; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->agbno = rec->rm_startblock; + __entry->len = rec->rm_blockcount; + __entry->owner = rec->rm_owner; + __entry->offset = rec->rm_offset; + __entry->flags = rec->rm_flags; ), TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -2013,19 +2067,11 @@ DECLARE_EVENT_CLASS(xrep_rmap_class, __entry->offset, __entry->flags) ); -#define DEFINE_REPAIR_RMAP_EVENT(name) \ -DEFINE_EVENT(xrep_rmap_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_extlen_t len, \ - uint64_t owner, uint64_t offset, unsigned int flags), \ - TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); -DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap); TRACE_EVENT(xrep_abt_found, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + TP_PROTO(const struct xfs_perag *pag, const struct xfs_alloc_rec_incore *rec), - TP_ARGS(mp, agno, rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2033,8 +2079,8 @@ TRACE_EVENT(xrep_abt_found, __field(xfs_extlen_t, blockcount) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->startblock = rec->ar_startblock; __entry->blockcount = rec->ar_blockcount; ), @@ -2046,9 +2092,9 @@ TRACE_EVENT(xrep_abt_found, ) TRACE_EVENT(xrep_ibt_found, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + TP_PROTO(const struct xfs_perag *pag, const struct xfs_inobt_rec_incore *rec), - TP_ARGS(mp, agno, rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2059,8 +2105,8 @@ TRACE_EVENT(xrep_ibt_found, __field(uint64_t, freemask) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); 
__entry->startino = rec->ir_startino; __entry->holemask = rec->ir_holemask; __entry->count = rec->ir_count; @@ -2078,28 +2124,33 @@ TRACE_EVENT(xrep_ibt_found, ) TRACE_EVENT(xrep_refc_found, - TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec), - TP_ARGS(pag, rec), + TP_PROTO(const struct xfs_group *xg, + const struct xfs_refcount_irec *rec), + TP_ARGS(xg, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, domain) + __field(enum xfs_group_type, type) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->agno = xg->xg_gno; + __entry->type = xg->xg_type; __entry->domain = rec->rc_domain; __entry->startblock = rec->rc_startblock; __entry->blockcount = rec->rc_blockcount; __entry->refcount = rec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d %sno 0x%x dom %s %sbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) @@ -2138,9 +2189,8 @@ TRACE_EVENT(xrep_bmap_found, ); TRACE_EVENT(xrep_rmap_found, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - const struct xfs_rmap_irec *rec), - TP_ARGS(mp, agno, rec), + TP_PROTO(const struct xfs_perag *pag, const struct xfs_rmap_irec *rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2151,8 +2201,8 @@ TRACE_EVENT(xrep_rmap_found, __field(unsigned int, flags) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = rec->rm_startblock; __entry->len = rec->rm_blockcount; __entry->owner = rec->rm_owner; @@ -2170,9 +2220,9 @@ TRACE_EVENT(xrep_rmap_found, ); TRACE_EVENT(xrep_findroot_block, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, uint32_t magic, uint16_t level), - TP_ARGS(mp, agno, agbno, magic, level), + TP_ARGS(pag, agbno, magic, level), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2181,8 +2231,8 @@ TRACE_EVENT(xrep_findroot_block, __field(uint16_t, level) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->magic = magic; __entry->level = level; @@ -2195,10 +2245,10 @@ TRACE_EVENT(xrep_findroot_block, __entry->level) ) TRACE_EVENT(xrep_calc_ag_resblks, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t icount, xfs_agblock_t aglen, xfs_agblock_t freelen, + TP_PROTO(const struct xfs_perag *pag, xfs_agino_t icount, + xfs_agblock_t aglen, xfs_agblock_t freelen, xfs_agblock_t usedlen), - TP_ARGS(mp, agno, icount, aglen, freelen, usedlen), + TP_ARGS(pag, icount, aglen, freelen, usedlen), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2208,8 +2258,8 @@ TRACE_EVENT(xrep_calc_ag_resblks, __field(xfs_agblock_t, usedlen) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = 
agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->icount = icount; __entry->aglen = aglen; __entry->freelen = freelen; @@ -2224,10 +2274,10 @@ TRACE_EVENT(xrep_calc_ag_resblks, __entry->usedlen) ) TRACE_EVENT(xrep_calc_ag_resblks_btsize, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t bnobt_sz, xfs_agblock_t inobt_sz, - xfs_agblock_t rmapbt_sz, xfs_agblock_t refcbt_sz), - TP_ARGS(mp, agno, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz), + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t bnobt_sz, + xfs_agblock_t inobt_sz, xfs_agblock_t rmapbt_sz, + xfs_agblock_t refcbt_sz), + TP_ARGS(pag, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2237,8 +2287,8 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize, __field(xfs_agblock_t, refcbt_sz) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->bnobt_sz = bnobt_sz; __entry->inobt_sz = inobt_sz; __entry->rmapbt_sz = rmapbt_sz; @@ -2252,6 +2302,32 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize, __entry->rmapbt_sz, __entry->refcbt_sz) ) + +#ifdef CONFIG_XFS_RT +TRACE_EVENT(xrep_calc_rtgroup_resblks_btsize, + TP_PROTO(struct xfs_mount *mp, xfs_rgnumber_t rgno, + xfs_rgblock_t usedlen, xfs_rgblock_t rmapbt_sz), + TP_ARGS(mp, rgno, usedlen, rmapbt_sz), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, usedlen) + __field(xfs_rgblock_t, rmapbt_sz) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rgno = rgno; + __entry->usedlen = usedlen; + __entry->rmapbt_sz = rmapbt_sz; + ), + TP_printk("dev %d:%d rgno 0x%x usedlen %u rmapbt %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->usedlen, + __entry->rmapbt_sz) +); +#endif /* CONFIG_XFS_RT */ + TRACE_EVENT(xrep_reset_counters, TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc), TP_ARGS(mp, fsc), @@ -2278,10 +2354,9 @@ TRACE_EVENT(xrep_reset_counters, ) DECLARE_EVENT_CLASS(xrep_newbt_extent_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len, - int64_t owner), - TP_ARGS(mp, agno, agbno, len, owner), + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, + xfs_extlen_t len, int64_t owner), + TP_ARGS(pag, agbno, len, owner), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2290,8 +2365,8 @@ DECLARE_EVENT_CLASS(xrep_newbt_extent_class, __field(int64_t, owner) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->len = len; __entry->owner = owner; @@ -2305,10 +2380,9 @@ DECLARE_EVENT_CLASS(xrep_newbt_extent_class, ); #define DEFINE_NEWBT_EXTENT_EVENT(name) \ DEFINE_EVENT(xrep_newbt_extent_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_extlen_t len, \ - int64_t owner), \ - TP_ARGS(mp, agno, agbno, len, owner)) + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \ + xfs_extlen_t len, int64_t owner), \ + TP_ARGS(pag, agbno, len, owner)) DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks); DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks); DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks); @@ -2596,7 +2670,7 @@ TRACE_EVENT(xrep_cow_replace_mapping, ); TRACE_EVENT(xrep_cow_free_staging, - 
TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t blockcount), TP_ARGS(pag, agbno, blockcount), TP_STRUCT__entry( @@ -2606,8 +2680,8 @@ TRACE_EVENT(xrep_cow_free_staging, __field(xfs_extlen_t, blockcount) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->blockcount = blockcount; ), @@ -2652,11 +2726,12 @@ DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode); DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode); TRACE_EVENT(xrep_rmap_live_update, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int op, + TP_PROTO(const struct xfs_group *xg, unsigned int op, const struct xfs_rmap_update_params *p), - TP_ARGS(mp, agno, op, p), + TP_ARGS(xg, op, p), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(unsigned int, op) __field(xfs_agblock_t, agbno) @@ -2666,8 +2741,9 @@ TRACE_EVENT(xrep_rmap_live_update, __field(unsigned int, flags) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->op = op; __entry->agbno = p->startblock; __entry->len = p->blockcount; @@ -2676,10 +2752,12 @@ TRACE_EVENT(xrep_rmap_live_update, if (p->unwritten) __entry->flags |= XFS_RMAP_UNWRITTEN; ), - TP_printk("dev %d:%d agno 0x%x op %d agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + TP_printk("dev %d:%d %sno 0x%x op %d %sbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->op, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len, __entry->owner, @@ -3313,7 +3391,7 @@ DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_rebuild); DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_reset_fork); TRACE_EVENT(xrep_iunlink_visit, - TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, xfs_agino_t bucket_agino, struct xfs_inode *ip), TP_ARGS(pag, bucket, bucket_agino, ip), TP_STRUCT__entry( @@ -3326,9 +3404,9 @@ TRACE_EVENT(xrep_iunlink_visit, __field(xfs_agino_t, next_agino) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; - __entry->agino = XFS_INO_TO_AGINO(pag->pag_mount, ip->i_ino); + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->agino = XFS_INO_TO_AGINO(pag_mount(pag), ip->i_ino); __entry->bucket = bucket; __entry->bucket_agino = bucket_agino; __entry->prev_agino = ip->i_prev_unlinked; @@ -3403,7 +3481,7 @@ TRACE_EVENT(xrep_iunlink_reload_ondisk, ); TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket, - TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, xfs_agino_t prev_agino, xfs_agino_t next_agino), TP_ARGS(pag, bucket, prev_agino, next_agino), TP_STRUCT__entry( @@ -3414,8 +3492,8 @@ TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket, __field(xfs_agino_t, next_agino) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->bucket = bucket; __entry->prev_agino = prev_agino; 
__entry->next_agino = next_agino; @@ -3429,7 +3507,7 @@ TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket, ); DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class, - TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, xfs_agino_t prev_agino, xfs_agino_t next_agino), TP_ARGS(pag, bucket, prev_agino, next_agino), TP_STRUCT__entry( @@ -3440,8 +3518,8 @@ DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class, __field(xfs_agino_t, next_agino) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->bucket = bucket; __entry->prev_agino = prev_agino; __entry->next_agino = next_agino; @@ -3455,7 +3533,7 @@ DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class, ); #define DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(name) \ DEFINE_EVENT(xrep_iunlink_resolve_class, name, \ - TP_PROTO(struct xfs_perag *pag, unsigned int bucket, \ + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, \ xfs_agino_t prev_agino, xfs_agino_t next_agino), \ TP_ARGS(pag, bucket, prev_agino, next_agino)) DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_uncached); @@ -3516,7 +3594,7 @@ TRACE_EVENT(xrep_iunlink_relink_prev, ); TRACE_EVENT(xrep_iunlink_add_to_bucket, - TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, xfs_agino_t agino, xfs_agino_t curr_head), TP_ARGS(pag, bucket, agino, curr_head), TP_STRUCT__entry( @@ -3527,8 +3605,8 @@ TRACE_EVENT(xrep_iunlink_add_to_bucket, __field(xfs_agino_t, next_agino) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->bucket = bucket; __entry->agino = agino; __entry->next_agino = curr_head; @@ -3542,7 +3620,7 @@ TRACE_EVENT(xrep_iunlink_add_to_bucket, ); TRACE_EVENT(xrep_iunlink_commit_bucket, - TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, xfs_agino_t old_agino, xfs_agino_t agino), TP_ARGS(pag, bucket, old_agino, agino), TP_STRUCT__entry( @@ -3553,8 +3631,8 @@ TRACE_EVENT(xrep_iunlink_commit_bucket, __field(xfs_agino_t, agino) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->bucket = bucket; __entry->old_agino = old_agino; __entry->agino = agino; @@ -3572,6 +3650,191 @@ DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_delete_path); DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_create_adoption); DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xrep_dirtree_decided_fate); +DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_lookup); +DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_try_unlink); +DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_unlink); +DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_link); + +#ifdef CONFIG_XFS_RT +DECLARE_EVENT_CLASS(xrep_rtbitmap_class, + TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, xfs_rtxnum_t end), + TP_ARGS(mp, start, end), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_rtxnum_t, start) + __field(xfs_rtxnum_t, end) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->start = start; + __entry->end = end; + ), + TP_printk("dev %d:%d rtdev %d:%d startrtx 0x%llx endrtx 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + 
MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->start, + __entry->end) +); +#define DEFINE_REPAIR_RGBITMAP_EVENT(name) \ +DEFINE_EVENT(xrep_rtbitmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, \ + xfs_rtxnum_t end), \ + TP_ARGS(mp, start, end)) +DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rtbitmap_record_free); +DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rtbitmap_record_free_bulk); + +TRACE_EVENT(xrep_rtbitmap_or, + TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff, + xfs_rtword_t mask, xfs_rtword_t word), + TP_ARGS(mp, wordoff, mask, word), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(unsigned long long, wordoff) + __field(unsigned int, mask) + __field(unsigned int, word) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->wordoff = wordoff; + __entry->mask = mask; + __entry->word = word; + ), + TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx mask 0x%x word 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->wordoff, + __entry->mask, + __entry->word) +); + +TRACE_EVENT(xrep_rtbitmap_load, + TP_PROTO(struct xfs_rtgroup *rtg, xfs_fileoff_t rbmoff, + xfs_rtxnum_t rtx, xfs_rtxnum_t len), + TP_ARGS(rtg, rbmoff, rtx, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_fileoff_t, rbmoff) + __field(xfs_rtxnum_t, rtx) + __field(xfs_rtxnum_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rtdev = rtg_mount(rtg)->m_rtdev_targp->bt_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->rbmoff = rbmoff; + __entry->rtx = rtx; + __entry->len = len; + ), + TP_printk("dev %d:%d rtdev %d:%d rgno 0x%x rbmoff 0x%llx rtx 0x%llx rtxcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->rgno, + __entry->rbmoff, + __entry->rtx, + __entry->len) +); + +TRACE_EVENT(xrep_rtbitmap_load_words, + TP_PROTO(struct xfs_mount *mp, xfs_fileoff_t rbmoff, + unsigned long long wordoff, unsigned int wordcnt), + TP_ARGS(mp, rbmoff, wordoff, wordcnt), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_fileoff_t, rbmoff) + __field(unsigned long long, wordoff) + __field(unsigned int, wordcnt) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->rbmoff = rbmoff; + __entry->wordoff = wordoff; + __entry->wordcnt = wordcnt; + ), + TP_printk("dev %d:%d rtdev %d:%d rbmoff 0x%llx wordoff 0x%llx wordcnt 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->rbmoff, + __entry->wordoff, + __entry->wordcnt) +); + +TRACE_EVENT(xrep_rtbitmap_load_word, + TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff, + unsigned int bit, xfs_rtword_t ondisk_word, + xfs_rtword_t xfile_word, xfs_rtword_t word_mask), + TP_ARGS(mp, wordoff, bit, ondisk_word, xfile_word, word_mask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(unsigned long long, wordoff) + __field(unsigned int, bit) + __field(xfs_rtword_t, ondisk_word) + __field(xfs_rtword_t, xfile_word) + __field(xfs_rtword_t, word_mask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->wordoff = wordoff; + __entry->bit = bit; + __entry->ondisk_word = ondisk_word; + __entry->xfile_word = xfile_word; + __entry->word_mask = 
word_mask; + ), + TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx bit %u ondisk 0x%x(0x%x) inmem 0x%x(0x%x) result 0x%x mask 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->wordoff, + __entry->bit, + __entry->ondisk_word, + __entry->ondisk_word & __entry->word_mask, + __entry->xfile_word, + __entry->xfile_word & ~__entry->word_mask, + (__entry->xfile_word & ~__entry->word_mask) | + (__entry->ondisk_word & __entry->word_mask), + __entry->word_mask) +); + +TRACE_EVENT(xrep_rtrmap_found, + TP_PROTO(struct xfs_mount *mp, const struct xfs_rmap_irec *rec), + TP_ARGS(mp, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->rgbno = rec->rm_startblock; + __entry->len = rec->rm_blockcount; + __entry->owner = rec->rm_owner; + __entry->offset = rec->rm_offset; + __entry->flags = rec->rm_flags; + ), + TP_printk("dev %d:%d rtdev %d:%d rgbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->rgbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); +#endif /* CONFIG_XFS_RT */ + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 559a3a577097..63151feb9c3f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * Copyright (c) 2016-2018 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. * All Rights Reserved. */ #include "xfs.h" @@ -19,6 +19,9 @@ #include "xfs_reflink.h" #include "xfs_errortag.h" #include "xfs_error.h" +#include "xfs_icache.h" +#include "xfs_zone_alloc.h" +#include "xfs_rtgroup.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -76,6 +79,26 @@ xfs_setfilesize( return xfs_trans_commit(tp); } +static void +xfs_ioend_put_open_zones( + struct iomap_ioend *ioend) +{ + struct iomap_ioend *tmp; + + /* + * Put the open zone for all ioends merged into this one (if any). + */ + list_for_each_entry(tmp, &ioend->io_list, io_list) + xfs_open_zone_put(tmp->io_private); + + /* + * The main ioend might not have an open zone if the submission failed + * before xfs_zone_alloc_and_submit got called. + */ + if (ioend->io_private) + xfs_open_zone_put(ioend->io_private); +} + /* * IO write completion. */ @@ -85,6 +108,7 @@ xfs_end_ioend( { struct xfs_inode *ip = XFS_I(ioend->io_inode); struct xfs_mount *mp = ip->i_mount; + bool is_zoned = xfs_is_zoned_inode(ip); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; unsigned int nofs_flag; @@ -114,10 +138,11 @@ xfs_end_ioend( */ error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { - if (ioend->io_flags & IOMAP_F_SHARED) { + if (ioend->io_flags & IOMAP_IOEND_SHARED) { + ASSERT(!is_zoned); xfs_reflink_cancel_cow_range(ip, offset, size, true); xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, - offset + size); + offset + size, NULL); } goto done; } @@ -125,14 +150,21 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. 
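iomap can merge physically contiguous ioends before completion runs, and with zoned XFS every merged ioend carries its own open-zone reference in io_private. A minimal sketch of the resulting drop pattern follows; put_zone_ref() is a hypothetical stand-in for xfs_open_zone_put():

	static void put_ioend_zone_refs(struct iomap_ioend *head)
	{
		struct iomap_ioend *tmp;

		/* merged ioends were all submitted, so each holds a zone reference */
		list_for_each_entry(tmp, &head->io_list, io_list)
			put_zone_ref(tmp->io_private);

		/* the head can fail before xfs_zone_alloc_and_submit() assigns a zone */
		if (head->io_private)
			put_zone_ref(head->io_private);
	}

The asymmetry is deliberate: only ioends that were actually submitted are ever merged into io_list, while the head itself may have failed before an open zone was attached.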
*/ - if (ioend->io_flags & IOMAP_F_SHARED) + if (is_zoned) + error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector, + ioend->io_private, NULLFSBLOCK); + else if (ioend->io_flags & IOMAP_IOEND_SHARED) error = xfs_reflink_end_cow(ip, offset, size); - else if (ioend->io_type == IOMAP_UNWRITTEN) + else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); - if (!error && xfs_ioend_is_append(ioend)) - error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); + if (!error && + !(ioend->io_flags & IOMAP_IOEND_DIRECT) && + xfs_ioend_is_append(ioend)) + error = xfs_setfilesize(ip, offset, size); done: + if (is_zoned) + xfs_ioend_put_open_zones(ioend); iomap_finish_ioends(ioend, error); memalloc_nofs_restore(nofs_flag); } @@ -175,17 +207,27 @@ xfs_end_io( } } -STATIC void +void xfs_end_bio( struct bio *bio) { struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; unsigned long flags; + /* + * For Appends record the actually written block number and set the + * boundary flag if needed. + */ + if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) { + ioend->io_sector = bio->bi_iter.bi_sector; + xfs_mark_rtg_boundary(ioend); + } + spin_lock_irqsave(&ip->i_ioend_lock, flags); if (list_empty(&ip->i_ioend_list)) - WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue, + WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue, &ip->i_ioend_work)); list_add_tail(&ioend->io_list, &ip->i_ioend_list); spin_unlock_irqrestore(&ip->i_ioend_lock, flags); @@ -394,11 +436,31 @@ allocate_blocks: return 0; } +static bool +xfs_ioend_needs_wq_completion( + struct iomap_ioend *ioend) +{ + /* Changing inode size requires a transaction. */ + if (xfs_ioend_is_append(ioend)) + return true; + + /* Extent manipulation requires a transaction. */ + if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)) + return true; + + /* Page cache invalidation cannot be done in irq context. */ + if (ioend->io_flags & IOMAP_IOEND_DONTCACHE) + return true; + + return false; +} + static int -xfs_prepare_ioend( - struct iomap_ioend *ioend, +xfs_submit_ioend( + struct iomap_writepage_ctx *wpc, int status) { + struct iomap_ioend *ioend = wpc->ioend; unsigned int nofs_flag; /* @@ -409,7 +471,7 @@ xfs_prepare_ioend( nofs_flag = memalloc_nofs_save(); /* Convert CoW extents to regular */ - if (!status && (ioend->io_flags & IOMAP_F_SHARED)) { + if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); } @@ -417,10 +479,13 @@ xfs_prepare_ioend( memalloc_nofs_restore(nofs_flag); /* send ioends that might require a transaction to the completion wq */ - if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || - (ioend->io_flags & IOMAP_F_SHARED)) + if (xfs_ioend_needs_wq_completion(ioend)) ioend->io_bio.bi_end_io = xfs_end_bio; - return status; + + if (status) + return status; + submit_bio(&ioend->io_bio); + return 0; } /* @@ -457,12 +522,107 @@ xfs_discard_folio( * folio itself and not the start offset that is passed in. 
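The split between xfs_end_bio() and the workqueue above exists because bio completion handlers can run in interrupt context, while size updates, unwritten conversion, and page cache invalidation all may sleep. A self-contained sketch of the same deferral pattern, with demo_ioend/demo_end_bio as hypothetical names (INIT_WORK setup and the work handler are omitted):

	#include <linux/bio.h>
	#include <linux/workqueue.h>

	struct demo_ioend {
		struct work_struct	work;		/* runs the sleeping part */
		sector_t		written_sector;
	};

	static void demo_end_bio(struct bio *bio)
	{
		struct demo_ioend *io = bio->bi_private;

		/* a zone append bio reports the sector it actually wrote to */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
			io->written_sector = bio->bi_iter.bi_sector;

		schedule_work(&io->work);	/* transactions need process context */
	}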
*/ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, - folio_pos(folio) + folio_size(folio)); + folio_pos(folio) + folio_size(folio), NULL); } static const struct iomap_writeback_ops xfs_writeback_ops = { .map_blocks = xfs_map_blocks, - .prepare_ioend = xfs_prepare_ioend, + .submit_ioend = xfs_submit_ioend, + .discard_folio = xfs_discard_folio, +}; + +struct xfs_zoned_writepage_ctx { + struct iomap_writepage_ctx ctx; + struct xfs_open_zone *open_zone; +}; + +static inline struct xfs_zoned_writepage_ctx * +XFS_ZWPC(struct iomap_writepage_ctx *ctx) +{ + return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx); +} + +static int +xfs_zoned_map_blocks( + struct iomap_writepage_ctx *wpc, + struct inode *inode, + loff_t offset, + unsigned int len) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len); + xfs_filblks_t count_fsb; + struct xfs_bmbt_irec imap, del; + struct xfs_iext_cursor icur; + + if (xfs_is_shutdown(mp)) + return -EIO; + + XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS); + + /* + * All dirty data must be covered by delalloc extents. But truncate can + * remove delalloc extents underneath us or reduce their size. + * Returning a hole tells iomap to not write back any data from this + * range, which is the right thing to do in that case. + * + * Otherwise just tell iomap to treat ranges previously covered by a + * delalloc extent as mapped. The actual block allocation will be done + * just before submitting the bio. + * + * This implies we never map outside folios that are locked or marked + * as under writeback, and thus there is no need to check the fork + * sequence count here. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap)) + imap.br_startoff = end_fsb; /* fake a hole past EOF */ + if (imap.br_startoff > offset_fsb) { + imap.br_blockcount = imap.br_startoff - offset_fsb; + imap.br_startoff = offset_fsb; + imap.br_startblock = HOLESTARTBLOCK; + imap.br_state = XFS_EXT_NORM; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0); + return 0; + } + end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount); + count_fsb = end_fsb - offset_fsb; + + del = imap; + xfs_trim_extent(&del, offset_fsb, count_fsb); + xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del, + XFS_BMAPI_REMAP); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + wpc->iomap.type = IOMAP_MAPPED; + wpc->iomap.flags = IOMAP_F_DIRTY; + wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev; + wpc->iomap.offset = offset; + wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb); + wpc->iomap.flags = IOMAP_F_ANON_WRITE; + + trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length); + return 0; +} + +static int +xfs_zoned_submit_ioend( + struct iomap_writepage_ctx *wpc, + int status) +{ + wpc->ioend->io_bio.bi_end_io = xfs_end_bio; + if (status) + return status; + xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone); + return 0; +} + +static const struct iomap_writeback_ops xfs_zoned_writeback_ops = { + .map_blocks = xfs_zoned_map_blocks, + .submit_ioend = xfs_zoned_submit_ioend, .discard_folio = xfs_discard_folio, }; @@ -471,10 +631,25 @@ xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { }; + struct xfs_inode *ip = XFS_I(mapping->host); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + + if (xfs_is_zoned_inode(ip)) { + struct
xfs_zoned_writepage_ctx xc = { }; + int error; - xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); + error = iomap_writepages(mapping, wbc, &xc.ctx, + &xfs_zoned_writeback_ops); + if (xc.open_zone) + xfs_open_zone_put(xc.open_zone); + return error; + } else { + struct xfs_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc.ctx, + &xfs_writeback_ops); + } } STATIC int @@ -528,12 +703,44 @@ xfs_vm_readahead( } static int -xfs_iomap_swapfile_activate( +xfs_vm_swap_activate( struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { - sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev; + struct xfs_inode *ip = XFS_I(file_inode(swap_file)); + + /* + * Swap file activation can race against concurrent shared extent + * removal in files that have been cloned. If this happens, + * iomap_swapfile_iter() can fail because it encountered a shared + * extent even though an operation is in progress to remove those + * shared extents. + * + * This race becomes problematic when we defer extent removal + * operations beyond the end of a syscall (i.e. use async background + * processing algorithms). Users think the extents are no longer + * shared, but iomap_swapfile_iter() still sees them as shared + * because the refcountbt entries for the extents being removed have + * not yet been updated. Hence the swapon call fails unexpectedly. + * + * The race condition is currently most obvious from the unlink() + * operation as extent removal is deferred until after the last + * reference to the inode goes away. We then process the extent + * removal asynchronously, hence triggers the "syscall completed but + * work not done" condition mentioned above. To close this race + * window, we need to flush any pending inodegc operations to ensure + * they have updated the refcountbt records before we try to map the + * swapfile. + */ + xfs_inodegc_flush(ip->i_mount); + + /* + * Direct the swap code to the correct block device when this file + * sits on the RT device. 
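The swapon race closed above is easiest to follow as a timeline; the following reconstruction is illustrative only and assumes a cloned file A -> B:

	/*
	 * T0: FICLONE A -> B       extents of A are now shared
	 * T1: unlink(B) returns    refcountbt update deferred to inodegc
	 * T2: swapon(A)            iomap_swapfile_iter() still sees shared
	 *                          records and fails with -EINVAL
	 *
	 * Flushing inodegc between T1 and the extent walk forces the
	 * deferred update to finish before the walk starts:
	 */
	xfs_inodegc_flush(ip->i_mount);
	return iomap_swapfile_activate(sis, swap_file, span,
			&xfs_read_iomap_ops);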
+ */ + sis->bdev = xfs_inode_buftarg(ip)->bt_bdev; + return iomap_swapfile_activate(sis, swap_file, span, &xfs_read_iomap_ops); } @@ -549,11 +756,11 @@ const struct address_space_operations xfs_address_space_operations = { .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = xfs_iomap_swapfile_activate, + .swap_activate = xfs_vm_swap_activate, }; const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, .dirty_folio = noop_dirty_folio, - .swap_activate = xfs_iomap_swapfile_activate, + .swap_activate = xfs_vm_swap_activate, }; diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index e0bd68419764..5a7a0f1a0b49 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -9,6 +9,7 @@ extern const struct address_space_operations xfs_address_space_operations; extern const struct address_space_operations xfs_dax_aops; -int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); +int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); +void xfs_end_bio(struct bio *bio); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 24fb12986a56..319004bf089f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -305,11 +305,6 @@ xfs_attr3_root_inactive( XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &bp); if (error) return error; - error = bp->b_error; - if (error) { - xfs_trans_brelse(*trans, bp); - return error; - } xfs_trans_binval(*trans, bp); /* remove from cache */ /* * Commit the invalidate and start the next transaction. diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 7db386304875..379b48d015d2 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -114,7 +114,8 @@ xfs_attr_shortform_list( * It didn't all fit, so we have to sort everything on hashval. 
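One detail in the xfs_attr_shortform_list() hunk that follows: the sort buffer allocation gains __GFP_NOLOCKDEP. That flag only suppresses lockdep's reclaim-dependency tracking; it is meant for allocation sites that can be reached both from inside and from outside memory reclaim, where the shared call site would otherwise trigger false-positive lockdep reports. A minimal sketch with a hypothetical helper name:

	#include <linux/slab.h>

	/* hypothetical wrapper; the real call sits in xfs_attr_shortform_list() */
	static void *sort_buf_alloc(size_t size)
	{
		/* __GFP_NOFAIL: small, short-lived, must not error out */
		return kmalloc(size, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
	}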
*/ sbsize = sf->count * sizeof(*sbuf); - sbp = sbuf = kmalloc(sbsize, GFP_KERNEL | __GFP_NOFAIL); + sbp = sbuf = kmalloc(sbsize, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); /* * Scan the attribute list for the rest of the entries, storing diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index fe21c76f75b8..2a736d10eafb 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -18,42 +18,36 @@ xfs_rw_bdev( enum req_op op) { - unsigned int is_vmalloc = is_vmalloc_addr(data); - unsigned int left = count; + unsigned int done = 0, added; int error; struct bio *bio; - if (is_vmalloc && op == REQ_OP_WRITE) - flush_kernel_vmap_range(data, count); + op |= REQ_META | REQ_SYNC; + if (!is_vmalloc_addr(data)) + return bdev_rw_virt(bdev, sector, data, count, op); - bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC, - GFP_KERNEL); + bio = bio_alloc(bdev, bio_max_vecs(count), op, GFP_KERNEL); bio->bi_iter.bi_sector = sector; do { - struct page *page = kmem_to_page(data); - unsigned int off = offset_in_page(data); - unsigned int len = min_t(unsigned, left, PAGE_SIZE - off); - - while (bio_add_page(bio, page, len, off) != len) { + added = bio_add_vmalloc_chunk(bio, data + done, count - done); + if (!added) { struct bio *prev = bio; - bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left), + bio = bio_alloc(prev->bi_bdev, + bio_max_vecs(count - done), prev->bi_opf, GFP_KERNEL); bio->bi_iter.bi_sector = bio_end_sector(prev); bio_chain(prev, bio); submit_bio(prev); } - - data += len; - left -= len; - } while (left > 0); + done += added; + } while (done < count); error = submit_bio_wait(bio); bio_put(bio); - if (is_vmalloc && op == REQ_OP_READ) + if (op == REQ_OP_READ) invalidate_kernel_vmap_range(data, count); return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 35a8c1b8b3cb..646c515ee355 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -77,6 +77,11 @@ xfs_bui_item_size( *nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents); } +unsigned int xfs_bui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_bui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given bui log item. We use only 1 iovec, and we point that @@ -168,6 +173,11 @@ xfs_bud_item_size( *nbytes += sizeof(struct xfs_bud_log_format); } +unsigned int xfs_bud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_bud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given bud log item. We use only 1 iovec, and we point that @@ -318,14 +328,16 @@ xfs_bmap_update_create_done( return &budp->bud_item; } -/* Take a passive ref to the AG containing the space we're mapping. */ +/* Take a passive ref to the group containing the space we're mapping. */ static inline void xfs_bmap_update_get_group( struct xfs_mount *mp, struct xfs_bmap_intent *bi) { + enum xfs_group_type type = XG_TYPE_AG; + if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) - return; + type = XG_TYPE_RTG; /* * Bump the intent count on behalf of the deferred rmap and refcount @@ -334,7 +346,8 @@ xfs_bmap_update_get_group( * intent drops the intent count, ensuring that the intent count * remains nonzero across the transaction roll. */ - bi->bi_pag = xfs_perag_intent_get(mp, bi->bi_bmap.br_startblock); + bi->bi_group = xfs_group_intent_get(mp, bi->bi_bmap.br_startblock, + type); } /* Add this deferred BUI to the transaction.
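The xfs_rw_bdev() loop above leans on bio chaining, whose semantics are easy to misread: bio_chain() only links completions, it does not queue anything. Every bio in the chain still has to be submitted itself, which is why submit_bio(prev) stays in the loop. A two-bio illustration (sector setup and error handling elided):

	struct bio *a = bio_alloc(bdev, 1, REQ_OP_WRITE, GFP_KERNEL);
	struct bio *b = bio_alloc(bdev, 1, REQ_OP_WRITE, GFP_KERNEL);

	/* ... add payload to both bios ... */
	bio_chain(a, b);	/* b's completion now waits on a */
	submit_bio(a);		/* a must still be issued on its own */
	submit_bio_wait(b);	/* returns once both a and b are done */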
*/ @@ -343,8 +356,6 @@ xfs_bmap_defer_add( struct xfs_trans *tp, struct xfs_bmap_intent *bi) { - trace_xfs_bmap_defer(bi); - xfs_bmap_update_get_group(tp->t_mountp, bi); /* @@ -357,18 +368,9 @@ xfs_bmap_defer_add( */ if (bi->bi_type == XFS_BMAP_MAP) bi->bi_owner->i_delayed_blks += bi->bi_bmap.br_blockcount; - xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); -} - -/* Release a passive AG ref after finishing mapping work. */ -static inline void -xfs_bmap_update_put_group( - struct xfs_bmap_intent *bi) -{ - if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) - return; - xfs_perag_intent_put(bi->bi_pag); + trace_xfs_bmap_defer(bi); + xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); } /* Cancel a deferred bmap update. */ @@ -381,7 +383,7 @@ xfs_bmap_update_cancel_item( if (bi->bi_type == XFS_BMAP_MAP) bi->bi_owner->i_delayed_blks -= bi->bi_bmap.br_blockcount; - xfs_bmap_update_put_group(bi); + xfs_group_intent_put(bi->bi_group); kmem_cache_free(xfs_bmap_intent_cache, bi); } diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index 6fee6a508343..b42fee06899d 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -72,4 +72,7 @@ struct xfs_bmap_intent; void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi); +unsigned int xfs_bui_log_space(unsigned int nr); +unsigned int xfs_bud_log_space(void); + #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 4719ec90029c..06ca11731e43 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -29,6 +29,8 @@ #include "xfs_iomap.h" #include "xfs_reflink.h" #include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" /* Kernel only BMAP related definitions and functions */ @@ -41,16 +43,12 @@ xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) { if (XFS_IS_REALTIME_INODE(ip)) - return XFS_FSB_TO_BB(ip->i_mount, fsb); + return xfs_rtb_to_daddr(ip->i_mount, fsb); return XFS_FSB_TO_DADDR(ip->i_mount, fsb); } /* * Routine to zero an extent on disk allocated to the specific inode. - * - * The VFS functions take a linearised filesystem block offset, so we have to - * convert the sparse xfs fsb to the right format first. - * VFS types are real funky, too. 
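The get/put pair above generalizes the old per-AG intent references to the new xfs_group abstraction, so realtime mappings now pin their rtgroup instead of being skipped entirely. The lifecycle, condensed from the hunks above:

	/* defer time: pin the group that owns the mapped blocks */
	enum xfs_group_type type = XG_TYPE_AG;

	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
		type = XG_TYPE_RTG;
	bi->bi_group = xfs_group_intent_get(mp, bi->bi_bmap.br_startblock, type);

	/* cancel/finish time: one matching put, no fork check needed */
	xfs_group_intent_put(bi->bi_group);

Because the put is type-agnostic, the old xfs_bmap_update_put_group() wrapper with its duplicated realtime check can be deleted outright.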
*/ int xfs_zero_extent( @@ -58,15 +56,10 @@ xfs_zero_extent( xfs_fsblock_t start_fsb, xfs_off_t count_fsb) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_buftarg *target = xfs_inode_buftarg(ip); - xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); - sector_t block = XFS_BB_TO_FSBT(mp, sector); - - return blkdev_issue_zeroout(target->bt_bdev, - block << (mp->m_super->s_blocksize_bits - 9), - count_fsb << (mp->m_super->s_blocksize_bits - 9), - GFP_KERNEL, 0); + return blkdev_issue_zeroout(xfs_inode_buftarg(ip)->bt_bdev, + xfs_fsb_to_db(ip, start_fsb), + XFS_FSB_TO_BB(ip->i_mount, count_fsb), + GFP_KERNEL, 0); } /* @@ -111,7 +104,7 @@ xfs_bmap_count_blocks( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; - xfs_extlen_t btblocks = 0; + xfs_filblks_t btblocks = 0; int error; *nextents = 0; @@ -444,7 +437,8 @@ xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, int whichfork, xfs_off_t start_byte, - xfs_off_t end_byte) + xfs_off_t end_byte, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); @@ -475,7 +469,21 @@ xfs_bmap_punch_delalloc_range( continue; } - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); + if (xfs_is_zoned_inode(ip) && ac) { + /* + * In a zoned buffered write context we need to return + * the punched delalloc allocations to the allocation + * context. This allows reusing them in the following + * iomap iterations. + */ + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, XFS_BMAPI_REMAP); + ac->reserved_blocks += del.br_blockcount; + } else { + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, 0); + } + if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } @@ -540,16 +548,20 @@ xfs_can_free_eofblocks( */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); if (xfs_inode_has_bigrtalloc(ip)) - end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb); + end_fsb = xfs_fileoff_roundup_rtx(mp, end_fsb); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return false; /* - * Check if there is a post-EOF extent to free. + * Check if there is a post-EOF extent to free. If there are any + * delalloc blocks attached to the inode (data fork delalloc + * reservations or CoW extents of any kind), we need to free them so + * that inactivation doesn't fail to erase them. */ xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_iext_lookup_extent(ip, &ip->i_df, end_fsb, &icur, &imap)) + if (ip->i_delayed_blks || + xfs_iext_lookup_extent(ip, &ip->i_df, end_fsb, &icur, &imap)) found_blocks = true; xfs_iunlock(ip, XFS_ILOCK_SHARED); return found_blocks; @@ -586,7 +598,7 @@ xfs_free_eofblocks( if (ip->i_delayed_blks) { xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), - LLONG_MAX); + LLONG_MAX, NULL); } xfs_inode_clear_eofblocks_tag(ip); return 0; @@ -829,7 +841,8 @@ int xfs_free_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len) + xfs_off_t len, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t startoffset_fsb; @@ -858,8 +871,8 @@ xfs_free_file_space( /* We can only free complete realtime extents.
*/ if (xfs_inode_has_bigrtalloc(ip)) { - startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb); - endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb); + startoffset_fsb = xfs_fileoff_roundup_rtx(mp, startoffset_fsb); + endoffset_fsb = xfs_fileoff_rounddown_rtx(mp, endoffset_fsb); } /* @@ -884,7 +897,7 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - error = xfs_zero_range(ip, offset, len, NULL); + error = xfs_zero_range(ip, offset, len, ac, NULL); if (error) return error; @@ -972,7 +985,8 @@ int xfs_collapse_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len) + xfs_off_t len, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -985,7 +999,7 @@ xfs_collapse_file_space( trace_xfs_collapse_file_space(ip); - error = xfs_free_file_space(ip, offset, len); + error = xfs_free_file_space(ip, offset, len, ac); if (error) return error; @@ -1527,6 +1541,18 @@ xfs_swap_extents( goto out_unlock; } + /* + * The rmapbt implementation is unable to resume a swapext operation + * after a crash if the allocation unit size is larger than a block. + * This (deprecated) interface will not be upgraded to handle this + * situation. Defragmentation must be performed with the commit range + * ioctl. + */ + if (XFS_IS_REALTIME_INODE(ip) && xfs_has_rtgroups(ip->i_mount)) { + error = -EOPNOTSUPP; + goto out_unlock; + } + error = xfs_qm_dqattach(ip); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index b29760d36e1a..c477b3361630 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -15,6 +15,7 @@ struct xfs_inode; struct xfs_mount; struct xfs_trans; struct xfs_bmalloca; +struct xfs_zone_alloc_ctx; #ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); @@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) #endif /* CONFIG_XFS_RT */ void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork, - xfs_off_t start_byte, xfs_off_t end_byte); + xfs_off_t start_byte, xfs_off_t end_byte, + struct xfs_zone_alloc_ctx *ac); struct kgetbmap { __s64 bmv_offset; /* file offset of segment in blocks */ @@ -54,13 +56,13 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, /* preallocation and hole punch interface */ int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len); int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len, struct xfs_zone_alloc_ctx *ac); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len, struct xfs_zone_alloc_ctx *ac); int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len); /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index aa4dbda7b536..8af83bd161f9 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -22,17 +22,13 @@ #include "xfs_error.h" #include "xfs_ag.h" #include "xfs_buf_mem.h" +#include "xfs_notify_failure.h" struct kmem_cache *xfs_buf_cache; /* * Locking orders * - * xfs_buf_ioacct_inc: - * xfs_buf_ioacct_dec: - * b_sema (caller holds) - * b_lock - * * xfs_buf_stale: * b_sema (caller holds) * b_lock @@ -40,8 +36,7 @@ struct kmem_cache *xfs_buf_cache; * * xfs_buf_rele: * b_lock - * pag_buf_lock - * lru_lock + * lru_lock * * xfs_buftarg_drain_rele * lru_lock @@ -52,95 +47,14 @@ struct kmem_cache *xfs_buf_cache; * 
b_lock (trylock due to inversion) */ -static int __xfs_buf_submit(struct xfs_buf *bp, bool wait); - -static inline int -xfs_buf_submit( - struct xfs_buf *bp) -{ - return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC)); -} +static void xfs_buf_submit(struct xfs_buf *bp); +static int xfs_buf_iowait(struct xfs_buf *bp); static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) { return bp->b_rhash_key == XFS_BUF_DADDR_NULL; } -static inline int -xfs_buf_is_vmapped( - struct xfs_buf *bp) -{ - /* - * Return true if the buffer is vmapped. - * - * b_addr is null if the buffer is not mapped, but the code is clever - * enough to know it doesn't have to map a single page, so the check has - * to be both for b_addr and bp->b_page_count > 1. - */ - return bp->b_addr && bp->b_page_count > 1; -} - -static inline int -xfs_buf_vmap_len( - struct xfs_buf *bp) -{ - return (bp->b_page_count * PAGE_SIZE); -} - -/* - * Bump the I/O in flight count on the buftarg if we haven't yet done so for - * this buffer. The count is incremented once per buffer (per hold cycle) - * because the corresponding decrement is deferred to buffer release. Buffers - * can undergo I/O multiple times in a hold-release cycle and per buffer I/O - * tracking adds unnecessary overhead. This is used for sychronization purposes - * with unmount (see xfs_buftarg_drain()), so all we really need is a count of - * in-flight buffers. - * - * Buffers that are never released (e.g., superblock, iclog buffers) must set - * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count - * never reaches zero and unmount hangs indefinitely. - */ -static inline void -xfs_buf_ioacct_inc( - struct xfs_buf *bp) -{ - if (bp->b_flags & XBF_NO_IOACCT) - return; - - ASSERT(bp->b_flags & XBF_ASYNC); - spin_lock(&bp->b_lock); - if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { - bp->b_state |= XFS_BSTATE_IN_FLIGHT; - percpu_counter_inc(&bp->b_target->bt_io_count); - } - spin_unlock(&bp->b_lock); -} - -/* - * Clear the in-flight state on a buffer about to be released to the LRU or - * freed and unaccount from the buftarg. - */ -static inline void -__xfs_buf_ioacct_dec( - struct xfs_buf *bp) -{ - lockdep_assert_held(&bp->b_lock); - - if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { - bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; - percpu_counter_dec(&bp->b_target->bt_io_count); - } -} - -static inline void -xfs_buf_ioacct_dec( - struct xfs_buf *bp) -{ - spin_lock(&bp->b_lock); - __xfs_buf_ioacct_dec(bp); - spin_unlock(&bp->b_lock); -} - /* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer @@ -164,150 +78,24 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; - /* - * Once the buffer is marked stale and unlocked, a subsequent lookup - * could reset b_flags. There is no guarantee that the buffer is - * unaccounted (released to LRU) before that occurs. Drop in-flight - * status now to preserve accounting consistency. 
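The ioacct machinery deleted above existed so that xfs_buftarg_drain() could wait out async buffer I/O that nobody else would wait for. With submission made synchronous later in this patch, readahead is the only I/O left in that category, and it can be tracked with a bare percpu counter (the bt_readahead_count used further down). The general shape, as a sketch:

	#include <linux/delay.h>
	#include <linux/percpu_counter.h>

	struct percpu_counter inflight_readahead;

	percpu_counter_inc(&inflight_readahead);	/* on readahead submit */
	percpu_counter_dec(&inflight_readahead);	/* on completion */

	/* drain: the exact (summed) count must reach zero */
	while (percpu_counter_sum(&inflight_readahead) > 0)
		msleep(10);	/* illustrative back-off only */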
- */ spin_lock(&bp->b_lock); - __xfs_buf_ioacct_dec(bp); - atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) - atomic_dec(&bp->b_hold); + bp->b_hold--; - ASSERT(atomic_read(&bp->b_hold) >= 1); + ASSERT(bp->b_hold >= 1); spin_unlock(&bp->b_lock); } -static int -xfs_buf_get_maps( - struct xfs_buf *bp, - int map_count) -{ - ASSERT(bp->b_maps == NULL); - bp->b_map_count = map_count; - - if (map_count == 1) { - bp->b_maps = &bp->__b_map; - return 0; - } - - bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map), - GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); - if (!bp->b_maps) - return -ENOMEM; - return 0; -} - -/* - * Frees b_pages if it was allocated. - */ -static void -xfs_buf_free_maps( - struct xfs_buf *bp) -{ - if (bp->b_maps != &bp->__b_map) { - kfree(bp->b_maps); - bp->b_maps = NULL; - } -} - -static int -_xfs_buf_alloc( - struct xfs_buftarg *target, - struct xfs_buf_map *map, - int nmaps, - xfs_buf_flags_t flags, - struct xfs_buf **bpp) -{ - struct xfs_buf *bp; - int error; - int i; - - *bpp = NULL; - bp = kmem_cache_zalloc(xfs_buf_cache, - GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); - - /* - * We don't want certain flags to appear in b_flags unless they are - * specifically set by later operations on the buffer. - */ - flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); - - atomic_set(&bp->b_hold, 1); - atomic_set(&bp->b_lru_ref, 1); - init_completion(&bp->b_iowait); - INIT_LIST_HEAD(&bp->b_lru); - INIT_LIST_HEAD(&bp->b_list); - INIT_LIST_HEAD(&bp->b_li_list); - sema_init(&bp->b_sema, 0); /* held, no waiters */ - spin_lock_init(&bp->b_lock); - bp->b_target = target; - bp->b_mount = target->bt_mount; - bp->b_flags = flags; - - /* - * Set length and io_length to the same value initially. - * I/O routines should use io_length, which will be the same in - * most cases but may be reset (e.g. XFS recovery). 
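With the page array gone, a buffer's backing memory above is a single b_addr pointer, and the free side recovers the right allocator from the address itself plus the _XBF_KMEM flag. Condensed into a sketch:

	static void free_backing_mem(void *addr, bool is_kmem)
	{
		if (is_vmalloc_addr(addr))
			vfree(addr);			/* vmalloc() fallback buffers */
		else if (is_kmem)
			kfree(addr);			/* sub-page kmalloc buffers */
		else
			folio_put(virt_to_folio(addr));	/* single-folio buffers */
	}

Note also that the struct xfs_buf itself still goes through call_rcu(), which is what keeps lockless rhashtable lookups safe against a racing free.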
- */ - error = xfs_buf_get_maps(bp, nmaps); - if (error) { - kmem_cache_free(xfs_buf_cache, bp); - return error; - } - - bp->b_rhash_key = map[0].bm_bn; - bp->b_length = 0; - for (i = 0; i < nmaps; i++) { - bp->b_maps[i].bm_bn = map[i].bm_bn; - bp->b_maps[i].bm_len = map[i].bm_len; - bp->b_length += map[i].bm_len; - } - - atomic_set(&bp->b_pin_count, 0); - init_waitqueue_head(&bp->b_waiters); - - XFS_STATS_INC(bp->b_mount, xb_create); - trace_xfs_buf_init(bp, _RET_IP_); - - *bpp = bp; - return 0; -} - -static void -xfs_buf_free_pages( - struct xfs_buf *bp) -{ - uint i; - - ASSERT(bp->b_flags & _XBF_PAGES); - - if (xfs_buf_is_vmapped(bp)) - vm_unmap_ram(bp->b_addr, bp->b_page_count); - - for (i = 0; i < bp->b_page_count; i++) { - if (bp->b_pages[i]) - __free_page(bp->b_pages[i]); - } - mm_account_reclaimed_pages(bp->b_page_count); - - if (bp->b_pages != bp->b_page_array) - kfree(bp->b_pages); - bp->b_pages = NULL; - bp->b_flags &= ~_XBF_PAGES; -} - static void xfs_buf_free_callback( struct callback_head *cb) { struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); - xfs_buf_free_maps(bp); + if (bp->b_maps != &bp->__b_map) + kfree(bp->b_maps); kmem_cache_free(xfs_buf_cache, bp); } @@ -315,154 +103,219 @@ static void xfs_buf_free( struct xfs_buf *bp) { + unsigned int size = BBTOB(bp->b_length); + + might_sleep(); trace_xfs_buf_free(bp, _RET_IP_); ASSERT(list_empty(&bp->b_lru)); - if (xfs_buftarg_is_mem(bp->b_target)) - xmbuf_unmap_page(bp); - else if (bp->b_flags & _XBF_PAGES) - xfs_buf_free_pages(bp); + if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE) + mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT)); + + if (is_vmalloc_addr(bp->b_addr)) + vfree(bp->b_addr); else if (bp->b_flags & _XBF_KMEM) kfree(bp->b_addr); + else + folio_put(virt_to_folio(bp->b_addr)); call_rcu(&bp->b_rcu, xfs_buf_free_callback); } static int xfs_buf_alloc_kmem( - struct xfs_buf *bp, - xfs_buf_flags_t flags) + struct xfs_buf *bp, + size_t size, + gfp_t gfp_mask) { - gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL; - size_t size = BBTOB(bp->b_length); - - /* Assure zeroed buffer for non-read cases. */ - if (!(flags & XBF_READ)) - gfp_mask |= __GFP_ZERO; + ASSERT(is_power_of_2(size)); + ASSERT(size < PAGE_SIZE); - bp->b_addr = kmalloc(size, gfp_mask); + bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL); if (!bp->b_addr) return -ENOMEM; - if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != - ((unsigned long)bp->b_addr & PAGE_MASK)) { - /* b_addr spans two pages - use alloc_page instead */ + /* + * Slab guarantees that we get back naturally aligned allocations for + * power of two sizes. Keep this check as the canary in the coal mine + * if anything changes in slab. + */ + if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) { kfree(bp->b_addr); bp->b_addr = NULL; return -ENOMEM; } - bp->b_offset = offset_in_page(bp->b_addr); - bp->b_pages = bp->b_page_array; - bp->b_pages[0] = kmem_to_page(bp->b_addr); - bp->b_page_count = 1; bp->b_flags |= _XBF_KMEM; + trace_xfs_buf_backing_kmem(bp, _RET_IP_); return 0; } +/* + * Allocate backing memory for a buffer. + * + * For tmpfs-backed buffers used by in-memory btrees this directly maps the + * tmpfs page cache folios. + * + * For real file system buffers there are three different kinds of backing memory: + * + * The first type backs the buffer by a kmalloc allocation. This is done for + * less than PAGE_SIZE allocations to avoid wasting memory.
+ * + * The second type is a single folio buffer - this may be a high order folio or + * just a single page sized folio, but either way they get treated the same way + * by the rest of the code - the buffer memory spans a single contiguous memory + * region that we don't have to map and unmap to access the data directly. + * + * The third type of buffer is the vmalloc()d buffer. This provides the buffer + * with the required contiguous memory region but backed by discontiguous + * physical pages. + */ static int -xfs_buf_alloc_pages( +xfs_buf_alloc_backing_mem( struct xfs_buf *bp, xfs_buf_flags_t flags) { + size_t size = BBTOB(bp->b_length); gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; - long filled = 0; + struct folio *folio; - if (flags & XBF_READ_AHEAD) - gfp_mask |= __GFP_NORETRY; - - /* Make sure that we have a page list */ - bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); - if (bp->b_page_count <= XB_PAGES) { - bp->b_pages = bp->b_page_array; - } else { - bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count, - gfp_mask); - if (!bp->b_pages) - return -ENOMEM; - } - bp->b_flags |= _XBF_PAGES; + if (xfs_buftarg_is_mem(bp->b_target)) + return xmbuf_map_backing_mem(bp); /* Assure zeroed buffer for non-read cases. */ if (!(flags & XBF_READ)) gfp_mask |= __GFP_ZERO; + if (flags & XBF_READ_AHEAD) + gfp_mask |= __GFP_NORETRY; + /* - * Bulk filling of pages can take multiple calls. Not filling the entire - * array is not an allocation failure, so don't back off if we get at - * least one extra page. + * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that + * is properly aligned. The slab allocator now guarantees an aligned + * allocation for all power of two sizes, which matches most of the + * smaller than PAGE_SIZE buffers used by XFS. */ - for (;;) { - long last = filled; - - filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count, - bp->b_pages); - if (filled == bp->b_page_count) { - XFS_STATS_INC(bp->b_mount, xb_page_found); - break; - } + if (size < PAGE_SIZE && is_power_of_2(size)) + return xfs_buf_alloc_kmem(bp, size, gfp_mask); - if (filled != last) - continue; + /* + * Don't bother with the retry loop for single PAGE allocations: vmalloc + * won't do any better. + */ + if (size <= PAGE_SIZE) + gfp_mask |= __GFP_NOFAIL; - if (flags & XBF_READ_AHEAD) { - xfs_buf_free_pages(bp); + /* + * Optimistically attempt a single high order folio allocation for + * larger than PAGE_SIZE buffers. + * + * Allocating a high order folio makes the assumption that buffers are a + * power-of-2 size, matching the power-of-2 folio sizes available. + * + * The exception here is user xattr data buffers, which can be arbitrarily + * sized up to 64kB plus structure metadata; skip straight to the vmalloc + * path for them instead of wasting memory here.
+ */ + if (size > PAGE_SIZE) { + if (!is_power_of_2(size)) + goto fallback; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + gfp_mask |= __GFP_NORETRY; + } + folio = folio_alloc(gfp_mask, get_order(size)); + if (!folio) { + if (size <= PAGE_SIZE) return -ENOMEM; - } + trace_xfs_buf_backing_fallback(bp, _RET_IP_); + goto fallback; + } + bp->b_addr = folio_address(folio); + trace_xfs_buf_backing_folio(bp, _RET_IP_); + return 0; +fallback: + for (;;) { + bp->b_addr = __vmalloc(size, gfp_mask); + if (bp->b_addr) + break; + if (flags & XBF_READ_AHEAD) + return -ENOMEM; XFS_STATS_INC(bp->b_mount, xb_page_retries); memalloc_retry_wait(gfp_mask); } + + trace_xfs_buf_backing_vmalloc(bp, _RET_IP_); return 0; } -/* - * Map buffer into kernel address-space if necessary. - */ -STATIC int -_xfs_buf_map_pages( - struct xfs_buf *bp, - xfs_buf_flags_t flags) +static int +xfs_buf_alloc( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { - ASSERT(bp->b_flags & _XBF_PAGES); - if (bp->b_page_count == 1) { - /* A single page buffer is always mappable */ - bp->b_addr = page_address(bp->b_pages[0]); - } else if (flags & XBF_UNMAPPED) { - bp->b_addr = NULL; - } else { - int retried = 0; - unsigned nofs_flag; + struct xfs_buf *bp; + int error; + int i; - /* - * vm_map_ram() will allocate auxiliary structures (e.g. - * pagetables) with GFP_KERNEL, yet we often under a scoped nofs - * context here. Mixing GFP_KERNEL with GFP_NOFS allocations - * from the same call site that can be run from both above and - * below memory reclaim causes lockdep false positives. Hence we - * always need to force this allocation to nofs context because - * we can't pass __GFP_NOLOCKDEP down to auxillary structures to - * prevent false positive lockdep reports. - * - * XXX(dgc): I think dquot reclaim is the only place we can get - * to this function from memory reclaim context now. If we fix - * that like we've fixed inode reclaim to avoid writeback from - * reclaim, this nofs wrapping can go away. - */ - nofs_flag = memalloc_nofs_save(); - do { - bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1); - if (bp->b_addr) - break; - vm_unmap_aliases(); - } while (retried++ <= 1); - memalloc_nofs_restore(nofs_flag); - - if (!bp->b_addr) - return -ENOMEM; + *bpp = NULL; + bp = kmem_cache_zalloc(xfs_buf_cache, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); + + /* + * We don't want certain flags to appear in b_flags unless they are + * specifically set by later operations on the buffer. + */ + flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); + + /* + * A new buffer is held and locked by the owner. This ensures that the + * buffer is owned by the caller and racing RCU lookups right after + * inserting into the hash table are safe (and will have to wait for + * the unlock to do anything non-trivial). 
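Putting the pieces of xfs_buf_alloc_backing_mem() together, the allocation policy above can be restated as a hypothetical demo_alloc() helper; this is an illustration of the decision tree, not the kernel function:

	static void *demo_alloc(size_t size, gfp_t gfp)
	{
		if (size < PAGE_SIZE && is_power_of_2(size))
			return kmalloc(size, gfp);	/* slab gives natural alignment */

		if (is_power_of_2(size)) {
			struct folio *folio;

			/* opportunistic: no retries, fall through on failure */
			folio = folio_alloc(gfp | __GFP_NORETRY, get_order(size));
			if (folio)
				return folio_address(folio);
		}

		/* odd sizes (e.g. large xattr buffers) and folio failures */
		return __vmalloc(size, gfp);
	}

For example, a 2k directory block takes the kmalloc path, a 64k log buffer tries one order-4 folio (on 4k pages), and a 68k xattr buffer goes straight to vmalloc because it is not a power of two.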
+ */ + bp->b_hold = 1; + sema_init(&bp->b_sema, 0); /* held, no waiters */ + + spin_lock_init(&bp->b_lock); + atomic_set(&bp->b_lru_ref, 1); + init_completion(&bp->b_iowait); + INIT_LIST_HEAD(&bp->b_lru); + INIT_LIST_HEAD(&bp->b_list); + INIT_LIST_HEAD(&bp->b_li_list); + bp->b_target = target; + bp->b_mount = target->bt_mount; + bp->b_flags = flags; + bp->b_rhash_key = map[0].bm_bn; + bp->b_length = 0; + bp->b_map_count = nmaps; + if (nmaps == 1) + bp->b_maps = &bp->__b_map; + else + bp->b_maps = kcalloc(nmaps, sizeof(struct xfs_buf_map), + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); + for (i = 0; i < nmaps; i++) { + bp->b_maps[i].bm_bn = map[i].bm_bn; + bp->b_maps[i].bm_len = map[i].bm_len; + bp->b_length += map[i].bm_len; } + atomic_set(&bp->b_pin_count, 0); + init_waitqueue_head(&bp->b_waiters); + + XFS_STATS_INC(bp->b_mount, xb_create); + trace_xfs_buf_init(bp, _RET_IP_); + + error = xfs_buf_alloc_backing_mem(bp, flags); + if (error) { + xfs_buf_free(bp); + return error; + } + + *bpp = bp; return 0; } @@ -519,7 +372,6 @@ int xfs_buf_cache_init( struct xfs_buf_cache *bch) { - spin_lock_init(&bch->bc_lock); return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); } @@ -582,12 +434,26 @@ xfs_buf_find_lock( return -ENOENT; } ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_flags &= _XBF_KMEM; bp->b_ops = NULL; } return 0; } +static bool +xfs_buf_try_hold( + struct xfs_buf *bp) +{ + spin_lock(&bp->b_lock); + if (bp->b_hold == 0) { + spin_unlock(&bp->b_lock); + return false; + } + bp->b_hold++; + spin_unlock(&bp->b_lock); + return true; +} + static inline int xfs_buf_lookup( struct xfs_buf_cache *bch, @@ -600,7 +466,7 @@ xfs_buf_lookup( rcu_read_lock(); bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params); - if (!bp || !atomic_inc_not_zero(&bp->b_hold)) { + if (!bp || !xfs_buf_try_hold(bp)) { rcu_read_unlock(); return -ENOENT; } @@ -636,37 +502,24 @@ xfs_buf_find_insert( struct xfs_buf *bp; int error; - error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); + error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); if (error) goto out_drop_pag; - if (xfs_buftarg_is_mem(new_bp->b_target)) { - error = xmbuf_map_page(new_bp); - } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE || - xfs_buf_alloc_kmem(new_bp, flags) < 0) { - /* - * For buffers that fit entirely within a single page, first - * attempt to allocate the memory from the heap to minimise - * memory usage. If we can't get heap memory for these small - * buffers, we fall back to using the page allocator. - */ - error = xfs_buf_alloc_pages(new_bp, flags); - } - if (error) - goto out_free_buf; + /* The new buffer keeps the perag reference until it is freed. */ + new_bp->b_pag = pag; - spin_lock(&bch->bc_lock); + rcu_read_lock(); bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); if (IS_ERR(bp)) { + rcu_read_unlock(); error = PTR_ERR(bp); - spin_unlock(&bch->bc_lock); goto out_free_buf; } - if (bp) { + if (bp && xfs_buf_try_hold(bp)) { /* found an existing buffer */ - atomic_inc(&bp->b_hold); - spin_unlock(&bch->bc_lock); + rcu_read_unlock(); error = xfs_buf_find_lock(bp, flags); if (error) xfs_buf_rele(bp); @@ -674,10 +527,8 @@ xfs_buf_find_insert( *bpp = bp; goto out_free_buf; } + rcu_read_unlock(); - /* The new buffer keeps the perag reference until it is freed. 
*/ - new_bp->b_pag = pag; - spin_unlock(&bch->bc_lock); *bpp = new_bp; return 0; @@ -765,18 +616,6 @@ xfs_buf_get_map( xfs_perag_put(pag); } - /* We do not hold a perag reference anymore. */ - if (!bp->b_addr) { - error = _xfs_buf_map_pages(bp, flags); - if (unlikely(error)) { - xfs_warn_ratelimited(btp->bt_mount, - "%s: failed to map %u pages", __func__, - bp->b_page_count); - xfs_buf_relse(bp); - return error; - } - } - /* * Clear b_error if this is a lookup from a caller that doesn't expect * valid data to be found in the buffer. @@ -797,16 +636,14 @@ out_put_perag: int _xfs_buf_read( - struct xfs_buf *bp, - xfs_buf_flags_t flags) + struct xfs_buf *bp) { - ASSERT(!(flags & XBF_WRITE)); ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); - bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); - - return xfs_buf_submit(bp); + bp->b_flags |= XBF_READ; + xfs_buf_submit(bp); + return xfs_buf_iowait(bp); } /* @@ -857,6 +694,8 @@ xfs_buf_read_map( struct xfs_buf *bp; int error; + ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD))); + flags |= XBF_READ; *bpp = NULL; @@ -870,21 +709,11 @@ xfs_buf_read_map( /* Initiate the buffer read and wait. */ XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; - error = _xfs_buf_read(bp, flags); - - /* Readahead iodone already dropped the buffer, so exit. */ - if (flags & XBF_ASYNC) - return 0; + error = _xfs_buf_read(bp); } else { /* Buffer already read; all we need to do is check it. */ error = xfs_buf_reverify(bp, ops); - /* Readahead already finished; drop the buffer and exit. */ - if (flags & XBF_ASYNC) { - xfs_buf_relse(bp); - return 0; - } - /* We do not want read in the flags */ bp->b_flags &= ~XBF_READ; ASSERT(bp->b_ops != NULL || ops == NULL); @@ -936,6 +765,7 @@ xfs_buf_readahead_map( int nmaps, const struct xfs_buf_ops *ops) { + const xfs_buf_flags_t flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD; struct xfs_buf *bp; /* @@ -945,9 +775,21 @@ xfs_buf_readahead_map( if (xfs_buftarg_is_mem(target)) return; - xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, - __this_address); + if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp)) + return; + trace_xfs_buf_readahead(bp, 0, _RET_IP_); + + if (bp->b_flags & XBF_DONE) { + xfs_buf_reverify(bp, ops); + xfs_buf_relse(bp); + return; + } + XFS_STATS_INC(target->bt_mount, xb_get_read); + bp->b_ops = ops; + bp->b_flags &= ~(XBF_WRITE | XBF_DONE); + bp->b_flags |= flags; + percpu_counter_inc(&target->bt_readahead_count); + xfs_buf_submit(bp); } /* @@ -961,7 +803,6 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { @@ -970,7 +811,7 @@ xfs_buf_read_uncached( *bpp = NULL; - error = xfs_buf_get_uncached(target, numblks, flags, &bp); + error = xfs_buf_get_uncached(target, numblks, &bp); if (error) return error; @@ -982,8 +823,8 @@ xfs_buf_read_uncached( bp->b_ops = ops; xfs_buf_submit(bp); - if (bp->b_error) { - error = bp->b_error; + error = xfs_buf_iowait(bp); + if (error) { xfs_buf_relse(bp); return error; } @@ -996,40 +837,14 @@ int xfs_buf_get_uncached( struct xfs_buftarg *target, size_t numblks, - xfs_buf_flags_t flags, struct xfs_buf **bpp) { int error; - struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); - *bpp = NULL; - - /* flags might contain irrelevant bits, pass only what we care about */ - error = 
_xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp); - if (error) - return error; - - if (xfs_buftarg_is_mem(bp->b_target)) - error = xmbuf_map_page(bp); - else - error = xfs_buf_alloc_pages(bp, flags); - if (error) - goto fail_free_buf; - - error = _xfs_buf_map_pages(bp, 0); - if (unlikely(error)) { - xfs_warn(target->bt_mount, - "%s: failed to map pages", __func__); - goto fail_free_buf; - } - - trace_xfs_buf_get_uncached(bp, _RET_IP_); - *bpp = bp; - return 0; - -fail_free_buf: - xfs_buf_free(bp); + error = xfs_buf_alloc(target, &map, 1, 0, bpp); + if (!error) + trace_xfs_buf_get_uncached(*bpp, _RET_IP_); return error; } @@ -1043,7 +858,10 @@ xfs_buf_hold( struct xfs_buf *bp) { trace_xfs_buf_hold(bp, _RET_IP_); - atomic_inc(&bp->b_hold); + + spin_lock(&bp->b_lock); + bp->b_hold++; + spin_unlock(&bp->b_lock); } static void @@ -1051,10 +869,14 @@ xfs_buf_rele_uncached( struct xfs_buf *bp) { ASSERT(list_empty(&bp->b_lru)); - if (atomic_dec_and_test(&bp->b_hold)) { - xfs_buf_ioacct_dec(bp); - xfs_buf_free(bp); + + spin_lock(&bp->b_lock); + if (--bp->b_hold) { + spin_unlock(&bp->b_lock); + return; } + spin_unlock(&bp->b_lock); + xfs_buf_free(bp); } static void @@ -1064,51 +886,30 @@ xfs_buf_rele_cached( struct xfs_buftarg *btp = bp->b_target; struct xfs_perag *pag = bp->b_pag; struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag); - bool release; bool freebuf = false; trace_xfs_buf_rele(bp, _RET_IP_); - ASSERT(atomic_read(&bp->b_hold) > 0); - - /* - * We grab the b_lock here first to serialise racing xfs_buf_rele() - * calls. The pag_buf_lock being taken on the last reference only - * serialises against racing lookups in xfs_buf_find(). IOWs, the second - * to last reference we drop here is not serialised against the last - * reference until we take bp->b_lock. Hence if we don't grab b_lock - * first, the last "release" reference can win the race to the lock and - * free the buffer before the second-to-last reference is processed, - * leading to a use-after-free scenario. - */ spin_lock(&bp->b_lock); - release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock); - if (!release) { - /* - * Drop the in-flight state if the buffer is already on the LRU - * and it holds the only reference. This is racy because we - * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT - * ensures the decrement occurs only once per-buf. - */ - if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) - __xfs_buf_ioacct_dec(bp); + ASSERT(bp->b_hold >= 1); + if (bp->b_hold > 1) { + bp->b_hold--; goto out_unlock; } - /* the last reference has been dropped ... */ - __xfs_buf_ioacct_dec(bp); - if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* we are asked to drop the last reference */ + if (atomic_read(&bp->b_lru_ref)) { /* - * If the buffer is added to the LRU take a new reference to the + * If the buffer is added to the LRU, keep the reference to the * buffer for the LRU and clear the (now stale) dispose list - * state flag + * state flag, else drop the reference. 
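b_hold is now a plain integer serialized by b_lock, so a cache lookup that races with the final release needs a try-hold that refuses to resurrect a buffer already bound for freeing. The idiom in isolation, mirroring xfs_buf_try_hold() above:

	static bool try_hold(struct xfs_buf *bp)
	{
		bool ret = true;

		spin_lock(&bp->b_lock);
		if (bp->b_hold == 0)
			ret = false;	/* final put already ran; lookup must retry */
		else
			bp->b_hold++;
		spin_unlock(&bp->b_lock);
		return ret;
	}

Compared with the old atomic_inc_not_zero() scheme, holding b_lock lets the release side decide "last reference: LRU or free" in one atomic step, which is exactly what the reworked xfs_buf_rele_cached() relies on.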
*/ - if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) { + if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) bp->b_state &= ~XFS_BSTATE_DISPOSE; - atomic_inc(&bp->b_hold); - } - spin_unlock(&bch->bc_lock); + else + bp->b_hold--; } else { + bp->b_hold--; /* * most of the time buffers will already be removed from the * LRU, so optimise that case by checking for the @@ -1124,7 +925,6 @@ xfs_buf_rele_cached( ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head, xfs_buf_hash_params); - spin_unlock(&bch->bc_lock); if (pag) xfs_perag_put(pag); freebuf = true; @@ -1291,6 +1091,7 @@ xfs_buf_ioend_handle_error( { struct xfs_mount *mp = bp->b_mount; struct xfs_error_cfg *cfg; + struct xfs_log_item *lip; /* * If we've already shutdown the journal because of I/O errors, there's @@ -1338,12 +1139,11 @@ xfs_buf_ioend_handle_error( } /* Still considered a transient error. Caller will schedule retries. */ - if (bp->b_flags & _XBF_INODES) - xfs_buf_inode_io_fail(bp); - else if (bp->b_flags & _XBF_DQUOTS) - xfs_buf_dquot_io_fail(bp); - else - ASSERT(list_empty(&bp->b_li_list)); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { + set_bit(XFS_LI_FAILED, &lip->li_flags); + clear_bit(XFS_LI_FLUSHING, &lip->li_flags); + } + xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return true; @@ -1351,6 +1151,7 @@ xfs_buf_ioend_handle_error( resubmit: xfs_buf_ioerror(bp, 0); bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL); + reinit_completion(&bp->b_iowait); xfs_buf_submit(bp); return true; out_stale: @@ -1361,24 +1162,23 @@ out_stale: return false; } -static void -xfs_buf_ioend( +/* returns false if the caller needs to resubmit the I/O, else true */ +static bool +__xfs_buf_ioend( struct xfs_buf *bp) { trace_xfs_buf_iodone(bp, _RET_IP_); - /* - * Pull in IO completion errors now. We are guaranteed to be running - * single threaded, so we don't need the lock to read b_io_error. 
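The transient-error branch above no longer dispatches on _XBF_INODES or _XBF_DQUOTS buffer types; it marks every attached log item failed directly. A rough standalone model of that loop (hypothetical log_item type; plain bools in place of the XFS_LI_* bit flags):

#include <stdbool.h>

struct log_item {
	bool	failed;		/* models XFS_LI_FAILED */
	bool	flushing;	/* models XFS_LI_FLUSHING */
};

/* Mark every log item attached to a failed buffer for later retry. */
static void buf_io_fail(struct log_item **items, int nr_items)
{
	for (int i = 0; i < nr_items; i++) {
		items[i]->failed = true;
		items[i]->flushing = false;
	}
}

The real code walks the intrusive b_li_list rather than an array, but the effect is the same: failure state lives in the log items themselves, not in per-type buffer flags.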
- */ - if (!bp->b_error && bp->b_io_error) - xfs_buf_ioerror(bp, bp->b_io_error); - if (bp->b_flags & XBF_READ) { + if (!bp->b_error && is_vmalloc_addr(bp->b_addr)) + invalidate_kernel_vmap_range(bp->b_addr, + roundup(BBTOB(bp->b_length), PAGE_SIZE)); if (!bp->b_error && bp->b_ops) bp->b_ops->verify_read(bp); if (!bp->b_error) bp->b_flags |= XBF_DONE; + if (bp->b_flags & XBF_READ_AHEAD) + percpu_counter_dec(&bp->b_target->bt_readahead_count); } else { if (!bp->b_error) { bp->b_flags &= ~XBF_WRITE_FAIL; @@ -1386,7 +1186,7 @@ xfs_buf_ioend( } if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp)) - return; + return false; /* clear the retry state */ bp->b_last_error = 0; @@ -1401,16 +1201,21 @@ xfs_buf_ioend( if (bp->b_log_item) xfs_buf_item_done(bp); - if (bp->b_flags & _XBF_INODES) - xfs_buf_inode_iodone(bp); - else if (bp->b_flags & _XBF_DQUOTS) - xfs_buf_dquot_iodone(bp); - + if (bp->b_iodone) + bp->b_iodone(bp); } bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | _XBF_LOGRECOVERY); + return true; +} +static void +xfs_buf_ioend( + struct xfs_buf *bp) +{ + if (!__xfs_buf_ioend(bp)) + return; if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); else @@ -1424,15 +1229,8 @@ xfs_buf_ioend_work( struct xfs_buf *bp = container_of(work, struct xfs_buf, b_ioend_work); - xfs_buf_ioend(bp); -} - -static void -xfs_buf_ioend_async( - struct xfs_buf *bp) -{ - INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); - queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); + if (__xfs_buf_ioend(bp)) + xfs_buf_relse(bp); } void @@ -1485,7 +1283,8 @@ xfs_bwrite( bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_DONE); - error = xfs_buf_submit(bp); + xfs_buf_submit(bp); + error = xfs_buf_iowait(bp); if (error) xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); return error; @@ -1495,188 +1294,77 @@ static void xfs_buf_bio_end_io( struct bio *bio) { - struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private; + struct xfs_buf *bp = bio->bi_private; - if (!bio->bi_status && - (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && - XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) - bio->bi_status = BLK_STS_IOERR; - - /* - * don't overwrite existing errors - otherwise we can lose errors on - * buffers that require multiple bios to complete. - */ - if (bio->bi_status) { - int error = blk_status_to_errno(bio->bi_status); - - cmpxchg(&bp->b_io_error, 0, error); - } - - if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) - invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); - - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) - xfs_buf_ioend_async(bp); - bio_put(bio); -} - -static void -xfs_buf_ioapply_map( - struct xfs_buf *bp, - int map, - int *buf_offset, - int *count, - blk_opf_t op) -{ - int page_index; - unsigned int total_nr_pages = bp->b_page_count; - int nr_pages; - struct bio *bio; - sector_t sector = bp->b_maps[map].bm_bn; - int size; - int offset; - - /* skip the pages in the buffer before the start offset */ - page_index = 0; - offset = *buf_offset; - while (offset >= PAGE_SIZE) { - page_index++; - offset -= PAGE_SIZE; - } - - /* - * Limit the IO size to the length of the current vector, and update the - * remaining IO count for the next time around. 
- */ - size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count); - *count -= size; - *buf_offset += size; - -next_chunk: - atomic_inc(&bp->b_io_remaining); - nr_pages = bio_max_segs(total_nr_pages); - - bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO); - bio->bi_iter.bi_sector = sector; - bio->bi_end_io = xfs_buf_bio_end_io; - bio->bi_private = bp; - - for (; size && nr_pages; nr_pages--, page_index++) { - int rbytes, nbytes = PAGE_SIZE - offset; - - if (nbytes > size) - nbytes = size; - - rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes, - offset); - if (rbytes < nbytes) - break; - - offset = 0; - sector += BTOBB(nbytes); - size -= nbytes; - total_nr_pages--; - } + if (bio->bi_status) + xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); + else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && + XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) + xfs_buf_ioerror(bp, -EIO); - if (likely(bio->bi_iter.bi_size)) { - if (xfs_buf_is_vmapped(bp)) { - flush_kernel_vmap_range(bp->b_addr, - xfs_buf_vmap_len(bp)); - } - submit_bio(bio); - if (size) - goto next_chunk; + if (bp->b_flags & XBF_ASYNC) { + INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); + queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); } else { - /* - * This is guaranteed not to be the last io reference count - * because the caller (xfs_buf_submit) holds a count itself. - */ - atomic_dec(&bp->b_io_remaining); - xfs_buf_ioerror(bp, -EIO); - bio_put(bio); + complete(&bp->b_iowait); } + bio_put(bio); } -STATIC void -_xfs_buf_ioapply( - struct xfs_buf *bp) +static inline blk_opf_t +xfs_buf_bio_op( + struct xfs_buf *bp) { - struct blk_plug plug; - blk_opf_t op; - int offset; - int size; - int i; - - /* - * Make sure we capture only current IO errors rather than stale errors - * left over from previous use of the buffer (e.g. failed readahead). - */ - bp->b_error = 0; + blk_opf_t op; if (bp->b_flags & XBF_WRITE) { op = REQ_OP_WRITE; - - /* - * Run the write verifier callback function if it exists. If - * this function fails it will mark the buffer with an error and - * the IO should not be dispatched. - */ - if (bp->b_ops) { - bp->b_ops->verify_write(bp); - if (bp->b_error) { - xfs_force_shutdown(bp->b_mount, - SHUTDOWN_CORRUPT_INCORE); - return; - } - } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { - struct xfs_mount *mp = bp->b_mount; - - /* - * non-crc filesystems don't attach verifiers during - * log recovery, so don't warn for such filesystems. - */ - if (xfs_has_crc(mp)) { - xfs_warn(mp, - "%s: no buf ops on daddr 0x%llx len %d", - __func__, xfs_buf_daddr(bp), - bp->b_length); - xfs_hex_dump(bp->b_addr, - XFS_CORRUPTION_DUMP_LEN); - dump_stack(); - } - } } else { op = REQ_OP_READ; if (bp->b_flags & XBF_READ_AHEAD) op |= REQ_RAHEAD; } - /* we only use the buffer cache for meta-data */ - op |= REQ_META; + return op | REQ_META; +} - /* in-memory targets are directly mapped, no IO required. 
*/ - if (xfs_buftarg_is_mem(bp->b_target)) { - xfs_buf_ioend(bp); - return; - } +static void +xfs_buf_submit_bio( + struct xfs_buf *bp) +{ + unsigned int len = BBTOB(bp->b_length); + unsigned int nr_vecs = bio_add_max_vecs(bp->b_addr, len); + unsigned int map = 0; + struct blk_plug plug; + struct bio *bio; + + bio = bio_alloc(bp->b_target->bt_bdev, nr_vecs, xfs_buf_bio_op(bp), + GFP_NOIO); + if (is_vmalloc_addr(bp->b_addr)) + bio_add_vmalloc(bio, bp->b_addr, len); + else + bio_add_virt_nofail(bio, bp->b_addr, len); + bio->bi_private = bp; + bio->bi_end_io = xfs_buf_bio_end_io; /* - * Walk all the vectors issuing IO on them. Set up the initial offset - * into the buffer and the desired IO size before we start - - * _xfs_buf_ioapply_vec() will modify them appropriately for each - * subsequent call. + * If there is more than one map segment, split out a new bio for each + * map except of the last one. The last map is handled by the + * remainder of the original bio outside the loop. */ - offset = bp->b_offset; - size = BBTOB(bp->b_length); blk_start_plug(&plug); - for (i = 0; i < bp->b_map_count; i++) { - xfs_buf_ioapply_map(bp, i, &offset, &size, op); - if (bp->b_error) - break; - if (size <= 0) - break; /* all done */ + for (map = 0; map < bp->b_map_count - 1; map++) { + struct bio *split; + + split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS, + &fs_bio_set); + split->bi_iter.bi_sector = bp->b_maps[map].bm_bn; + bio_chain(split, bio); + submit_bio(split); } + bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn; + submit_bio(bio); blk_finish_plug(&plug); } @@ -1689,26 +1377,55 @@ xfs_buf_iowait( { ASSERT(!(bp->b_flags & XBF_ASYNC)); - trace_xfs_buf_iowait(bp, _RET_IP_); - wait_for_completion(&bp->b_iowait); - trace_xfs_buf_iowait_done(bp, _RET_IP_); + do { + trace_xfs_buf_iowait(bp, _RET_IP_); + wait_for_completion(&bp->b_iowait); + trace_xfs_buf_iowait_done(bp, _RET_IP_); + } while (!__xfs_buf_ioend(bp)); return bp->b_error; } /* + * Run the write verifier callback function if it exists. If this fails, mark + * the buffer with an error and do not dispatch the I/O. + */ +static bool +xfs_buf_verify_write( + struct xfs_buf *bp) +{ + if (bp->b_ops) { + bp->b_ops->verify_write(bp); + if (bp->b_error) + return false; + } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { + /* + * Non-crc filesystems don't attach verifiers during log + * recovery, so don't warn for such filesystems. + */ + if (xfs_has_crc(bp->b_mount)) { + xfs_warn(bp->b_mount, + "%s: no buf ops on daddr 0x%llx len %d", + __func__, xfs_buf_daddr(bp), + bp->b_length); + xfs_hex_dump(bp->b_addr, XFS_CORRUPTION_DUMP_LEN); + dump_stack(); + } + } + + return true; +} + +/* * Buffer I/O submission path, read or write. Asynchronous submission transfers * the buffer lock ownership and the current reference to the IO. It is not * safe to reference the buffer after a call to this function unless the caller * holds an additional reference itself. */ -static int -__xfs_buf_submit( - struct xfs_buf *bp, - bool wait) +static void +xfs_buf_submit( + struct xfs_buf *bp) { - int error = 0; - trace_xfs_buf_submit(bp, _RET_IP_); ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); @@ -1728,98 +1445,33 @@ __xfs_buf_submit( * state here rather than mount state to avoid corrupting the log tail * on shutdown. 
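xfs_buf_submit_bio() above builds one bio over the whole virtually contiguous buffer and splits a chunk off per map, so several discontiguous disk ranges are fed from one flat memory range. A userspace sketch of the same layout idea, issuing one pwrite() per map instead of chained bios (buf_map and submit_maps are made-up names, and lengths are in bytes here rather than basic blocks):

#include <unistd.h>

struct buf_map {
	off_t	daddr;		/* disk offset of this map, in bytes */
	size_t	len;		/* length of this map, in bytes */
};

/* Write one contiguous memory buffer to several disk ranges. */
static int submit_maps(int fd, const char *addr,
		const struct buf_map *maps, int map_count)
{
	for (int i = 0; i < map_count; i++) {
		if (pwrite(fd, addr, maps[i].len, maps[i].daddr) !=
		    (ssize_t)maps[i].len)
			return -1;
		addr += maps[i].len;	/* memory side stays contiguous */
	}
	return 0;
}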
*/ - if (bp->b_mount->m_log && - xlog_is_shutdown(bp->b_mount->m_log)) { + if (bp->b_mount->m_log && xlog_is_shutdown(bp->b_mount->m_log)) { xfs_buf_ioend_fail(bp); - return -EIO; + return; } - /* - * Grab a reference so the buffer does not go away underneath us. For - * async buffers, I/O completion drops the callers reference, which - * could occur before submission returns. - */ - xfs_buf_hold(bp); - if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); - /* clear the internal error state to avoid spurious errors */ - bp->b_io_error = 0; - /* - * Set the count to 1 initially, this will stop an I/O completion - * callout which happens before we have started all the I/O from calling - * xfs_buf_ioend too early. + * Make sure we capture only current IO errors rather than stale errors + * left over from previous use of the buffer (e.g. failed readahead). */ - atomic_set(&bp->b_io_remaining, 1); - if (bp->b_flags & XBF_ASYNC) - xfs_buf_ioacct_inc(bp); - _xfs_buf_ioapply(bp); + bp->b_error = 0; - /* - * If _xfs_buf_ioapply failed, we can get back here with only the IO - * reference we took above. If we drop it to zero, run completion so - * that we don't return to the caller with completion still pending. - */ - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { - if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) - xfs_buf_ioend(bp); - else - xfs_buf_ioend_async(bp); + if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) { + xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE); + xfs_buf_ioend(bp); + return; } - if (wait) - error = xfs_buf_iowait(bp); - - /* - * Release the hold that keeps the buffer referenced for the entire - * I/O. Note that if the buffer is async, it is not safe to reference - * after this release. - */ - xfs_buf_rele(bp); - return error; -} - -void * -xfs_buf_offset( - struct xfs_buf *bp, - size_t offset) -{ - struct page *page; - - if (bp->b_addr) - return bp->b_addr + offset; - - page = bp->b_pages[offset >> PAGE_SHIFT]; - return page_address(page) + (offset & (PAGE_SIZE-1)); -} - -void -xfs_buf_zero( - struct xfs_buf *bp, - size_t boff, - size_t bsize) -{ - size_t bend; - - bend = boff + bsize; - while (boff < bend) { - struct page *page; - int page_index, page_offset, csize; - - page_index = (boff + bp->b_offset) >> PAGE_SHIFT; - page_offset = (boff + bp->b_offset) & ~PAGE_MASK; - page = bp->b_pages[page_index]; - csize = min_t(size_t, PAGE_SIZE - page_offset, - BBTOB(bp->b_length) - boff); - - ASSERT((csize + page_offset) <= PAGE_SIZE); - - memset(page_address(page) + page_offset, 0, csize); - - boff += csize; + /* In-memory targets are directly mapped, no I/O required. */ + if (xfs_buftarg_is_mem(bp->b_target)) { + xfs_buf_ioend(bp); + return; } + + xfs_buf_submit_bio(bp); } /* @@ -1857,20 +1509,20 @@ static enum lru_status xfs_buftarg_drain_rele( struct list_head *item, struct list_lru_one *lru, - spinlock_t *lru_lock, void *arg) { struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); struct list_head *dispose = arg; - if (atomic_read(&bp->b_hold) > 1) { + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + if (bp->b_hold > 1) { /* need to wait, so skip it this pass */ + spin_unlock(&bp->b_lock); trace_xfs_buf_drain_buftarg(bp, _RET_IP_); return LRU_SKIP; } - if (!spin_trylock(&bp->b_lock)) - return LRU_SKIP; /* * clear the LRU reference count so the buffer doesn't get @@ -1891,9 +1543,8 @@ xfs_buftarg_wait( struct xfs_buftarg *btp) { /* - * First wait on the buftarg I/O count for all in-flight buffers to be - * released. 
This is critical as new buffers do not make the LRU until - they are released. + * First wait for all in-flight readahead buffers to be released. This is + * critical as new buffers do not make the LRU until they are released. * * Next, flush the buffer workqueue to ensure all completion processing * has finished. Just waiting on buffer locks is not sufficient for @@ -1902,7 +1553,7 @@ xfs_buftarg_wait( * all reference counts have been dropped before we start walking the * LRU list. */ - while (percpu_counter_sum(&btp->bt_io_count)) + while (percpu_counter_sum(&btp->bt_readahead_count)) delay(100); flush_workqueue(btp->bt_mount->m_buf_workqueue); } @@ -1956,7 +1607,6 @@ static enum lru_status xfs_buftarg_isolate( struct list_head *item, struct list_lru_one *lru, - spinlock_t *lru_lock, void *arg) { struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); @@ -2020,8 +1670,8 @@ xfs_destroy_buftarg( struct xfs_buftarg *btp) { shrinker_free(btp->bt_shrinker); - ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); - percpu_counter_destroy(&btp->bt_io_count); + ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); + percpu_counter_destroy(&btp->bt_readahead_count); list_lru_destroy(&btp->bt_lru); } @@ -2037,23 +1687,65 @@ xfs_free_buftarg( struct xfs_buftarg *btp) { kfree(btp); } +/* + * Configure this buffer target for hardware-assisted atomic writes if the + * underlying block device's atomic write support is congruent with the + * filesystem geometry. + */ +static inline void +xfs_configure_buftarg_atomic_writes( + struct xfs_buftarg *btp) +{ + struct xfs_mount *mp = btp->bt_mount; + unsigned int min_bytes, max_bytes; + + min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev); + max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev); + + /* + * Ignore atomic write geometry that is nonsense or doesn't even cover + * a single fsblock. + */ + if (min_bytes > max_bytes || + min_bytes > mp->m_sb.sb_blocksize || + max_bytes < mp->m_sb.sb_blocksize) { + min_bytes = 0; + max_bytes = 0; + } + + btp->bt_bdev_awu_min = min_bytes; + btp->bt_bdev_awu_max = max_bytes; +} + +/* Configure a buffer target that abstracts a block device. */ int -xfs_setsize_buftarg( +xfs_configure_buftarg( struct xfs_buftarg *btp, unsigned int sectorsize) { + int error; + + ASSERT(btp->bt_bdev != NULL); + /* Set up metadata sector size info */ btp->bt_meta_sectorsize = sectorsize; btp->bt_meta_sectormask = sectorsize - 1; - if (set_blocksize(btp->bt_bdev_file, sectorsize)) { + error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); + if (error) { xfs_warn(btp->bt_mount, - "Cannot set_blocksize to %u on device %pg", - sectorsize, btp->bt_bdev); + "Cannot use blocksize %u on device %pg, err %d", + sectorsize, btp->bt_bdev, error); return -EINVAL; } - return 0; + /* + * Flush the block device pagecache so our bios see anything dirtied + * before mount. 
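With bt_io_count gone, the per-target counter above only tracks speculative readahead, and xfs_buftarg_wait() simply polls it to zero. A compressed model of that accounting, with C11 atomics standing in for the percpu counter (all names illustrative):

#include <stdatomic.h>
#include <unistd.h>

static atomic_long readahead_count;	/* models btp->bt_readahead_count */

static void readahead_start(void) { atomic_fetch_add(&readahead_count, 1); }
static void readahead_done(void)  { atomic_fetch_sub(&readahead_count, 1); }

/* Wait until all in-flight readahead has completed. */
static void buftarg_wait(void)
{
	while (atomic_load(&readahead_count))
		usleep(100 * 1000);	/* delay(100) in the kernel */
}

The increment happens at readahead submission and the decrement at I/O completion, so a zero counter really does mean no speculative I/O remains outstanding.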
+ */ + if (bdev_can_atomic_write(btp->bt_bdev)) + xfs_configure_buftarg_atomic_writes(btp); + + return sync_blockdev(btp->bt_bdev); } int @@ -2075,7 +1767,7 @@ xfs_init_buftarg( if (list_lru_init(&btp->bt_lru)) return -ENOMEM; - if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) + if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL)) goto out_destroy_lru; btp->bt_shrinker = @@ -2089,7 +1781,7 @@ xfs_init_buftarg( return 0; out_destroy_io_count: - percpu_counter_destroy(&btp->bt_io_count); + percpu_counter_destroy(&btp->bt_readahead_count); out_destroy_lru: list_lru_destroy(&btp->bt_lru); return -ENOMEM; @@ -2102,6 +1794,8 @@ xfs_alloc_buftarg( { struct xfs_buftarg *btp; const struct dax_holder_operations *ops = NULL; + int error; + #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) ops = &xfs_dax_holder_operations; @@ -2116,20 +1810,30 @@ xfs_alloc_buftarg( mp, ops); /* + * Flush and invalidate all devices' pagecaches before reading any + * metadata because XFS doesn't use the bdev pagecache. + */ + error = sync_blockdev(btp->bt_bdev); + if (error) + goto error_free; + + /* * When allocating the buftargs we have not yet read the super block and * thus don't know the file system sector size yet. */ - if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev))) - goto error_free; - if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev), - mp->m_super->s_id)) + btp->bt_meta_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_meta_sectormask = btp->bt_meta_sectorsize - 1; + + error = xfs_init_buftarg(btp, btp->bt_meta_sectorsize, + mp->m_super->s_id); + if (error) goto error_free; return btp; error_free: kfree(btp); - return NULL; + return ERR_PTR(error); } static inline void @@ -2203,7 +1907,7 @@ xfs_buf_delwri_queue( */ bp->b_flags |= _XBF_DELWRI_Q; if (list_empty(&bp->b_list)) { - atomic_inc(&bp->b_hold); + xfs_buf_hold(bp); list_add_tail(&bp->b_list, list); } @@ -2261,72 +1965,26 @@ xfs_buf_cmp( return 0; } -/* - * Submit buffers for write. If wait_list is specified, the buffers are - * submitted using sync I/O and placed on the wait list such that the caller can - * iowait each buffer. Otherwise async I/O is used and the buffers are released - * at I/O completion time. In either case, buffers remain locked until I/O - * completes and the buffer is released from the queue. - */ -static int -xfs_buf_delwri_submit_buffers( - struct list_head *buffer_list, - struct list_head *wait_list) +static bool +xfs_buf_delwri_submit_prep( + struct xfs_buf *bp) { - struct xfs_buf *bp, *n; - int pinned = 0; - struct blk_plug plug; - - list_sort(NULL, buffer_list, xfs_buf_cmp); - - blk_start_plug(&plug); - list_for_each_entry_safe(bp, n, buffer_list, b_list) { - if (!wait_list) { - if (!xfs_buf_trylock(bp)) - continue; - if (xfs_buf_ispinned(bp)) { - xfs_buf_unlock(bp); - pinned++; - continue; - } - } else { - xfs_buf_lock(bp); - } - - /* - * Someone else might have written the buffer synchronously or - * marked it stale in the meantime. In that case only the - * _XBF_DELWRI_Q flag got cleared, and we have to drop the - * reference and remove it from the list here. - */ - if (!(bp->b_flags & _XBF_DELWRI_Q)) { - xfs_buf_list_del(bp); - xfs_buf_relse(bp); - continue; - } - - trace_xfs_buf_delwri_split(bp, _RET_IP_); - - /* - * If we have a wait list, each buffer (and associated delwri - * queue reference) transfers to it and is submitted - * synchronously. Otherwise, drop the buffer from the delwri - * queue and submit async. 
- */ - bp->b_flags &= ~_XBF_DELWRI_Q; - bp->b_flags |= XBF_WRITE; - if (wait_list) { - bp->b_flags &= ~XBF_ASYNC; - list_move_tail(&bp->b_list, wait_list); - } else { - bp->b_flags |= XBF_ASYNC; - xfs_buf_list_del(bp); - } - __xfs_buf_submit(bp, false); + /* + * Someone else might have written the buffer synchronously or marked it + * stale in the meantime. In that case only the _XBF_DELWRI_Q flag got + * cleared, and we have to drop the reference and remove it from the + * list here. + */ + if (!(bp->b_flags & _XBF_DELWRI_Q)) { + xfs_buf_list_del(bp); + xfs_buf_relse(bp); + return false; } - blk_finish_plug(&plug); - return pinned; + trace_xfs_buf_delwri_split(bp, _RET_IP_); + bp->b_flags &= ~_XBF_DELWRI_Q; + bp->b_flags |= XBF_WRITE; + return true; } /* @@ -2349,7 +2007,30 @@ int xfs_buf_delwri_submit_nowait( struct list_head *buffer_list) { - return xfs_buf_delwri_submit_buffers(buffer_list, NULL); + struct xfs_buf *bp, *n; + int pinned = 0; + struct blk_plug plug; + + list_sort(NULL, buffer_list, xfs_buf_cmp); + + blk_start_plug(&plug); + list_for_each_entry_safe(bp, n, buffer_list, b_list) { + if (!xfs_buf_trylock(bp)) + continue; + if (xfs_buf_ispinned(bp)) { + xfs_buf_unlock(bp); + pinned++; + continue; + } + if (!xfs_buf_delwri_submit_prep(bp)) + continue; + bp->b_flags |= XBF_ASYNC; + xfs_buf_list_del(bp); + xfs_buf_submit(bp); + } + blk_finish_plug(&plug); + + return pinned; } /* @@ -2366,9 +2047,21 @@ xfs_buf_delwri_submit( { LIST_HEAD (wait_list); int error = 0, error2; - struct xfs_buf *bp; + struct xfs_buf *bp, *n; + struct blk_plug plug; - xfs_buf_delwri_submit_buffers(buffer_list, &wait_list); + list_sort(NULL, buffer_list, xfs_buf_cmp); + + blk_start_plug(&plug); + list_for_each_entry_safe(bp, n, buffer_list, b_list) { + xfs_buf_lock(bp); + if (!xfs_buf_delwri_submit_prep(bp)) + continue; + bp->b_flags &= ~XBF_ASYNC; + list_move_tail(&bp->b_list, &wait_list); + xfs_buf_submit(bp); + } + blk_finish_plug(&plug); /* Wait for IO to complete. */ while (!list_empty(&wait_list)) { @@ -2393,14 +2086,9 @@ xfs_buf_delwri_submit( * Push a single buffer on a delwri queue. * * The purpose of this function is to submit a single buffer of a delwri queue - * and return with the buffer still on the original queue. The waiting delwri - * buffer submission infrastructure guarantees transfer of the delwri queue - * buffer reference to a temporary wait list. We reuse this infrastructure to - * transfer the buffer back to the original queue. + * and return with the buffer still on the original queue. * - * Note the buffer transitions from the queued state, to the submitted and wait - * listed state and back to the queued state during this call. The buffer - * locking and queue management logic between _delwri_pushbuf() and + * The buffer locking and queue management logic between _delwri_pushbuf() and * _delwri_queue() guarantee that the buffer cannot be queued to another list * before returning. */ @@ -2409,33 +2097,21 @@ xfs_buf_delwri_pushbuf( struct xfs_buf *bp, struct list_head *buffer_list) { - LIST_HEAD (submit_list); int error; ASSERT(bp->b_flags & _XBF_DELWRI_Q); trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); - /* - * Isolate the buffer to a new local list so we can submit it for I/O - * independently from the rest of the original list. - */ xfs_buf_lock(bp); - list_move(&bp->b_list, &submit_list); - xfs_buf_unlock(bp); - - /* - * Delwri submission clears the DELWRI_Q buffer flag and returns with - * the buffer on the wait list with the original reference. 
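The shared xfs_buf_delwri_submit_buffers() helper is gone; the nowait path above now does its own walk, skipping contended and pinned buffers. A standalone sketch of that policy with one pthread mutex per buffer (dbuf, submit_nowait and submit_async are invented names):

#include <pthread.h>
#include <stdbool.h>

struct dbuf {
	pthread_mutex_t	lock;
	bool		pinned;	/* still pinned in the log by a transaction */
};

static void submit_async(struct dbuf *bp)
{
	/* queue the async write; in this stub, completion just unlocks */
	pthread_mutex_unlock(&bp->lock);
}

/* Best-effort submission: returns how many pinned buffers were skipped. */
static int submit_nowait(struct dbuf **bufs, int nr)
{
	int pinned = 0;

	for (int i = 0; i < nr; i++) {
		if (pthread_mutex_trylock(&bufs[i]->lock))
			continue;		/* contended: leave it queued */
		if (bufs[i]->pinned) {
			pthread_mutex_unlock(&bufs[i]->lock);
			pinned++;		/* caller forces the log, retries */
			continue;
		}
		submit_async(bufs[i]);
	}
	return pinned;
}

The waiting variant differs only in locking unconditionally and parking each buffer on a wait list for a later iowait instead of submitting async.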
Rather than - * bounce the buffer from a local wait list back to the original list - * after I/O completion, reuse the original list as the wait list. - */ - xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); + bp->b_flags |= XBF_WRITE; + xfs_buf_submit(bp); /* - * The buffer is now locked, under I/O and wait listed on the original - * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and - * return with the buffer unlocked and on the original queue. + * The buffer is now locked, under I/O but still on the original delwri + * queue. Wait for I/O completion, restore the DELWRI_Q flag and + * return with the buffer unlocked and still on the original queue. */ error = xfs_buf_iowait(bp); bp->b_flags |= _XBF_DELWRI_Q; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 209a389f2abc..9d2ab567cf81 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -27,19 +27,15 @@ struct xfs_buf; #define XBF_READ (1u << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1u << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */ -#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */ #define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */ #define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */ #define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */ /* buffer type flags for write callbacks */ -#define _XBF_INODES (1u << 16)/* inode buffer */ -#define _XBF_DQUOTS (1u << 17)/* dquot buffer */ #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ /* flags used only internally */ -#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1u << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ @@ -51,7 +47,6 @@ struct xfs_buf; #define XBF_LIVESCAN (1u << 28) #define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ #define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ typedef unsigned int xfs_buf_flags_t; @@ -60,31 +55,24 @@ typedef unsigned int xfs_buf_flags_t; { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ - { XBF_NO_IOACCT, "NO_IOACCT" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ - { _XBF_INODES, "INODES" }, \ - { _XBF_DQUOTS, "DQUOTS" }, \ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ - { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ /* The following interface flags should never be set */ \ { XBF_LIVESCAN, "LIVESCAN" }, \ { XBF_INCORE, "INCORE" }, \ - { XBF_TRYLOCK, "TRYLOCK" }, \ - { XBF_UNMAPPED, "UNMAPPED" } + { XBF_TRYLOCK, "TRYLOCK" } /* * Internal state flags. 
*/ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ -#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ struct xfs_buf_cache { - spinlock_t bc_lock; struct rhashtable bc_hash; }; @@ -121,15 +109,17 @@ struct xfs_buftarg { struct shrinker *bt_shrinker; struct list_lru bt_lru; - struct percpu_counter bt_io_count; + struct percpu_counter bt_readahead_count; struct ratelimit_state bt_ioerror_rl; + /* Atomic write unit values, bytes */ + unsigned int bt_bdev_awu_min; + unsigned int bt_bdev_awu_max; + /* built-in cache, if we're not using the perag one */ struct xfs_buf_cache bt_cache[]; }; -#define XB_PAGES 2 - struct xfs_buf_map { xfs_daddr_t bm_bn; /* block number for I/O */ int bm_len; /* size of I/O */ @@ -168,7 +158,7 @@ struct xfs_buf { xfs_daddr_t b_rhash_key; /* buffer cache index */ int b_length; /* size of buffer in BBs */ - atomic_t b_hold; /* reference count */ + unsigned int b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ @@ -180,10 +170,9 @@ struct xfs_buf { struct list_head b_lru; /* lru list */ spinlock_t b_lock; /* internal state lock */ unsigned int b_state; /* internal state flags */ - int b_io_error; /* internal IO error state */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; - struct xfs_perag *b_pag; /* contains rbtree root */ + struct xfs_perag *b_pag; struct xfs_mount *b_mount; struct xfs_buftarg *b_target; /* buffer target (device) */ void *b_addr; /* virtual address of buffer */ @@ -192,17 +181,12 @@ struct xfs_buf { struct xfs_buf_log_item *b_log_item; struct list_head b_li_list; /* Log items list head */ struct xfs_trans *b_transp; - struct page **b_pages; /* array of page pointers */ - struct page *b_page_array[XB_PAGES]; /* inline pages */ struct xfs_buf_map *b_maps; /* compound buffer map */ struct xfs_buf_map __b_map; /* inline compound buffer map */ int b_map_count; atomic_t b_pin_count; /* pin count */ - atomic_t b_io_remaining; /* #outstanding I/O requests */ - unsigned int b_page_count; /* size of page array */ - unsigned int b_offset; /* page offset of b_addr, - only for _XBF_KMEM buffers */ int b_error; /* error code on I/O */ + void (*b_iodone)(struct xfs_buf *bp); /* * async write failure retry count. 
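The new b_iodone member above is what replaces the _XBF_INODES/_XBF_DQUOTS completion flags: whoever attaches items to the buffer installs a completion callback instead of tagging the buffer with a type. In miniature (mybuf and the function names are hypothetical):

struct mybuf;
typedef void (*iodone_fn)(struct mybuf *bp);

struct mybuf {
	int		error;
	iodone_fn	iodone;		/* set to an inode-cluster or dquot
					 * completion handler when items are
					 * attached; NULL otherwise */
};

static void buf_ioend(struct mybuf *bp)
{
	if (bp->iodone)
		bp->iodone(bp);		/* one indirect call, no flag tests */
}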
Initialised to zero on the first @@ -289,11 +273,11 @@ xfs_buf_readahead( } int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, - xfs_buf_flags_t flags, struct xfs_buf **bpp); + struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, - size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, + size_t numblks, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); -int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags); +int _xfs_buf_read(struct xfs_buf *bp); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ @@ -320,12 +304,20 @@ extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); void xfs_buf_ioend_fail(struct xfs_buf *); -void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); #define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) /* Buffer Utility Routines */ -extern void *xfs_buf_offset(struct xfs_buf *, size_t); +static inline void *xfs_buf_offset(struct xfs_buf *bp, size_t offset) +{ + return bp->b_addr + offset; +} + +static inline void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize) +{ + memset(bp->b_addr + boff, 0, bsize); +} + extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ @@ -382,7 +374,7 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); -extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int); +int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 47549cfa61cd..90139e0f3271 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -57,24 +57,6 @@ xfs_buf_log_format_size( (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); } -static inline bool -xfs_buf_item_straddle( - struct xfs_buf *bp, - uint offset, - int first_bit, - int nbits) -{ - void *first, *last; - - first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT)); - last = xfs_buf_offset(bp, - offset + ((first_bit + nbits) << XFS_BLF_SHIFT)); - - if (last - first != nbits * XFS_BLF_CHUNK) - return true; - return false; -} - /* * Return the number of log iovecs and space needed to log the given buf log * item segment. @@ -91,11 +73,8 @@ xfs_buf_item_size_segment( int *nvecs, int *nbytes) { - struct xfs_buf *bp = bip->bli_buf; int first_bit; int nbits; - int next_bit; - int last_bit; first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (first_bit == -1) @@ -108,15 +87,6 @@ xfs_buf_item_size_segment( nbits = xfs_contig_bits(blfp->blf_data_map, blfp->blf_map_size, first_bit); ASSERT(nbits > 0); - - /* - * Straddling a page is rare because we don't log contiguous - * chunks of unmapped buffers anywhere. 
- */ - if (nbits > 1 && - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) - goto slow_scan; - (*nvecs)++; *nbytes += nbits * XFS_BLF_CHUNK; @@ -131,40 +101,25 @@ xfs_buf_item_size_segment( } while (first_bit != -1); return; +} -slow_scan: - /* Count the first bit we jumped out of the above loop from */ - (*nvecs)++; - *nbytes += XFS_BLF_CHUNK; - last_bit = first_bit; - while (last_bit != -1) { - /* - * This takes the bit number to start looking from and - * returns the next set bit from there. It returns -1 - * if there are no more bits set or the start bit is - * beyond the end of the bitmap. - */ - next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, - last_bit + 1); - /* - * If we run out of bits, leave the loop, - * else if we find a new set of bits bump the number of vecs, - * else keep scanning the current set of bits. - */ - if (next_bit == -1) { - break; - } else if (next_bit != last_bit + 1 || - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { - last_bit = next_bit; - first_bit = next_bit; - (*nvecs)++; - nbits = 1; - } else { - last_bit++; - nbits++; - } - *nbytes += XFS_BLF_CHUNK; - } +/* + * Compute the worst case log item overhead for an invalidated buffer with the + * given map count and block size. + */ +unsigned int +xfs_buf_inval_log_space( + unsigned int map_count, + unsigned int blocksize) +{ + unsigned int chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK); + unsigned int bitmap_size = DIV_ROUND_UP(chunks, NBWORD); + unsigned int ret = + offsetof(struct xfs_buf_log_format, blf_data_map) + + (bitmap_size * sizeof_field(struct xfs_buf_log_format, + blf_data_map[0])); + + return ret * map_count; } /* @@ -277,8 +232,6 @@ xfs_buf_item_format_segment( struct xfs_buf *bp = bip->bli_buf; uint base_size; int first_bit; - int last_bit; - int next_bit; uint nbits; /* copy the flags across from the base format item */ @@ -323,15 +276,6 @@ xfs_buf_item_format_segment( nbits = xfs_contig_bits(blfp->blf_data_map, blfp->blf_map_size, first_bit); ASSERT(nbits > 0); - - /* - * Straddling a page is rare because we don't log contiguous - * chunks of unmapped buffers anywhere. - */ - if (nbits > 1 && - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) - goto slow_scan; - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits); blfp->blf_size++; @@ -347,45 +291,6 @@ xfs_buf_item_format_segment( } while (first_bit != -1); return; - -slow_scan: - ASSERT(bp->b_addr == NULL); - last_bit = first_bit; - nbits = 1; - for (;;) { - /* - * This takes the bit number to start looking from and - * returns the next set bit from there. It returns -1 - * if there are no more bits set or the start bit is - * beyond the end of the bitmap. - */ - next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, - (uint)last_bit + 1); - /* - * If we run out of bits fill in the last iovec and get out of - * the loop. Else if we start a new set of bits then fill in - * the iovec for the series we were looking at and start - * counting the bits in the new one. Else we're still in the - * same set of bits so just keep counting and scanning. 
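To make the new xfs_buf_inval_log_space() helper concrete, here is the arithmetic for a single-map 4096-byte buffer, assuming the usual chunk size of 128 bytes (XFS_BLF_CHUNK) and 32-bit bitmap words (NBWORD == 32); the exact offsetof() value depends on the format structure layout:

/*
 *   chunks      = DIV_ROUND_UP(4096, 128) = 32 dirty-bitmap chunks
 *   bitmap_size = DIV_ROUND_UP(32, 32)    = 1 bitmap word
 *   per map     = offsetof(struct xfs_buf_log_format, blf_data_map)
 *                 + 1 * sizeof(blf_data_map[0])
 *
 * With map_count maps the reservation is simply per-map * map_count:
 * an invalidation logs only format headers and bitmaps, never the
 * buffer contents, so the worst case stays tiny.
 */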
- */ - if (next_bit == -1) { - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, - first_bit, nbits); - blfp->blf_size++; - break; - } else if (next_bit != last_bit + 1 || - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, - first_bit, nbits); - blfp->blf_size++; - first_bit = next_bit; - last_bit = next_bit; - nbits = 1; - } else { - last_bit++; - nbits++; - } - } } /* diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 4d8a6aece995..e10e324cd245 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -54,21 +54,19 @@ bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_inode_iodone(struct xfs_buf *); -void xfs_buf_inode_io_fail(struct xfs_buf *bp); #ifdef CONFIG_XFS_QUOTA void xfs_buf_dquot_iodone(struct xfs_buf *); -void xfs_buf_dquot_io_fail(struct xfs_buf *bp); #else static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp) { } -static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp) -{ -} #endif /* CONFIG_XFS_QUOTA */ void xfs_buf_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); +unsigned int xfs_buf_inval_log_space(unsigned int map_count, + unsigned int blocksize); + extern struct kmem_cache *xfs_buf_item_cache; #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 5180cbf5a90b..d4c5cef5bc43 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -25,6 +25,8 @@ #include "xfs_alloc.h" #include "xfs_ag.h" #include "xfs_sb.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" /* * This is the number of entries in the l_buf_cancel_table used during @@ -260,12 +262,18 @@ xlog_recover_validate_buf_type( case XFS_BMAP_MAGIC: bp->b_ops = &xfs_bmbt_buf_ops; break; + case XFS_RTRMAP_CRC_MAGIC: + bp->b_ops = &xfs_rtrmapbt_buf_ops; + break; case XFS_RMAP_CRC_MAGIC: bp->b_ops = &xfs_rmapbt_buf_ops; break; case XFS_REFC_CRC_MAGIC: bp->b_ops = &xfs_refcountbt_buf_ops; break; + case XFS_RTREFC_CRC_MAGIC: + bp->b_ops = &xfs_rtrefcountbt_buf_ops; + break; default: warnmsg = "Bad btree block magic!"; break; @@ -393,9 +401,18 @@ xlog_recover_validate_buf_type( break; #ifdef CONFIG_XFS_RT case XFS_BLFT_RTBITMAP_BUF: + if (xfs_has_rtgroups(mp) && magic32 != XFS_RTBITMAP_MAGIC) { + warnmsg = "Bad rtbitmap magic!"; + break; + } + bp->b_ops = xfs_rtblock_ops(mp, XFS_RTGI_BITMAP); + break; case XFS_BLFT_RTSUMMARY_BUF: - /* no magic numbers for verification of RT buffers */ - bp->b_ops = &xfs_rtbuf_ops; + if (xfs_has_rtgroups(mp) && magic32 != XFS_RTSUMMARY_MAGIC) { + warnmsg = "Bad rtsummary magic!"; + break; + } + bp->b_ops = xfs_rtblock_ops(mp, XFS_RTGI_SUMMARY); break; #endif /* CONFIG_XFS_RT */ default: @@ -704,6 +721,7 @@ xlog_recover_do_primary_sb_buffer( { struct xfs_dsb *dsb = bp->b_addr; xfs_agnumber_t orig_agcount = mp->m_sb.sb_agcount; + xfs_rgnumber_t orig_rgcount = mp->m_sb.sb_rgcount; int error; xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); @@ -722,17 +740,32 @@ xlog_recover_do_primary_sb_buffer( xfs_alert(mp, "Shrinking AG count in log recovery not supported"); return -EFSCORRUPTED; } + if (mp->m_sb.sb_rgcount < orig_rgcount) { + xfs_warn(mp, + "Shrinking rtgroup count in log recovery not supported"); + return -EFSCORRUPTED; + } /* - * Growfs can also grow the last existing AG. 
In this case we also need - * to update the length in the in-core perag structure and values - * depending on it. + * If the last AG was grown or shrunk, we also need to update the + * length in the in-core perag structure and values depending on it. */ error = xfs_update_last_ag_size(mp, orig_agcount); if (error) return error; /* + * If the last rtgroup was grown or shrunk, we also need to update the + * length in the in-core rtgroup structure and values depending on it. + * Ignore this on any filesystem with zero rtgroups. + */ + if (orig_rgcount > 0) { + error = xfs_update_last_rtgroup_size(mp, orig_rgcount); + if (error) + return error; + } + + /* * Initialize the new perags, and also update various block and inode * allocator setting based off the number of AGs or total blocks. * Because of the latter this also needs to happen if the agcount did @@ -745,6 +778,13 @@ xlog_recover_do_primary_sb_buffer( return error; } mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + + error = xfs_initialize_rtgroups(mp, orig_rgcount, mp->m_sb.sb_rgcount, + mp->m_sb.sb_rextents); + if (error) { + xfs_warn(mp, "Failed recovery rtgroup init: %d", error); + return error; + } return 0; } @@ -791,11 +831,20 @@ xlog_recover_get_buf_lsn( * UUIDs, so we must recover them immediately. */ blft = xfs_blft_from_flags(buf_f); - if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF) + if (!xfs_has_rtgroups(mp) && (blft == XFS_BLFT_RTBITMAP_BUF || + blft == XFS_BLFT_RTSUMMARY_BUF)) goto recover_immediately; magic32 = be32_to_cpu(*(__be32 *)blk); switch (magic32) { + case XFS_RTSUMMARY_MAGIC: + case XFS_RTBITMAP_MAGIC: { + struct xfs_rtbuf_blkinfo *hdr = blk; + + lsn = be64_to_cpu(hdr->rt_lsn); + uuid = &hdr->rt_uuid; + break; + } case XFS_ABTB_CRC_MAGIC: case XFS_ABTC_CRC_MAGIC: case XFS_ABTB_MAGIC: @@ -812,6 +861,8 @@ xlog_recover_get_buf_lsn( uuid = &btb->bb_u.s.bb_uuid; break; } + case XFS_RTRMAP_CRC_MAGIC: + case XFS_RTREFC_CRC_MAGIC: case XFS_BMAP_CRC_MAGIC: case XFS_BMAP_MAGIC: { struct xfs_btree_block *btb = blk; @@ -955,7 +1006,6 @@ xlog_recover_buf_commit_pass2( struct xfs_mount *mp = log->l_mp; struct xfs_buf *bp; int error; - uint buf_flags; xfs_lsn_t lsn; /* @@ -974,13 +1024,8 @@ xlog_recover_buf_commit_pass2( } trace_xfs_log_recover_buf_recover(log, buf_f); - - buf_flags = 0; - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) - buf_flags |= XBF_UNMAPPED; - error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags, &bp, NULL); + 0, &bp, NULL); if (error) return error; @@ -1036,12 +1081,33 @@ xlog_recover_buf_commit_pass2( error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f, current_lsn); if (error) - goto out_release; + goto out_writebuf; + + /* Update the rt superblock if we have one. */ + if (xfs_has_rtsb(mp) && mp->m_rtsb_bp) { + struct xfs_buf *rtsb_bp = mp->m_rtsb_bp; + + xfs_buf_lock(rtsb_bp); + xfs_buf_hold(rtsb_bp); + xfs_update_rtsb(rtsb_bp, bp); + rtsb_bp->b_flags |= _XBF_LOGRECOVERY; + xfs_buf_delwri_queue(rtsb_bp, buffer_list); + xfs_buf_relse(rtsb_bp); + } } else { xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); } /* + * Buffer held by buf log item during 'normal' buffer recovery must + * be committed through buffer I/O submission path to ensure proper + * release. When error occurs during sb buffer recovery, log shutdown + * will be done before submitting buffer list so that buffers can be + * released correctly through ioend failure path. + */ +out_writebuf: + + /* * Perform delayed write on the buffer. 
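Both shrink checks above enforce the same grow-only rule during log recovery, now applied to AGs and rtgroups alike. Reduced to its essence (a standalone sketch, with the kernel's EFSCORRUPTED spelled as a local constant):

#define SKETCH_EFSCORRUPTED 117	/* stand-in for the kernel's EFSCORRUPTED */

/* Group counts recovered from the log may grow but never shrink. */
static int validate_group_counts(unsigned int old_agcount,
		unsigned int new_agcount, unsigned int old_rgcount,
		unsigned int new_rgcount)
{
	if (new_agcount < old_agcount || new_rgcount < old_rgcount)
		return -SKETCH_EFSCORRUPTED;
	return 0;
}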
Asynchronous writes will be * slower when taking into account all the buffers to be flushed. * diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index 07bebbfb16ee..dcbfa274e06d 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -74,7 +74,7 @@ xmbuf_alloc( /* * We don't want to bother with kmapping data during repair, so don't - * allow highmem pages to back this mapping. + * allow highmem folios to back this mapping. */ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); @@ -117,7 +117,7 @@ xmbuf_free( struct xfs_buftarg *btp) { ASSERT(xfs_buftarg_is_mem(btp)); - ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); + ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); trace_xmbuf_free(btp); @@ -127,14 +127,13 @@ xmbuf_free( kfree(btp); } -/* Directly map a shmem page into the buffer cache. */ +/* Directly map a shmem folio into the buffer cache. */ int -xmbuf_map_page( +xmbuf_map_backing_mem( struct xfs_buf *bp) { struct inode *inode = file_inode(bp->b_target->bt_file); struct folio *folio = NULL; - struct page *page; loff_t pos = BBTOB(xfs_buf_daddr(bp)); int error; @@ -159,39 +158,17 @@ xmbuf_map_page( return -EIO; } - page = folio_file_page(folio, pos >> PAGE_SHIFT); - /* - * Mark the page dirty so that it won't be reclaimed once we drop the - * (potentially last) reference in xmbuf_unmap_page. + * Mark the folio dirty so that it won't be reclaimed once we drop the + * (potentially last) reference in xfs_buf_free. */ - set_page_dirty(page); - unlock_page(page); + folio_set_dirty(folio); + folio_unlock(folio); - bp->b_addr = page_address(page); - bp->b_pages = bp->b_page_array; - bp->b_pages[0] = page; - bp->b_page_count = 1; + bp->b_addr = folio_address(folio) + offset_in_folio(folio, pos); return 0; } -/* Unmap a shmem page that was mapped into the buffer cache. */ -void -xmbuf_unmap_page( - struct xfs_buf *bp) -{ - struct page *page = bp->b_pages[0]; - - ASSERT(xfs_buftarg_is_mem(bp->b_target)); - - put_page(page); - - bp->b_addr = NULL; - bp->b_pages[0] = NULL; - bp->b_pages = NULL; - bp->b_page_count = 0; -} - /* Is this a valid daddr within the buftarg? */ bool xmbuf_verify_daddr( @@ -205,7 +182,7 @@ xmbuf_verify_daddr( return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT); } -/* Discard the page backing this buffer. */ +/* Discard the folio backing this buffer. */ static void xmbuf_stale( struct xfs_buf *bp) @@ -220,7 +197,7 @@ xmbuf_stale( } /* - * Finalize a buffer -- discard the backing page if it's stale, or run the + * Finalize a buffer -- discard the backing folio if it's stale, or run the * write verifier to detect problems. */ int diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h index eed4a7b63232..67d525cc1513 100644 --- a/fs/xfs/xfs_buf_mem.h +++ b/fs/xfs/xfs_buf_mem.h @@ -19,16 +19,14 @@ int xmbuf_alloc(struct xfs_mount *mp, const char *descr, struct xfs_buftarg **btpp); void xmbuf_free(struct xfs_buftarg *btp); -int xmbuf_map_page(struct xfs_buf *bp); -void xmbuf_unmap_page(struct xfs_buf *bp); bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr); void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp); int xmbuf_finalize(struct xfs_buf *bp); #else # define xfs_buftarg_is_mem(...) (false) -# define xmbuf_map_page(...) (-ENOMEM) -# define xmbuf_unmap_page(...) ((void)0) # define xmbuf_verify_daddr(...) 
(false) #endif /* CONFIG_XFS_MEMORY_BUFS */ +int xmbuf_map_backing_mem(struct xfs_buf *bp); + #endif /* __XFS_BUF_MEM_H__ */ diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index d8c4a5dcca7a..94d0873bcd62 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -21,6 +21,7 @@ #include "xfs_ag.h" #include "xfs_health.h" #include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" /* * Notes on an efficient, low latency fstrim algorithm @@ -72,6 +73,8 @@ * extent search so that it overlaps in flight discard IO. */ +#define XFS_DISCARD_MAX_EXAMINE (100) + struct workqueue_struct *xfs_discard_wq; static void @@ -81,13 +84,13 @@ xfs_discard_endio_work( struct xfs_busy_extents *extents = container_of(work, struct xfs_busy_extents, endio_work); - xfs_extent_busy_clear(extents->mount, &extents->extent_list, false); + xfs_extent_busy_clear(&extents->extent_list, false); kfree(extents->owner); } /* * Queue up the actual completion to a thread to avoid IRQ-safe locking for - * pagb_lock. + * eb_lock. */ static void xfs_discard_endio( @@ -100,6 +103,24 @@ xfs_discard_endio( bio_put(bio); } +static inline struct block_device * +xfs_group_bdev( + const struct xfs_group *xg) +{ + struct xfs_mount *mp = xg->xg_mount; + + switch (xg->xg_type) { + case XG_TYPE_AG: + return mp->m_ddev_targp->bt_bdev; + case XG_TYPE_RTG: + return mp->m_rtdev_targp->bt_bdev; + default: + ASSERT(0); + break; + } + return NULL; +} + /* * Walk the discard list and issue discards on all the busy extents in the * list. We plug and chain the bios so that we only need a single completion @@ -117,11 +138,11 @@ xfs_discard_extents( blk_start_plug(&plug); list_for_each_entry(busyp, &extents->extent_list, list) { - trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, - busyp->length); + trace_xfs_discard_extent(busyp->group, busyp->bno, + busyp->length); - error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, - XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + error = __blkdev_issue_discard(xfs_group_bdev(busyp->group), + xfs_gbno_to_daddr(busyp->group, busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), GFP_KERNEL, &bio); if (error && error != -EOPNOTSUPP) { @@ -146,6 +167,14 @@ xfs_discard_extents( return error; } +/* + * Care must be taken setting up the trim cursor as the perags may not have been + * initialised when the cursor is initialised. e.g. a clean mount which hasn't + * read in AGFs and the first operation run on the mounted fs is a trim. This + * can result in perag fields that aren't initialised until + * xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG for + * the free space search. + */ struct xfs_trim_cur { xfs_agblock_t start; xfs_extlen_t count; @@ -160,13 +189,13 @@ xfs_trim_gather_extents( struct xfs_trim_cur *tcur, struct xfs_busy_extents *extents) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_trans *tp; struct xfs_btree_cur *cur; struct xfs_buf *agbp; int error; int i; - int batch = 100; + int batch = XFS_DISCARD_MAX_EXAMINE; /* * Force out the log. This means any transactions that might have freed @@ -183,6 +212,14 @@ xfs_trim_gather_extents( if (error) goto out_trans_cancel; + /* + * First time through tcur->count will not have been initialised as + * pag->pagf_longest is not guaranteed to be valid before we read + * the AGF buffer above. 
+ */ + if (!tcur->count) + tcur->count = pag->pagf_longest; + if (tcur->by_bno) { /* sub-AG discard request always starts at tcur->start */ cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag); @@ -239,11 +276,11 @@ xfs_trim_gather_extents( * overlapping ranges for now. */ if (fbno + flen < tcur->start) { - trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen); + trace_xfs_discard_exclude(pag_group(pag), fbno, flen); goto next_extent; } if (fbno > tcur->end) { - trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen); + trace_xfs_discard_exclude(pag_group(pag), fbno, flen); if (tcur->by_bno) { tcur->count = 0; break; @@ -261,7 +298,7 @@ xfs_trim_gather_extents( /* Too small? Give up. */ if (flen < tcur->minlen) { - trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); + trace_xfs_discard_toosmall(pag_group(pag), fbno, flen); if (tcur->by_bno) goto next_extent; tcur->count = 0; @@ -272,12 +309,12 @@ xfs_trim_gather_extents( * If any blocks in the range are still busy, skip the * discard and try again the next time. */ - if (xfs_extent_busy_search(mp, pag, fbno, flen)) { - trace_xfs_discard_busy(mp, pag->pag_agno, fbno, flen); + if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) { + trace_xfs_discard_busy(pag_group(pag), fbno, flen); goto next_extent; } - xfs_extent_busy_insert_discard(pag, fbno, flen, + xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen, &extents->extent_list); next_extent: if (tcur->by_bno) @@ -301,7 +338,7 @@ next_extent: * we aren't going to issue a discard on them any more. */ if (error) - xfs_extent_busy_clear(mp, &extents->extent_list, false); + xfs_extent_busy_clear(&extents->extent_list, false); out_del_cursor: xfs_btree_del_cursor(cur, error); out_trans_cancel: @@ -329,13 +366,12 @@ xfs_trim_perag_extents( { struct xfs_trim_cur tcur = { .start = start, - .count = pag->pagf_longest, .end = end, .minlen = minlen, }; int error = 0; - if (start != 0 || end != pag->block_count) + if (start != 0 || end != pag_group(pag)->xg_block_count) tcur.by_bno = true; do { @@ -347,7 +383,6 @@ xfs_trim_perag_extents( break; } - extents->mount = pag->pag_mount; extents->owner = extents; INIT_LIST_HEAD(&extents->extent_list); @@ -367,7 +402,7 @@ xfs_trim_perag_extents( * list after this function call, as it may have been freed by * the time control returns to us. 
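The gather loop above applies three filters before an extent is queued for discard: range, minimum length, and busy state. A minimal model of that policy (ext is a made-up type; is_busy stands in for xfs_extent_busy_search()):

struct ext {
	unsigned long	bno;	/* first block of the free extent */
	unsigned long	len;	/* length in blocks */
};

/* Return nonzero if a free extent should be queued for discard. */
static int want_discard(const struct ext *e, unsigned long start,
		unsigned long end, unsigned long minlen,
		int (*is_busy)(const struct ext *e))
{
	if (e->bno + e->len < start || e->bno > end)
		return 0;	/* outside the requested trim range */
	if (e->len < minlen)
		return 0;	/* too small to be worth a discard bio */
	if (is_busy(e))
		return 0;	/* freed by a still-uncommitted transaction */
	return 1;
}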
*/ - error = xfs_discard_extents(pag->pag_mount, extents); + error = xfs_discard_extents(pag_mount(pag), extents); if (error) break; @@ -389,8 +424,8 @@ xfs_trim_datadev_extents( { xfs_agnumber_t start_agno, end_agno; xfs_agblock_t start_agbno, end_agbno; + struct xfs_perag *pag = NULL; xfs_daddr_t ddev_end; - struct xfs_perag *pag; int last_error = 0, error; ddev_end = min_t(xfs_daddr_t, end, @@ -401,10 +436,10 @@ xfs_trim_datadev_extents( end_agno = xfs_daddr_to_agno(mp, ddev_end); end_agbno = xfs_daddr_to_agbno(mp, ddev_end); - for_each_perag_range(mp, start_agno, end_agno, pag) { - xfs_agblock_t agend = pag->block_count; + while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) { + xfs_agblock_t agend = pag_group(pag)->xg_block_count; - if (start_agno == end_agno) + if (pag_agno(pag) == end_agno) agend = end_agbno; error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen); if (error) @@ -479,7 +514,7 @@ xfs_discard_rtdev_extents( trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length); error = __blkdev_issue_discard(bdev, - XFS_FSB_TO_BB(mp, busyp->bno), + xfs_rtb_to_daddr(mp, busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), GFP_NOFS, &bio); if (error) @@ -506,7 +541,7 @@ xfs_discard_rtdev_extents( static int xfs_trim_gather_rtextent( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) @@ -525,12 +560,12 @@ xfs_trim_gather_rtextent( return -ECANCELED; } - rbno = xfs_rtx_to_rtb(mp, rec->ar_startext); - rlen = xfs_rtx_to_rtb(mp, rec->ar_extcount); + rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext); + rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount); /* Ignore too small. */ if (rlen < tr->minlen_fsb) { - trace_xfs_discard_rttoosmall(mp, rbno, rlen); + trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen); return 0; } @@ -547,70 +582,185 @@ xfs_trim_gather_rtextent( return 0; } +/* Trim extents on an !rtgroups realtime device */ static int -xfs_trim_rtdev_extents( - struct xfs_mount *mp, - xfs_daddr_t start, - xfs_daddr_t end, +xfs_trim_rtextents( + struct xfs_rtgroup *rtg, + xfs_rtxnum_t low, + xfs_rtxnum_t high, xfs_daddr_t minlen) { + struct xfs_mount *mp = rtg_mount(rtg); struct xfs_trim_rtdev tr = { .minlen_fsb = XFS_BB_TO_FSB(mp, minlen), + .extent_list = LIST_HEAD_INIT(tr.extent_list), }; - xfs_rtxnum_t low, high; struct xfs_trans *tp; - xfs_daddr_t rtdev_daddr; int error; - INIT_LIST_HEAD(&tr.extent_list); - - /* Shift the start and end downwards to match the rt device. */ - rtdev_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); - if (start > rtdev_daddr) - start -= rtdev_daddr; - else - start = 0; - - if (end <= rtdev_daddr) - return 0; - end -= rtdev_daddr; - error = xfs_trans_alloc_empty(mp, &tp); if (error) return error; - end = min_t(xfs_daddr_t, end, - XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks) - 1); - - /* Convert the rt blocks to rt extents */ - low = xfs_rtb_to_rtxup(mp, XFS_BB_TO_FSB(mp, start)); - high = xfs_rtb_to_rtx(mp, XFS_BB_TO_FSBT(mp, end)); - /* * Walk the free ranges between low and high. The query_range function * trims the extents returned. 
*/ do { - tr.stop_rtx = low + (mp->m_sb.sb_blocksize * NBBY); - xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); - error = xfs_rtalloc_query_range(mp, tp, low, high, + tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp); + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + error = xfs_rtalloc_query_range(rtg, tp, low, high, xfs_trim_gather_rtextent, &tr); if (error == -ECANCELED) error = 0; if (error) { - xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); xfs_discard_free_rtdev_extents(&tr); break; } if (list_empty(&tr.extent_list)) { - xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); break; } error = xfs_discard_rtdev_extents(mp, &tr); - xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + if (error) + break; + + low = tr.restart_rtx; + } while (!xfs_trim_should_stop() && low <= high); + + xfs_trans_cancel(tp); + return error; +} + +struct xfs_trim_rtgroup { + /* list of rtgroup extents to free */ + struct xfs_busy_extents *extents; + + /* minimum length that caller allows us to trim */ + xfs_rtblock_t minlen_fsb; + + /* restart point for the rtbitmap walk */ + xfs_rtxnum_t restart_rtx; + + /* number of extents to examine before stopping to issue discard ios */ + int batch; + + /* number of extents queued for discard */ + int queued; +}; + +static int +xfs_trim_gather_rtgroup_extent( + struct xfs_rtgroup *rtg, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_trim_rtgroup *tr = priv; + xfs_rgblock_t rgbno; + xfs_extlen_t len; + + if (--tr->batch <= 0) { + /* + * If we've checked a large number of extents, update the + * cursor to point at this extent so we restart the next batch + * from this extent. + */ + tr->restart_rtx = rec->ar_startext; + return -ECANCELED; + } + + rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext); + len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount); + + /* Ignore too small. */ + if (len < tr->minlen_fsb) { + trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len); + return 0; + } + + /* + * If any blocks in the range are still busy, skip the discard and try + * again the next time. + */ + if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) { + trace_xfs_discard_busy(rtg_group(rtg), rgbno, len); + return 0; + } + + xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len, + &tr->extents->extent_list); + + tr->queued++; + tr->restart_rtx = rec->ar_startext + rec->ar_extcount; + return 0; +} + +/* Trim extents in this rtgroup using the busy extent machinery. */ +static int +xfs_trim_rtgroup_extents( + struct xfs_rtgroup *rtg, + xfs_rtxnum_t low, + xfs_rtxnum_t high, + xfs_daddr_t minlen) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_trim_rtgroup tr = { + .minlen_fsb = XFS_BB_TO_FSB(mp, minlen), + }; + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + /* + * Walk the free ranges between low and high. The query_range function + * trims the extents returned. 
+ */ + do { + tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL); + if (!tr.extents) { + error = -ENOMEM; + break; + } + + tr.queued = 0; + tr.batch = XFS_DISCARD_MAX_EXAMINE; + tr.extents->owner = tr.extents; + INIT_LIST_HEAD(&tr.extents->extent_list); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + error = xfs_rtalloc_query_range(rtg, tp, low, high, + xfs_trim_gather_rtgroup_extent, &tr); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + if (error == -ECANCELED) + error = 0; + if (error) { + kfree(tr.extents); + break; + } + + if (!tr.queued) + break; + + /* + * We hand the extent list to the discard function here so the + * discarded extents can be removed from the busy extent list. + * This allows the discards to run asynchronously with + * gathering the next round of extents to discard. + * + * However, we must ensure that we do not reference the extent + * list after this function call, as it may have been freed by + * the time control returns to us. + */ + error = xfs_discard_extents(rtg_mount(rtg), tr.extents); if (error) break; @@ -620,6 +770,63 @@ xfs_trim_rtdev_extents( xfs_trans_cancel(tp); return error; } + +static int +xfs_trim_rtdev_extents( + struct xfs_mount *mp, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen) +{ + xfs_rtblock_t start_rtbno, end_rtbno; + xfs_rtxnum_t start_rtx, end_rtx; + xfs_rgnumber_t start_rgno, end_rgno; + xfs_daddr_t daddr_offset; + int last_error = 0, error; + struct xfs_rtgroup *rtg = NULL; + + /* Shift the start and end downwards to match the rt device. */ + daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (start > daddr_offset) + start -= daddr_offset; + else + start = 0; + start_rtbno = xfs_daddr_to_rtb(mp, start); + start_rtx = xfs_rtb_to_rtx(mp, start_rtbno); + start_rgno = xfs_rtb_to_rgno(mp, start_rtbno); + + if (end <= daddr_offset) + return 0; + else + end -= daddr_offset; + end_rtbno = xfs_daddr_to_rtb(mp, end); + end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1); + end_rgno = xfs_rtb_to_rgno(mp, end_rtbno); + + while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) { + xfs_rtxnum_t rtg_end = rtg->rtg_extents; + + if (rtg_rgno(rtg) == end_rgno) + rtg_end = min(rtg_end, end_rtx); + + if (xfs_has_rtgroups(mp)) + error = xfs_trim_rtgroup_extents(rtg, start_rtx, + rtg_end, minlen); + else + error = xfs_trim_rtextents(rtg, start_rtx, rtg_end, + minlen); + if (error) + last_error = error; + + if (xfs_trim_should_stop()) { + xfs_rtgroup_rele(rtg); + break; + } + start_rtx = 0; + } + + return last_error; +} #else # define xfs_trim_rtdev_extents(...) (-EOPNOTSUPP) #endif /* CONFIG_XFS_RT */ @@ -652,7 +859,8 @@ xfs_ioc_trim( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (mp->m_rtdev_targp && + + if (mp->m_rtdev_targp && !xfs_has_zoned(mp) && bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev)) rt_bdev = mp->m_rtdev_targp->bt_bdev; if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index c1b211c260a9..b4e32f0860b7 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -69,6 +69,31 @@ xfs_dquot_mark_sick( } /* + * Detach the dquot buffer if it's still attached, because we can get called + * through dqpurge after a log shutdown. Caller must hold the dqflock or have + * otherwise isolated the dquot. 
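+ *
+ * Note the lock order below: qli_lock only protects the li_buf pointer,
+ * so the pointer is sampled and cleared under that spinlock, and the
+ * buffer lock is only taken after qli_lock has been dropped.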
+ */ +void +xfs_dquot_detach_buf( + struct xfs_dquot *dqp) +{ + struct xfs_dq_logitem *qlip = &dqp->q_logitem; + struct xfs_buf *bp = NULL; + + spin_lock(&qlip->qli_lock); + if (qlip->qli_item.li_buf) { + bp = qlip->qli_item.li_buf; + qlip->qli_item.li_buf = NULL; + } + spin_unlock(&qlip->qli_lock); + if (bp) { + xfs_buf_lock(bp); + list_del_init(&qlip->qli_item.li_bio_list); + xfs_buf_relse(bp); + } +} + +/* * This is called to free all the memory associated with a dquot */ void @@ -76,6 +101,7 @@ xfs_qm_dqdestroy( struct xfs_dquot *dqp) { ASSERT(list_empty(&dqp->q_lru)); + ASSERT(dqp->q_logitem.qli_item.li_buf == NULL); kvfree(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); @@ -277,6 +303,25 @@ xfs_qm_init_dquot_blk( xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } +static void +xfs_dquot_set_prealloc( + struct xfs_dquot_pre *pre, + const struct xfs_dquot_res *res) +{ + xfs_qcnt_t space; + + pre->q_prealloc_hi_wmark = res->hardlimit; + pre->q_prealloc_lo_wmark = res->softlimit; + + space = div_u64(pre->q_prealloc_hi_wmark, 100); + if (!pre->q_prealloc_lo_wmark) + pre->q_prealloc_lo_wmark = space * 95; + + pre->q_low_space[XFS_QLOWSP_1_PCNT] = space; + pre->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; + pre->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; +} + /* * Initialize the dynamic speculative preallocation thresholds. The lo/hi * watermarks correspond to the soft and hard limits by default. If a soft limit @@ -285,22 +330,8 @@ xfs_qm_init_dquot_blk( void xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) { - uint64_t space; - - dqp->q_prealloc_hi_wmark = dqp->q_blk.hardlimit; - dqp->q_prealloc_lo_wmark = dqp->q_blk.softlimit; - if (!dqp->q_prealloc_lo_wmark) { - dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; - do_div(dqp->q_prealloc_lo_wmark, 100); - dqp->q_prealloc_lo_wmark *= 95; - } - - space = dqp->q_prealloc_hi_wmark; - - do_div(space, 100); - dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space; - dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; - dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; + xfs_dquot_set_prealloc(&dqp->q_blk_prealloc, &dqp->q_blk); + xfs_dquot_set_prealloc(&dqp->q_rtb_prealloc, &dqp->q_rtb); } /* @@ -983,6 +1014,7 @@ xfs_qm_dqget_inode( xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(xfs_inode_dquot(ip, type) == NULL); + ASSERT(!xfs_is_metadir_inode(ip)); id = xfs_qm_id_for_quotatype(ip, type); @@ -1136,9 +1168,11 @@ static void xfs_qm_dqflush_done( struct xfs_log_item *lip) { - struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; - struct xfs_dquot *dqp = qip->qli_dquot; + struct xfs_dq_logitem *qlip = + container_of(lip, struct xfs_dq_logitem, qli_item); + struct xfs_dquot *dqp = qlip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; + struct xfs_buf *bp = NULL; xfs_lsn_t tail_lsn; /* @@ -1150,12 +1184,11 @@ xfs_qm_dqflush_done( * holding the lock before removing the dquot from the AIL. 
*/ if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) && - ((lip->li_lsn == qip->qli_flush_lsn) || + (lip->li_lsn == qlip->qli_flush_lsn || test_bit(XFS_LI_FAILED, &lip->li_flags))) { - spin_lock(&ailp->ail_lock); - xfs_clear_li_failed(lip); - if (lip->li_lsn == qip->qli_flush_lsn) { + clear_bit(XFS_LI_FAILED, &lip->li_flags); + if (lip->li_lsn == qlip->qli_flush_lsn) { /* xfs_ail_update_finish() drops the AIL lock */ tail_lsn = xfs_ail_delete_one(ailp, lip); xfs_ail_update_finish(ailp, tail_lsn); @@ -1165,6 +1198,20 @@ } /* + * If this dquot hasn't been dirtied since initiating the last dqflush, + * release the buffer reference. We already unlinked this dquot item + * from the buffer. + */ + spin_lock(&qlip->qli_lock); + if (!qlip->qli_dirty) { + bp = lip->li_buf; + lip->li_buf = NULL; + } + spin_unlock(&qlip->qli_lock); + if (bp) + xfs_buf_rele(bp); + + /* * Release the dq's flush lock since we're done with it. */ xfs_dqfunlock(dqp); @@ -1182,18 +1229,6 @@ xfs_buf_dquot_iodone( } } -void -xfs_buf_dquot_io_fail( - struct xfs_buf *bp) -{ - struct xfs_log_item *lip; - - spin_lock(&bp->b_mount->m_ail->ail_lock); - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) - xfs_set_li_failed(lip, bp); - spin_unlock(&bp->b_mount->m_ail->ail_lock); -} - /* Check incore dquot for errors before we flush. */ static xfs_failaddr_t xfs_qm_dqflush_check( @@ -1233,6 +1268,115 @@ } /* + * Get the buffer containing the on-disk dquot. + * + * Requires dquot flush lock, will clear the dirty flag, delete the quota log + * item from the AIL, and shut down the system if something goes wrong. + */ +static int +xfs_dquot_read_buf( + struct xfs_trans *tp, + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_buf *bp = NULL; + int error; + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, + &bp, &xfs_dquot_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_dquot_mark_sick(dqp); + if (error) + goto out_abort; + + *bpp = bp; + return 0; + +out_abort: + dqp->q_flags &= ~XFS_DQFLAG_DIRTY; + xfs_trans_ail_delete(&dqp->q_logitem.qli_item, 0); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; +} + +/* + * Attach a dquot buffer to this dquot to avoid allocating a buffer during a + * dqflush, since dqflush can be called from reclaim context. Caller must hold + * the dqlock. + */ +int +xfs_dquot_attach_buf( + struct xfs_trans *tp, + struct xfs_dquot *dqp) +{ + struct xfs_dq_logitem *qlip = &dqp->q_logitem; + struct xfs_log_item *lip = &qlip->qli_item; + int error; + + spin_lock(&qlip->qli_lock); + if (!lip->li_buf) { + struct xfs_buf *bp = NULL; + + spin_unlock(&qlip->qli_lock); + error = xfs_dquot_read_buf(tp, dqp, &bp); + if (error) + return error; + + /* + * Hold the dquot buffer so that we retain our ref to it after + * detaching it from the transaction, then give that ref to the + * dquot log item so that the AIL does not have to read the + * dquot buffer to push this item. + */ + xfs_buf_hold(bp); + xfs_trans_brelse(tp, bp); + + spin_lock(&qlip->qli_lock); + lip->li_buf = bp; + } + qlip->qli_dirty = true; + spin_unlock(&qlip->qli_lock); + + return 0; +} + +/* + * Get a new reference to the dquot buffer attached to this dquot for a + * dqflush operation. + * + * Returns 0 and a NULL bp if none was attached to the dquot; 0 and a locked + * bp; or -EAGAIN if the buffer could not be locked.
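+ *
+ * A minimal sketch of the expected call sequence, assuming the caller
+ * holds the dquot lock and flush lock:
+ *
+ *	error = xfs_dquot_use_attached_buf(dqp, &bp);
+ *	if (error == -EAGAIN)
+ *		back off and retry the flush later;
+ *	if (!bp)
+ *		xfs_dqfunlock(dqp);	(shut down; nothing to flush)
+ *	else
+ *		error = xfs_qm_dqflush(dqp, bp);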
+ */ +int +xfs_dquot_use_attached_buf( + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_buf *bp = dqp->q_logitem.qli_item.li_buf; + + /* + * A NULL buffer can happen if the dquot dirty flag was set but the + * filesystem shut down before transaction commit happened. In that + * case we're not going to flush anyway. + */ + if (!bp) { + ASSERT(xfs_is_shutdown(dqp->q_mount)); + + *bpp = NULL; + return 0; + } + + if (!xfs_buf_trylock(bp)) + return -EAGAIN; + + xfs_buf_hold(bp); + *bpp = bp; + return 0; +} + +/* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. * The flush lock will not be unlocked until the dquot reaches the disk, @@ -1243,11 +1387,11 @@ xfs_qm_dqflush_check( int xfs_qm_dqflush( struct xfs_dquot *dqp, - struct xfs_buf **bpp) + struct xfs_buf *bp) { struct xfs_mount *mp = dqp->q_mount; - struct xfs_log_item *lip = &dqp->q_logitem.qli_item; - struct xfs_buf *bp; + struct xfs_dq_logitem *qlip = &dqp->q_logitem; + struct xfs_log_item *lip = &qlip->qli_item; struct xfs_dqblk *dqblk; xfs_failaddr_t fa; int error; @@ -1257,28 +1401,12 @@ xfs_qm_dqflush( trace_xfs_dqflush(dqp); - *bpp = NULL; - xfs_qm_dqunpin_wait(dqp); - /* - * Get the buffer containing the on-disk dquot - */ - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, - &bp, &xfs_dquot_buf_ops); - if (error == -EAGAIN) - goto out_unlock; - if (xfs_metadata_is_sick(error)) - xfs_dquot_mark_sick(dqp); - if (error) - goto out_abort; - fa = xfs_qm_dqflush_check(dqp); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", dqp->q_id, fa); - xfs_buf_relse(bp); xfs_dquot_mark_sick(dqp); error = -EFSCORRUPTED; goto out_abort; @@ -1293,8 +1421,15 @@ xfs_qm_dqflush( */ dqp->q_flags &= ~XFS_DQFLAG_DIRTY; - xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, - &dqp->q_logitem.qli_item.li_lsn); + /* + * We hold the dquot lock, so nobody can dirty it while we're + * scheduling the write out. Clear the dirty-since-flush flag. + */ + spin_lock(&qlip->qli_lock); + qlip->qli_dirty = false; + spin_unlock(&qlip->qli_lock); + + xfs_trans_ail_copy_lsn(mp->m_ail, &qlip->qli_flush_lsn, &lip->li_lsn); /* * copy the lsn into the on-disk dquot now while we have the in memory @@ -1306,7 +1441,7 @@ xfs_qm_dqflush( * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_has_crc(mp)) { - dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + dqblk->dd_lsn = cpu_to_be64(lip->li_lsn); xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } @@ -1315,8 +1450,8 @@ xfs_qm_dqflush( * Attach the dquot to the buffer so that we can remove this dquot from * the AIL and release the flush lock once the dquot is synced to disk. 
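 *
 * The buffer is already attached to the log item (li_buf) by
 * xfs_dquot_attach_buf(), so all that is left here is to set the iodone
 * handler and link the item onto the buffer's item list.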
*/ - bp->b_flags |= _XBF_DQUOTS; - list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list); + bp->b_iodone = xfs_buf_dquot_iodone; + list_add_tail(&lip->li_bio_list, &bp->b_li_list); /* * If the buffer is pinned then push on the log so we won't @@ -1328,14 +1463,12 @@ xfs_qm_dqflush( } trace_xfs_dqflush_done(dqp); - *bpp = bp; return 0; out_abort: dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_delete(lip, 0); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -out_unlock: xfs_dqfunlock(dqp); return error; } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 677bb2dc9ac9..61217adf5ba5 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -56,6 +56,12 @@ xfs_dquot_res_over_limits( return false; } +struct xfs_dquot_pre { + xfs_qcnt_t q_prealloc_lo_wmark; + xfs_qcnt_t q_prealloc_hi_wmark; + int64_t q_low_space[XFS_QLOWSP_MAX]; +}; + /* * The incore dquot structure */ @@ -76,9 +82,9 @@ struct xfs_dquot { struct xfs_dq_logitem q_logitem; - xfs_qcnt_t q_prealloc_lo_wmark; - xfs_qcnt_t q_prealloc_hi_wmark; - int64_t q_low_space[XFS_QLOWSP_MAX]; + struct xfs_dquot_pre q_blk_prealloc; + struct xfs_dquot_pre q_rtb_prealloc; + struct mutex q_qlock; struct completion q_flush; atomic_t q_pincount; @@ -154,6 +160,9 @@ static inline struct xfs_dquot *xfs_inode_dquot( struct xfs_inode *ip, xfs_dqtype_t type) { + if (xfs_is_metadir_inode(ip)) + return NULL; + switch (type) { case XFS_DQTYPE_USER: return ip->i_udquot; @@ -192,7 +201,11 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) int64_t freesp; freesp = dqp->q_blk.hardlimit - dqp->q_blk.reserved; - if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT]) + if (freesp < dqp->q_blk_prealloc.q_low_space[XFS_QLOWSP_1_PCNT]) + return true; + + freesp = dqp->q_rtb.hardlimit - dqp->q_rtb.reserved; + if (freesp < dqp->q_rtb_prealloc.q_low_space[XFS_QLOWSP_1_PCNT]) return true; return false; @@ -204,7 +217,7 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) void xfs_qm_dqdestroy(struct xfs_dquot *dqp); -int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); +int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf *bp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); void xfs_qm_adjust_dqtimers(struct xfs_dquot *d); void xfs_qm_adjust_dqlimits(struct xfs_dquot *d); @@ -227,6 +240,10 @@ void xfs_dqlockn(struct xfs_dqtrx *q); void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); +int xfs_dquot_attach_buf(struct xfs_trans *tp, struct xfs_dquot *dqp); +int xfs_dquot_use_attached_buf(struct xfs_dquot *dqp, struct xfs_buf **bpp); +void xfs_dquot_detach_buf(struct xfs_dquot *dqp); + static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { xfs_dqlock(dqp); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 7d19091215b0..271b195ebb93 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -123,8 +123,9 @@ xfs_qm_dquot_logitem_push( __releases(&lip->li_ailp->ail_lock) __acquires(&lip->li_ailp->ail_lock) { - struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - struct xfs_buf *bp = lip->li_buf; + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_dquot *dqp = qlip->qli_dquot; + struct xfs_buf *bp; uint rval = XFS_ITEM_SUCCESS; int error; @@ -155,14 +156,25 @@ xfs_qm_dquot_logitem_push( spin_unlock(&lip->li_ailp->ail_lock); - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_use_attached_buf(dqp, &bp); + if (error == -EAGAIN) { + xfs_dqfunlock(dqp); + rval = XFS_ITEM_LOCKED; + 
goto out_relock_ail; + } + + /* + * dqflush completes dqflock on error, and the delwri ioend does it on + * success. + */ + error = xfs_qm_dqflush(dqp, bp); if (!error) { if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; - xfs_buf_relse(bp); - } else if (error == -EAGAIN) - rval = XFS_ITEM_LOCKED; + } + xfs_buf_relse(bp); +out_relock_ail: spin_lock(&lip->li_ailp->ail_lock); out_unlock: xfs_dqunlock(dqp); @@ -195,12 +207,10 @@ xfs_qm_dquot_logitem_committing( } #ifdef DEBUG_EXPENSIVE -static int -xfs_qm_dquot_logitem_precommit( - struct xfs_trans *tp, - struct xfs_log_item *lip) +static void +xfs_qm_dquot_logitem_precommit_check( + struct xfs_dquot *dqp) { - struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; struct xfs_mount *mp = dqp->q_mount; struct xfs_disk_dquot ddq = { }; xfs_failaddr_t fa; @@ -216,13 +226,24 @@ xfs_qm_dquot_logitem_precommit( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); ASSERT(fa == NULL); } - - return 0; } #else -# define xfs_qm_dquot_logitem_precommit NULL +# define xfs_qm_dquot_logitem_precommit_check(...) ((void)0) #endif +static int +xfs_qm_dquot_logitem_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_dquot *dqp = qlip->qli_dquot; + + xfs_qm_dquot_logitem_precommit_check(dqp); + + return xfs_dquot_attach_buf(tp, dqp); +} + static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, .iop_precommit = xfs_qm_dquot_logitem_precommit, @@ -247,5 +268,7 @@ xfs_qm_dquot_logitem_init( xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, &xfs_dquot_item_ops); + spin_lock_init(&lp->qli_lock); lp->qli_dquot = dqp; + lp->qli_dirty = false; } diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 794710c24474..d66e52807d76 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -14,6 +14,13 @@ struct xfs_dq_logitem { struct xfs_log_item qli_item; /* common portion */ struct xfs_dquot *qli_dquot; /* dquot ptr */ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ + + /* + * We use this spinlock to coordinate access to the li_buf pointer in + * the log item and the qli_dirty flag. + */ + spinlock_t qli_lock; + bool qli_dirty; /* dirtied since last flush? */ }; void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp); diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c index 7bdb9688c0f5..fa5f31931efd 100644 --- a/fs/xfs/xfs_drain.c +++ b/fs/xfs/xfs_drain.c @@ -13,28 +13,28 @@ #include "xfs_trace.h" /* - * Use a static key here to reduce the overhead of xfs_drain_rele. If the - * compiler supports jump labels, the static branch will be replaced by a nop - * sled when there are no xfs_drain_wait callers. Online fsck is currently - * the only caller, so this is a reasonable tradeoff. + * Use a static key here to reduce the overhead of xfs_defer_drain_rele. If + * the compiler supports jump labels, the static branch will be replaced by a + * nop sled when there are no xfs_defer_drain_wait callers. Online fsck is + * currently the only caller, so this is a reasonable tradeoff. * * Note: Patching the kernel code requires taking the cpu hotplug lock. Other * parts of the kernel allocate memory with that lock held, which means that * XFS callers cannot hold any locks that might be used by memory reclaim or * writeback when calling the static_branch_{inc,dec} functions. 
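 *
 * A waiter enables the gate for as long as it might wait, sketched as:
 *
 *	xfs_defer_drain_wait_enable();
 *	...
 *	error = xfs_group_intent_drain(xg);	(may sleep)
 *	...
 *	xfs_defer_drain_wait_disable();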
*/ -static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate); +static DEFINE_STATIC_KEY_FALSE(xfs_defer_drain_waiter_gate); void -xfs_drain_wait_disable(void) +xfs_defer_drain_wait_disable(void) { - static_branch_dec(&xfs_drain_waiter_gate); + static_branch_dec(&xfs_defer_drain_waiter_gate); } void -xfs_drain_wait_enable(void) +xfs_defer_drain_wait_enable(void) { - static_branch_inc(&xfs_drain_waiter_gate); + static_branch_inc(&xfs_defer_drain_waiter_gate); } void @@ -71,7 +71,7 @@ static inline bool has_waiters(struct wait_queue_head *wq_head) static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr) { if (atomic_dec_and_test(&dr->dr_count) && - static_branch_unlikely(&xfs_drain_waiter_gate) && + static_branch_unlikely(&xfs_defer_drain_waiter_gate) && has_waiters(&dr->dr_waiters)) wake_up(&dr->dr_waiters); } @@ -94,55 +94,39 @@ static inline int xfs_defer_drain_wait(struct xfs_defer_drain *dr) } /* - * Get a passive reference to the AG that contains a fsbno and declare an intent - * to update its metadata. + * Get a passive reference to the group that contains a fsbno and declare an + * intent to update its metadata. + * + * Other threads that need exclusive access can decide to back off if they see + * declared intentions. */ -struct xfs_perag * -xfs_perag_intent_get( +struct xfs_group * +xfs_group_intent_get( struct xfs_mount *mp, - xfs_fsblock_t fsbno) + xfs_fsblock_t fsbno, + enum xfs_group_type type) { - struct xfs_perag *pag; + struct xfs_group *xg; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, fsbno)); - if (!pag) + xg = xfs_group_get_by_fsb(mp, fsbno, type); + if (!xg) return NULL; - - xfs_perag_intent_hold(pag); - return pag; -} - -/* - * Release our intent to update this AG's metadata, and then release our - * passive ref to the AG. - */ -void -xfs_perag_intent_put( - struct xfs_perag *pag) -{ - xfs_perag_intent_rele(pag); - xfs_perag_put(pag); + trace_xfs_group_intent_hold(xg, __return_address); + xfs_defer_drain_grab(&xg->xg_intents_drain); + return xg; } /* - * Declare an intent to update AG metadata. Other threads that need exclusive - * access can decide to back off if they see declared intentions. + * Release our intent to update this group's metadata, and then release our + * passive ref to it. */ void -xfs_perag_intent_hold( - struct xfs_perag *pag) +xfs_group_intent_put( + struct xfs_group *xg) { - trace_xfs_perag_intent_hold(pag, __return_address); - xfs_defer_drain_grab(&pag->pag_intents_drain); -} - -/* Release our intent to update this AG's metadata. */ -void -xfs_perag_intent_rele( - struct xfs_perag *pag) -{ - trace_xfs_perag_intent_rele(pag, __return_address); - xfs_defer_drain_rele(&pag->pag_intents_drain); + trace_xfs_group_intent_rele(xg, __return_address); + xfs_defer_drain_rele(&xg->xg_intents_drain); + xfs_group_put(xg); } /* @@ -150,17 +134,19 @@ xfs_perag_intent_rele( * Callers must not hold any AG header buffers. */ int -xfs_perag_intent_drain( - struct xfs_perag *pag) +xfs_group_intent_drain( + struct xfs_group *xg) { - trace_xfs_perag_wait_intents(pag, __return_address); - return xfs_defer_drain_wait(&pag->pag_intents_drain); + trace_xfs_group_wait_intents(xg, __return_address); + return xfs_defer_drain_wait(&xg->xg_intents_drain); } -/* Has anyone declared an intent to update this AG? */ +/* + * Has anyone declared an intent to update this group?
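+ * This is a lockless sample of the intent count, meant for callers that
+ * want to poll before committing to a full xfs_group_intent_drain() wait.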
+ */ bool -xfs_perag_intent_busy( - struct xfs_perag *pag) +xfs_group_intent_busy( + struct xfs_group *xg) { - return xfs_defer_drain_busy(&pag->pag_intents_drain); + return xfs_defer_drain_busy(&xg->xg_intents_drain); } diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h index 775164f54ea6..4d446dbf65e5 100644 --- a/fs/xfs/xfs_drain.h +++ b/fs/xfs/xfs_drain.h @@ -6,6 +6,7 @@ #ifndef XFS_DRAIN_H_ #define XFS_DRAIN_H_ +struct xfs_group; struct xfs_perag; #ifdef CONFIG_XFS_DRAIN_INTENTS @@ -25,8 +26,8 @@ struct xfs_defer_drain { void xfs_defer_drain_init(struct xfs_defer_drain *dr); void xfs_defer_drain_free(struct xfs_defer_drain *dr); -void xfs_drain_wait_disable(void); -void xfs_drain_wait_enable(void); +void xfs_defer_drain_wait_disable(void); +void xfs_defer_drain_wait_enable(void); /* * Deferred Work Intent Drains @@ -60,28 +61,26 @@ void xfs_drain_wait_enable(void); * All functions that create work items must increment the intent counter as * soon as the item is added to the transaction and cannot drop the counter * until the item is finished or cancelled. + * + * The same principles apply to realtime groups because the rt metadata inode + * ILOCKs are not held across transaction rolls. */ -struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp, - xfs_fsblock_t fsbno); -void xfs_perag_intent_put(struct xfs_perag *pag); +struct xfs_group *xfs_group_intent_get(struct xfs_mount *mp, + xfs_fsblock_t fsbno, enum xfs_group_type type); +void xfs_group_intent_put(struct xfs_group *rtg); -void xfs_perag_intent_hold(struct xfs_perag *pag); -void xfs_perag_intent_rele(struct xfs_perag *pag); +int xfs_group_intent_drain(struct xfs_group *xg); +bool xfs_group_intent_busy(struct xfs_group *xg); -int xfs_perag_intent_drain(struct xfs_perag *pag); -bool xfs_perag_intent_busy(struct xfs_perag *pag); #else struct xfs_defer_drain { /* empty */ }; #define xfs_defer_drain_free(dr) ((void)0) #define xfs_defer_drain_init(dr) ((void)0) -#define xfs_perag_intent_get(mp, fsbno) \ - xfs_perag_get((mp), XFS_FSB_TO_AGNO(mp, fsbno)) -#define xfs_perag_intent_put(pag) xfs_perag_put(pag) - -static inline void xfs_perag_intent_hold(struct xfs_perag *pag) { } -static inline void xfs_perag_intent_rele(struct xfs_perag *pag) { } +#define xfs_group_intent_get(_mp, _fsbno, _type) \ + xfs_group_get_by_fsb((_mp), (_fsbno), (_type)) +#define xfs_group_intent_put(xg) xfs_group_put(xg) #endif /* CONFIG_XFS_DRAIN_INTENTS */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 78cdc5064a8c..dbd87e137694 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -63,6 +63,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_WB_DELAY_MS, XFS_RANDOM_WRITE_DELAY_MS, XFS_RANDOM_EXCHMAPS_FINISH_ONE, + XFS_RANDOM_METAFILE_RESV_CRITICAL, }; struct xfs_errortag_attr { @@ -181,6 +182,7 @@ XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE); +XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -227,6 +229,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), XFS_ERRORTAG_ATTR_LIST(write_delay_ms), XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one), + XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); diff --git 
a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c index 75cb53f090d1..0b41bdfecdfb 100644 --- a/fs/xfs/xfs_exchrange.c +++ b/fs/xfs/xfs_exchrange.c @@ -119,6 +119,9 @@ xfs_exchrange_reserve_quota( int ip1_error = 0; int error; + ASSERT(!xfs_is_metadir_inode(req->ip1)); + ASSERT(!xfs_is_metadir_inode(req->ip2)); + /* * Don't bother with a quota reservation if we're not enforcing them * or the two inodes have the same dquots. @@ -217,7 +220,7 @@ xfs_exchrange_mappings( * length in @fxr are safe to round up. */ if (xfs_inode_has_bigrtalloc(ip2)) - req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount); + req.blockcount = xfs_blen_roundup_rtx(mp, req.blockcount); error = xfs_exchrange_estimate(&req); if (error) @@ -326,22 +329,6 @@ out_trans_cancel: * successfully but before locks are dropped. */ -/* Verify that we have security clearance to perform this operation. */ -static int -xfs_exchange_range_verify_area( - struct xfs_exchrange *fxr) -{ - int ret; - - ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length, - true); - if (ret) - return ret; - - return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length, - true); -} - /* * Performs necessary checks before doing a range exchange, having stabilized * mutable inode attributes via i_rwsem. @@ -352,11 +339,13 @@ xfs_exchange_range_checks( unsigned int alloc_unit) { struct inode *inode1 = file_inode(fxr->file1); + loff_t size1 = i_size_read(inode1); struct inode *inode2 = file_inode(fxr->file2); + loff_t size2 = i_size_read(inode2); uint64_t allocmask = alloc_unit - 1; int64_t test_len; uint64_t blen; - loff_t size1, size2, tmp; + loff_t tmp; int error; /* Don't touch certain kinds of inodes */ @@ -365,24 +354,25 @@ xfs_exchange_range_checks( if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2)) return -ETXTBSY; - size1 = i_size_read(inode1); - size2 = i_size_read(inode2); - /* Ranges cannot start after EOF. */ if (fxr->file1_offset > size1 || fxr->file2_offset > size2) return -EINVAL; - /* - * If the caller said to exchange to EOF, we set the length of the - * request large enough to cover everything to the end of both files. - */ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { + /* + * If the caller said to exchange to EOF, we set the length of + * the request large enough to cover everything to the end of + * both files. + */ fxr->length = max_t(int64_t, size1 - fxr->file1_offset, size2 - fxr->file2_offset); - - error = xfs_exchange_range_verify_area(fxr); - if (error) - return error; + } else { + /* + * Otherwise we require both ranges to end within EOF. + */ + if (fxr->file1_offset + fxr->length > size1 || + fxr->file2_offset + fxr->length > size2) + return -EINVAL; } /* @@ -399,15 +389,6 @@ xfs_exchange_range_checks( return -EINVAL; /* - * We require both ranges to end within EOF, unless we're exchanging - * to EOF. - */ - if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) && - (fxr->file1_offset + fxr->length > size1 || - fxr->file2_offset + fxr->length > size2)) - return -EINVAL; - - /* * Make sure we don't hit any file size limits. If we hit any size * limits such that test_length was adjusted, we abort the whole * operation. @@ -744,6 +725,7 @@ xfs_exchange_range( { struct inode *inode1 = file_inode(fxr->file1); struct inode *inode2 = file_inode(fxr->file2); + loff_t check_len = fxr->length; int ret; BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS & @@ -776,14 +758,18 @@ xfs_exchange_range( return -EBADF; /* - * If we're not exchanging to EOF, we can check the areas before - * stabilizing both files' i_size. 
+ * If we're exchanging to EOF we can't calculate the length until taking + * the iolock. Pass a 0 length to remap_verify_area similar to the + * FICLONE and FICLONERANGE ioctls that support cloning to EOF as well. */ - if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) { - ret = xfs_exchange_range_verify_area(fxr); - if (ret) - return ret; - } + if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) + check_len = 0; + ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true); + if (ret) + return ret; + ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true); + if (ret) + return ret; /* Update cmtime if the fd/inode don't forbid it. */ if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)) @@ -813,8 +799,6 @@ xfs_ioc_exchange_range( .file2 = file, }; struct xfs_exchange_range args; - struct fd file1; - int error; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; @@ -828,14 +812,12 @@ xfs_ioc_exchange_range( fxr.length = args.length; fxr.flags = args.flags; - file1 = fdget(args.file1_fd); - if (!fd_file(file1)) + CLASS(fd, file1)(args.file1_fd); + if (fd_empty(file1)) return -EBADF; fxr.file1 = fd_file(file1); - error = xfs_exchange_range(&fxr); - fdput(file1); - return error; + return xfs_exchange_range(&fxr); } /* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */ @@ -858,7 +840,7 @@ xfs_ioc_start_commit( struct xfs_commit_range __user *argp) { struct xfs_commit_range args = { }; - struct timespec64 ts; + struct kstat kstat = { }; struct xfs_commit_range_fresh *kern_f; struct xfs_commit_range_fresh __user *user_f; struct inode *inode2 = file_inode(file); @@ -875,12 +857,12 @@ xfs_ioc_start_commit( memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); xfs_ilock(ip2, lockflags); - ts = inode_get_ctime(inode2); - kern_f->file2_ctime = ts.tv_sec; - kern_f->file2_ctime_nsec = ts.tv_nsec; - ts = inode_get_mtime(inode2); - kern_f->file2_mtime = ts.tv_sec; - kern_f->file2_mtime_nsec = ts.tv_nsec; + /* Force writing of a distinct ctime if any writes happen. 
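+	 * Sampling the timestamps through fill_mg_cmtime() marks the ctime
+	 * as queried, so with multigrain timestamps any modification made
+	 * between the start-commit and commit-range ioctls must move the
+	 * ctime and will trip the freshness check.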
*/ + fill_mg_cmtime(&kstat, STATX_CTIME | STATX_MTIME, inode2); + kern_f->file2_ctime = kstat.ctime.tv_sec; + kern_f->file2_ctime_nsec = kstat.ctime.tv_nsec; + kern_f->file2_mtime = kstat.mtime.tv_sec; + kern_f->file2_mtime_nsec = kstat.mtime.tv_nsec; kern_f->file2_ino = ip2->i_ino; kern_f->file2_gen = inode2->i_generation; kern_f->magic = XCR_FRESH_MAGIC; @@ -909,8 +891,6 @@ xfs_ioc_commit_range( struct xfs_commit_range_fresh *kern_f; struct xfs_inode *ip2 = XFS_I(file_inode(file)); struct xfs_mount *mp = ip2->i_mount; - struct fd file1; - int error; kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; @@ -934,12 +914,10 @@ xfs_ioc_commit_range( fxr.file2_ctime.tv_sec = kern_f->file2_ctime; fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec; - file1 = fdget(args.file1_fd); + CLASS(fd, file1)(args.file1_fd); if (fd_empty(file1)) return -EBADF; fxr.file1 = fd_file(file1); - error = xfs_exchange_range(&fxr); - fdput(file1); - return error; + return xfs_exchange_range(&fxr); } diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index a73e7c73b664..da3161572735 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -18,15 +18,24 @@ #include "xfs_trans.h" #include "xfs_log.h" #include "xfs_ag.h" +#include "xfs_rtgroup.h" + +struct xfs_extent_busy_tree { + spinlock_t eb_lock; + struct rb_root eb_tree; + unsigned int eb_gen; + wait_queue_head_t eb_wait; +}; static void xfs_extent_busy_insert_list( - struct xfs_perag *pag, + struct xfs_group *xg, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags, struct list_head *busy_list) { + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; struct xfs_extent_busy *new; struct xfs_extent_busy *busyp; struct rb_node **rbp; @@ -34,17 +43,17 @@ xfs_extent_busy_insert_list( new = kzalloc(sizeof(struct xfs_extent_busy), GFP_KERNEL | __GFP_NOFAIL); - new->agno = pag->pag_agno; + new->group = xfs_group_hold(xg); new->bno = bno; new->length = len; INIT_LIST_HEAD(&new->list); new->flags = flags; /* trace before insert to be able to see failed inserts */ - trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len); + trace_xfs_extent_busy(xg, bno, len); - spin_lock(&pag->pagb_lock); - rbp = &pag->pagb_tree.rb_node; + spin_lock(&eb->eb_lock); + rbp = &eb->eb_tree.rb_node; while (*rbp) { parent = *rbp; busyp = rb_entry(parent, struct xfs_extent_busy, rb_node); @@ -61,32 +70,32 @@ xfs_extent_busy_insert_list( } rb_link_node(&new->rb_node, parent, rbp); - rb_insert_color(&new->rb_node, &pag->pagb_tree); + rb_insert_color(&new->rb_node, &eb->eb_tree); /* always process discard lists in fifo order */ list_add_tail(&new->list, busy_list); - spin_unlock(&pag->pagb_lock); + spin_unlock(&eb->eb_lock); } void xfs_extent_busy_insert( struct xfs_trans *tp, - struct xfs_perag *pag, + struct xfs_group *xg, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags) { - xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy); + xfs_extent_busy_insert_list(xg, bno, len, flags, &tp->t_busy); } void xfs_extent_busy_insert_discard( - struct xfs_perag *pag, + struct xfs_group *xg, xfs_agblock_t bno, xfs_extlen_t len, struct list_head *busy_list) { - xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED, + xfs_extent_busy_insert_list(xg, bno, len, XFS_EXTENT_BUSY_DISCARDED, busy_list); } @@ -101,18 +110,18 @@ xfs_extent_busy_insert_discard( */ int xfs_extent_busy_search( - struct xfs_mount *mp, - struct xfs_perag *pag, + struct xfs_group *xg, xfs_agblock_t bno, xfs_extlen_t len) { + struct xfs_extent_busy_tree *eb = 
xg->xg_busy_extents; struct rb_node *rbp; struct xfs_extent_busy *busyp; int match = 0; /* find closest start bno overlap */ - spin_lock(&pag->pagb_lock); - rbp = pag->pagb_tree.rb_node; + spin_lock(&eb->eb_lock); + rbp = eb->eb_tree.rb_node; while (rbp) { busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); if (bno < busyp->bno) { @@ -131,7 +140,7 @@ xfs_extent_busy_search( break; } } - spin_unlock(&pag->pagb_lock); + spin_unlock(&eb->eb_lock); return match; } @@ -148,14 +157,15 @@ xfs_extent_busy_search( */ STATIC bool xfs_extent_busy_update_extent( - struct xfs_mount *mp, - struct xfs_perag *pag, + struct xfs_group *xg, struct xfs_extent_busy *busyp, xfs_agblock_t fbno, xfs_extlen_t flen, - bool userdata) __releases(&pag->pagb_lock) - __acquires(&pag->pagb_lock) + bool userdata) + __releases(&eb->eb_lock) + __acquires(&eb->eb_lock) { + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; xfs_agblock_t fend = fbno + flen; xfs_agblock_t bbno = busyp->bno; xfs_agblock_t bend = bbno + busyp->length; @@ -166,9 +176,9 @@ xfs_extent_busy_update_extent( * and retry. */ if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) { - spin_unlock(&pag->pagb_lock); + spin_unlock(&eb->eb_lock); delay(1); - spin_lock(&pag->pagb_lock); + spin_lock(&eb->eb_lock); return false; } @@ -241,7 +251,7 @@ xfs_extent_busy_update_extent( * tree root, because erasing the node can rearrange the * tree topology. */ - rb_erase(&busyp->rb_node, &pag->pagb_tree); + rb_erase(&busyp->rb_node, &eb->eb_tree); busyp->length = 0; return false; } else if (fend < bend) { @@ -280,35 +290,34 @@ xfs_extent_busy_update_extent( ASSERT(0); } - trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen); + trace_xfs_extent_busy_reuse(xg, fbno, flen); return true; out_force_log: - spin_unlock(&pag->pagb_lock); - xfs_log_force(mp, XFS_LOG_SYNC); - trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen); - spin_lock(&pag->pagb_lock); + spin_unlock(&eb->eb_lock); + xfs_log_force(xg->xg_mount, XFS_LOG_SYNC); + trace_xfs_extent_busy_force(xg, fbno, flen); + spin_lock(&eb->eb_lock); return false; } - /* * For a given extent [fbno, flen], make sure we can reuse it safely. */ void xfs_extent_busy_reuse( - struct xfs_mount *mp, - struct xfs_perag *pag, + struct xfs_group *xg, xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata) { + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; struct rb_node *rbp; ASSERT(flen > 0); - spin_lock(&pag->pagb_lock); + spin_lock(&eb->eb_lock); restart: - rbp = pag->pagb_tree.rb_node; + rbp = eb->eb_tree.rb_node; while (rbp) { struct xfs_extent_busy *busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); @@ -323,11 +332,11 @@ restart: continue; } - if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen, + if (!xfs_extent_busy_update_extent(xg, busyp, fbno, flen, userdata)) goto restart; } - spin_unlock(&pag->pagb_lock); + spin_unlock(&eb->eb_lock); } /* @@ -336,7 +345,7 @@ restart: * args->minlen no suitable extent could be found, and the higher level * code needs to force out the log and retry the allocation. * - * Return the current busy generation for the AG if the extent is busy. This + * Return the current busy generation for the group if the extent is busy. This * value can be used to wait for at least one of the currently busy extents * to be cleared. Note that the busy list is not guaranteed to be empty after * the gen is woken. 
The state of a specific extent must always be confirmed @@ -344,11 +353,14 @@ restart: */ bool xfs_extent_busy_trim( - struct xfs_alloc_arg *args, + struct xfs_group *xg, + xfs_extlen_t minlen, + xfs_extlen_t maxlen, xfs_agblock_t *bno, xfs_extlen_t *len, unsigned *busy_gen) { + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; xfs_agblock_t fbno; xfs_extlen_t flen; struct rb_node *rbp; @@ -356,11 +368,11 @@ xfs_extent_busy_trim( ASSERT(*len > 0); - spin_lock(&args->pag->pagb_lock); + spin_lock(&eb->eb_lock); fbno = *bno; flen = *len; - rbp = args->pag->pagb_tree.rb_node; - while (rbp && flen >= args->minlen) { + rbp = eb->eb_tree.rb_node; + while (rbp && flen >= minlen) { struct xfs_extent_busy *busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); xfs_agblock_t fend = fbno + flen; @@ -481,13 +493,13 @@ xfs_extent_busy_trim( * good chance subsequent allocations will be * contiguous. */ - if (bbno - fbno >= args->maxlen) { + if (bbno - fbno >= maxlen) { /* left candidate fits perfect */ fend = bbno; - } else if (fend - bend >= args->maxlen * 4) { + } else if (fend - bend >= maxlen * 4) { /* right candidate has enough free space */ fbno = bend; - } else if (bbno - fbno >= args->minlen) { + } else if (bbno - fbno >= minlen) { /* left candidate fits minimum requirement */ fend = bbno; } else { @@ -500,14 +512,13 @@ xfs_extent_busy_trim( out: if (fbno != *bno || flen != *len) { - trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len, - fbno, flen); + trace_xfs_extent_busy_trim(xg, *bno, *len, fbno, flen); *bno = fbno; *len = flen; - *busy_gen = args->pag->pagb_gen; + *busy_gen = eb->eb_gen; ret = true; } - spin_unlock(&args->pag->pagb_lock); + spin_unlock(&eb->eb_lock); return ret; fail: /* @@ -520,22 +531,24 @@ fail: static bool xfs_extent_busy_clear_one( - struct xfs_perag *pag, struct xfs_extent_busy *busyp, bool do_discard) { + struct xfs_extent_busy_tree *eb = busyp->group->xg_busy_extents; + if (busyp->length) { if (do_discard && !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) { busyp->flags = XFS_EXTENT_BUSY_DISCARDED; return false; } - trace_xfs_extent_busy_clear(pag->pag_mount, busyp->agno, - busyp->bno, busyp->length); - rb_erase(&busyp->rb_node, &pag->pagb_tree); + trace_xfs_extent_busy_clear(busyp->group, busyp->bno, + busyp->length); + rb_erase(&busyp->rb_node, &eb->eb_tree); } list_del_init(&busyp->list); + xfs_group_put(busyp->group); kfree(busyp); return true; } @@ -547,7 +560,6 @@ xfs_extent_busy_clear_one( */ void xfs_extent_busy_clear( - struct xfs_mount *mp, struct list_head *list, bool do_discard) { @@ -558,30 +570,30 @@ xfs_extent_busy_clear( return; do { + struct xfs_group *xg = xfs_group_hold(busyp->group); + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; bool wakeup = false; - struct xfs_perag *pag; - pag = xfs_perag_get(mp, busyp->agno); - spin_lock(&pag->pagb_lock); + spin_lock(&eb->eb_lock); do { next = list_next_entry(busyp, list); - if (xfs_extent_busy_clear_one(pag, busyp, do_discard)) + if (xfs_extent_busy_clear_one(busyp, do_discard)) wakeup = true; busyp = next; } while (!list_entry_is_head(busyp, list, list) && - busyp->agno == pag->pag_agno); + busyp->group == xg); if (wakeup) { - pag->pagb_gen++; - wake_up_all(&pag->pagb_wait); + eb->eb_gen++; + wake_up_all(&eb->eb_wait); } - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); + spin_unlock(&eb->eb_lock); + xfs_group_put(xg); } while (!list_entry_is_head(busyp, list, list)); } /* - * Flush out all busy extents for this AG. + * Flush out all busy extents for this group. 
* * If the current transaction is holding busy extents, the caller may not want * to wait for committed busy extents to resolve. If we are being told just to @@ -597,10 +609,11 @@ xfs_extent_busy_clear( int xfs_extent_busy_flush( struct xfs_trans *tp, - struct xfs_perag *pag, + struct xfs_group *xg, unsigned busy_gen, uint32_t alloc_flags) { + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; DEFINE_WAIT (wait); int error; @@ -613,7 +626,7 @@ xfs_extent_busy_flush( if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH) return 0; - if (busy_gen != READ_ONCE(pag->pagb_gen)) + if (busy_gen != READ_ONCE(eb->eb_gen)) return 0; if (alloc_flags & XFS_ALLOC_FLAG_FREEING) @@ -622,37 +635,49 @@ xfs_extent_busy_flush( /* Wait for committed busy extents to resolve. */ do { - prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); - if (busy_gen != READ_ONCE(pag->pagb_gen)) + prepare_to_wait(&eb->eb_wait, &wait, TASK_KILLABLE); + if (busy_gen != READ_ONCE(eb->eb_gen)) break; schedule(); } while (1); - finish_wait(&pag->pagb_wait, &wait); + finish_wait(&eb->eb_wait, &wait); return 0; } +static void +xfs_extent_busy_wait_group( + struct xfs_group *xg) +{ + DEFINE_WAIT (wait); + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; + + do { + prepare_to_wait(&eb->eb_wait, &wait, TASK_KILLABLE); + if (RB_EMPTY_ROOT(&eb->eb_tree)) + break; + schedule(); + } while (1); + finish_wait(&eb->eb_wait, &wait); +} + void xfs_extent_busy_wait_all( struct xfs_mount *mp) { - struct xfs_perag *pag; - DEFINE_WAIT (wait); - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; - for_each_perag(mp, agno, pag) { - do { - prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); - if (RB_EMPTY_ROOT(&pag->pagb_tree)) - break; - schedule(); - } while (1); - finish_wait(&pag->pagb_wait, &wait); - } + while ((pag = xfs_perag_next(mp, pag))) + xfs_extent_busy_wait_group(pag_group(pag)); + + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) + while ((rtg = xfs_rtgroup_next(mp, rtg))) + xfs_extent_busy_wait_group(rtg_group(rtg)); } /* - * Callback for list_sort to sort busy extents by the AG they reside in. + * Callback for list_sort to sort busy extents by the group they reside in. */ int xfs_extent_busy_ag_cmp( @@ -666,21 +691,38 @@ xfs_extent_busy_ag_cmp( container_of(l2, struct xfs_extent_busy, list); s32 diff; - diff = b1->agno - b2->agno; + diff = b1->group->xg_gno - b2->group->xg_gno; if (!diff) diff = b1->bno - b2->bno; return diff; } -/* Are there any busy extents in this AG? */ +/* Are there any busy extents in this group? 
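+ * Also samples the current busy generation under eb_lock so that the
+ * caller can later wait for it to change.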
*/ bool xfs_extent_busy_list_empty( - struct xfs_perag *pag) + struct xfs_group *xg, + unsigned *busy_gen) { + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; bool res; - spin_lock(&pag->pagb_lock); - res = RB_EMPTY_ROOT(&pag->pagb_tree); - spin_unlock(&pag->pagb_lock); + spin_lock(&eb->eb_lock); + res = RB_EMPTY_ROOT(&eb->eb_tree); + *busy_gen = READ_ONCE(eb->eb_gen); + spin_unlock(&eb->eb_lock); return res; } + +struct xfs_extent_busy_tree * +xfs_extent_busy_alloc(void) +{ + struct xfs_extent_busy_tree *eb; + + eb = kzalloc(sizeof(*eb), GFP_KERNEL); + if (!eb) + return NULL; + spin_lock_init(&eb->eb_lock); + init_waitqueue_head(&eb->eb_wait); + eb->eb_tree = RB_ROOT; + return eb; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 470032de3139..f069b04e8ea1 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -8,19 +8,18 @@ #ifndef __XFS_EXTENT_BUSY_H__ #define __XFS_EXTENT_BUSY_H__ +struct xfs_group; struct xfs_mount; -struct xfs_perag; struct xfs_trans; -struct xfs_alloc_arg; /* - * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that - * have been freed but whose transactions aren't committed to disk yet. + * Busy block/extent entry. Indexed by a rbtree in the group to mark blocks + * that have been freed but whose transactions aren't committed to disk yet. */ struct xfs_extent_busy { - struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct rb_node rb_node; /* group by-bno indexed search tree */ struct list_head list; /* transaction busy extent list */ - xfs_agnumber_t agno; + struct xfs_group *group; xfs_agblock_t bno; xfs_extlen_t length; unsigned int flags; @@ -33,7 +32,6 @@ struct xfs_extent_busy { * to discard completion. */ struct xfs_busy_extents { - struct xfs_mount *mount; struct list_head extent_list; struct work_struct endio_work; @@ -45,46 +43,29 @@ struct xfs_busy_extents { void *owner; }; -void -xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag, - xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); - -void -xfs_extent_busy_insert_discard(struct xfs_perag *pag, xfs_agblock_t bno, - xfs_extlen_t len, struct list_head *busy_list); - -void -xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, - bool do_discard); - -int -xfs_extent_busy_search(struct xfs_mount *mp, struct xfs_perag *pag, - xfs_agblock_t bno, xfs_extlen_t len); - -void -xfs_extent_busy_reuse(struct xfs_mount *mp, struct xfs_perag *pag, - xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); - -bool -xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno, - xfs_extlen_t *len, unsigned *busy_gen); - -int -xfs_extent_busy_flush(struct xfs_trans *tp, struct xfs_perag *pag, +void xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_group *xg, + xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); +void xfs_extent_busy_insert_discard(struct xfs_group *xg, xfs_agblock_t bno, + xfs_extlen_t len, struct list_head *busy_list); +void xfs_extent_busy_clear(struct list_head *list, bool do_discard); +int xfs_extent_busy_search(struct xfs_group *xg, xfs_agblock_t bno, + xfs_extlen_t len); +void xfs_extent_busy_reuse(struct xfs_group *xg, xfs_agblock_t fbno, + xfs_extlen_t flen, bool userdata); +bool xfs_extent_busy_trim(struct xfs_group *xg, xfs_extlen_t minlen, + xfs_extlen_t maxlen, xfs_agblock_t *bno, xfs_extlen_t *len, + unsigned *busy_gen); +int xfs_extent_busy_flush(struct xfs_trans *tp, struct xfs_group *xg, unsigned busy_gen, uint32_t alloc_flags); +void 
xfs_extent_busy_wait_all(struct xfs_mount *mp); +bool xfs_extent_busy_list_empty(struct xfs_group *xg, unsigned int *busy_gen); +struct xfs_extent_busy_tree *xfs_extent_busy_alloc(void); -void -xfs_extent_busy_wait_all(struct xfs_mount *mp); - -int -xfs_extent_busy_ag_cmp(void *priv, const struct list_head *a, - const struct list_head *b); - +int xfs_extent_busy_ag_cmp(void *priv, const struct list_head *a, + const struct list_head *b); static inline void xfs_extent_busy_sort(struct list_head *list) { list_sort(NULL, list, xfs_extent_busy_ag_cmp); } -bool xfs_extent_busy_list_empty(struct xfs_perag *pag); - #endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index abffc74a924f..d574f5f639fa 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -25,6 +25,11 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_efi_cache; struct kmem_cache *xfs_efd_cache; @@ -78,6 +83,11 @@ xfs_efi_item_size( *nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents); } +unsigned int xfs_efi_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_efi_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given efi log item. We use only 1 iovec, and we point that @@ -95,16 +105,15 @@ xfs_efi_item_format( ASSERT(atomic_read(&efip->efi_next_extent) == efip->efi_format.efi_nextents); + ASSERT(lip->li_type == XFS_LI_EFI || lip->li_type == XFS_LI_EFI_RT); - efip->efi_format.efi_type = XFS_LI_EFI; + efip->efi_format.efi_type = lip->li_type; efip->efi_format.efi_size = 1; - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, - &efip->efi_format, + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, &efip->efi_format, xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents)); } - /* * The unpin operation is the last place an EFI is manipulated in the log. It is * either inserted in the AIL or aborted in the event of a log I/O error. In @@ -140,12 +149,14 @@ xfs_efi_item_release( STATIC struct xfs_efi_log_item * xfs_efi_init( struct xfs_mount *mp, + unsigned short item_type, uint nextents) - { struct xfs_efi_log_item *efip; + ASSERT(item_type == XFS_LI_EFI || item_type == XFS_LI_EFI_RT); ASSERT(nextents > 0); + if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { efip = kzalloc(xfs_efi_log_item_sizeof(nextents), GFP_KERNEL | __GFP_NOFAIL); @@ -154,7 +165,7 @@ xfs_efi_init( GFP_KERNEL | __GFP_NOFAIL); } - xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); + xfs_log_item_init(mp, &efip->efi_item, item_type, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (uintptr_t)(void *)efip; atomic_set(&efip->efi_next_extent, 0); @@ -248,6 +259,11 @@ xfs_efd_item_size( *nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents); } +unsigned int xfs_efd_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_efd_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given efd log item. 
We use only 1 iovec, and we point that @@ -264,12 +280,12 @@ xfs_efd_item_format( struct xfs_log_iovec *vecp = NULL; ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); + ASSERT(lip->li_type == XFS_LI_EFD || lip->li_type == XFS_LI_EFD_RT); - efdp->efd_format.efd_type = XFS_LI_EFD; + efdp->efd_format.efd_type = lip->li_type; efdp->efd_format.efd_size = 1; - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, - &efdp->efd_format, + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, &efdp->efd_format, xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents)); } @@ -308,6 +324,14 @@ static inline struct xfs_extent_free_item *xefi_entry(const struct list_head *e) return list_entry(e, struct xfs_extent_free_item, xefi_list); } +static inline bool +xfs_efi_item_isrt(const struct xfs_log_item *lip) +{ + ASSERT(lip->li_type == XFS_LI_EFI || lip->li_type == XFS_LI_EFI_RT); + + return lip->li_type == XFS_LI_EFI_RT; +} + /* * Fill the EFD with all extents from the EFI when we need to roll the * transaction and continue with a new EFI. @@ -362,7 +386,7 @@ xfs_extent_free_diff_items( struct xfs_extent_free_item *ra = xefi_entry(a); struct xfs_extent_free_item *rb = xefi_entry(b); - return ra->xefi_pag->pag_agno - rb->xefi_pag->pag_agno; + return ra->xefi_group->xg_gno - rb->xefi_group->xg_gno; } /* Log a free extent to the intent item. */ @@ -388,18 +412,20 @@ xfs_extent_free_log_item( } static struct xfs_log_item * -xfs_extent_free_create_intent( +__xfs_extent_free_create_intent( struct xfs_trans *tp, struct list_head *items, unsigned int count, - bool sort) + bool sort, + unsigned short item_type) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); + struct xfs_efi_log_item *efip; struct xfs_extent_free_item *xefi; ASSERT(count > 0); + efip = xfs_efi_init(mp, item_type, count); if (sort) list_sort(mp, items, xfs_extent_free_diff_items); list_for_each_entry(xefi, items, xefi_list) @@ -407,6 +433,23 @@ xfs_extent_free_create_intent( return &efip->efi_item; } +static struct xfs_log_item * +xfs_extent_free_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_extent_free_create_intent(tp, items, count, sort, + XFS_LI_EFI); +} + +static inline unsigned short +xfs_efd_type_from_efi(const struct xfs_efi_log_item *efip) +{ + return xfs_efi_item_isrt(&efip->efi_item) ? XFS_LI_EFD_RT : XFS_LI_EFD; +} + /* Get an EFD so we can process all the free extents. */ static struct xfs_log_item * xfs_extent_free_create_done( @@ -427,8 +470,8 @@ xfs_extent_free_create_done( GFP_KERNEL | __GFP_NOFAIL); } - xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, - &xfs_efd_item_ops); + xfs_log_item_init(tp->t_mountp, &efdp->efd_item, + xfs_efd_type_from_efi(efip), &xfs_efd_item_ops); efdp->efd_efip = efip; efdp->efd_format.efd_nextents = count; efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; @@ -436,6 +479,17 @@ xfs_extent_free_create_done( return &efdp->efd_item; } +static inline const struct xfs_defer_op_type * +xefi_ops( + struct xfs_extent_free_item *xefi) +{ + if (xfs_efi_is_realtime(xefi)) + return &xfs_rtextent_free_defer_type; + if (xefi->xefi_agresv == XFS_AG_RESV_AGFL) + return &xfs_agfl_free_defer_type; + return &xfs_extent_free_defer_type; +} + /* Add this deferred EFI to the transaction. 
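 * The defer op type is chosen by xefi_ops() above: realtime extents go
 * through xfs_rtextent_free_defer_type, AGFL blocks through
 * xfs_agfl_free_defer_type, and everything else through
 * xfs_extent_free_defer_type.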
*/ void xfs_extent_free_defer_add( @@ -445,15 +499,11 @@ xfs_extent_free_defer_add( { struct xfs_mount *mp = tp->t_mountp; - trace_xfs_extent_free_defer(mp, xefi); + xefi->xefi_group = xfs_group_intent_get(mp, xefi->xefi_startblock, + xfs_efi_is_realtime(xefi) ? XG_TYPE_RTG : XG_TYPE_AG); - xefi->xefi_pag = xfs_perag_intent_get(mp, xefi->xefi_startblock); - if (xefi->xefi_agresv == XFS_AG_RESV_AGFL) - *dfpp = xfs_defer_add(tp, &xefi->xefi_list, - &xfs_agfl_free_defer_type); - else - *dfpp = xfs_defer_add(tp, &xefi->xefi_list, - &xfs_extent_free_defer_type); + trace_xfs_extent_free_defer(mp, xefi); + *dfpp = xfs_defer_add(tp, &xefi->xefi_list, xefi_ops(xefi)); } /* Cancel a free extent. */ @@ -463,7 +513,7 @@ xfs_extent_free_cancel_item( { struct xfs_extent_free_item *xefi = xefi_entry(item); - xfs_perag_intent_put(xefi->xefi_pag); + xfs_group_intent_put(xefi->xefi_group); kmem_cache_free(xfs_extfree_item_cache, xefi); } @@ -499,7 +549,7 @@ xfs_extent_free_finish_item( * in this EFI to the EFD so this works correctly. */ if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) - error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, + error = __xfs_free_extent(tp, to_perag(xefi->xefi_group), agbno, xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); if (error == -EAGAIN) { @@ -545,10 +595,10 @@ xfs_agfl_free_finish_item( trace_xfs_agfl_free_deferred(mp, xefi); - error = xfs_alloc_read_agf(xefi->xefi_pag, tp, 0, &agbp); + error = xfs_alloc_read_agf(to_perag(xefi->xefi_group), tp, 0, &agbp); if (!error) - error = xfs_free_ag_extent(tp, agbp, xefi->xefi_pag->pag_agno, - agbno, 1, &oinfo, XFS_AG_RESV_AGFL); + error = xfs_free_ag_extent(tp, agbp, agbno, 1, &oinfo, + XFS_AG_RESV_AGFL); xfs_efd_add_extent(efdp, xefi); xfs_extent_free_cancel_item(&xefi->xefi_list); @@ -559,8 +609,12 @@ xfs_agfl_free_finish_item( static inline bool xfs_efi_validate_ext( struct xfs_mount *mp, + bool isrt, struct xfs_extent *extp) { + if (isrt) + return xfs_verify_rtbext(mp, extp->ext_start, extp->ext_len); + return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len); } @@ -568,6 +622,7 @@ static inline void xfs_efi_recover_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp, + bool isrt, struct xfs_extent *extp) { struct xfs_extent_free_item *xefi; @@ -578,7 +633,10 @@ xfs_efi_recover_work( xefi->xefi_blockcount = extp->ext_len; xefi->xefi_agresv = XFS_AG_RESV_NONE; xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN; - xefi->xefi_pag = xfs_perag_intent_get(mp, extp->ext_start); + xefi->xefi_group = xfs_group_intent_get(mp, extp->ext_start, + isrt ? XG_TYPE_RTG : XG_TYPE_AG); + if (isrt) + xefi->xefi_flags |= XFS_EFI_REALTIME; xfs_defer_add_item(dfp, &xefi->xefi_list); } @@ -599,14 +657,15 @@ xfs_extent_free_recover_work( struct xfs_trans *tp; int i; int error = 0; + bool isrt = xfs_efi_item_isrt(lip); /* - * First check the validity of the extents described by the - * EFI. If any are bad, then assume that all are bad and - * just toss the EFI. + * First check the validity of the extents described by the EFI. If + * any are bad, then assume that all are bad and just toss the EFI. + * Mixing RT and non-RT extents in the same EFI item is not allowed. 
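+	 * The log item type (XFS_LI_EFI vs. XFS_LI_EFI_RT) tells us which
+	 * device the extents live on, so a single isrt flag, derived from
+	 * the item type, covers the whole item.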
*/ for (i = 0; i < efip->efi_format.efi_nextents; i++) { - if (!xfs_efi_validate_ext(mp, + if (!xfs_efi_validate_ext(mp, isrt, &efip->efi_format.efi_extents[i])) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &efip->efi_format, @@ -614,7 +673,8 @@ xfs_extent_free_recover_work( return -EFSCORRUPTED; } - xfs_efi_recover_work(mp, dfp, &efip->efi_format.efi_extents[i]); + xfs_efi_recover_work(mp, dfp, isrt, + &efip->efi_format.efi_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -652,10 +712,12 @@ xfs_extent_free_relog_intent( count = EFI_ITEM(intent)->efi_format.efi_nextents; extp = EFI_ITEM(intent)->efi_format.efi_extents; + ASSERT(intent->li_type == XFS_LI_EFI || intent->li_type == XFS_LI_EFI_RT); + efdp->efd_next_extent = count; memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); - efip = xfs_efi_init(tp->t_mountp, count); + efip = xfs_efi_init(tp->t_mountp, intent->li_type, count); memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); atomic_set(&efip->efi_next_extent, count); @@ -687,6 +749,86 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { .relog_intent = xfs_extent_free_relog_intent, }; +#ifdef CONFIG_XFS_RT +/* Create a realtime extent freeing */ +static struct xfs_log_item * +xfs_rtextent_free_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_extent_free_create_intent(tp, items, count, sort, + XFS_LI_EFI_RT); +} + +/* Process a free realtime extent. */ +STATIC int +xfs_rtextent_free_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_extent_free_item *xefi = xefi_entry(item); + struct xfs_efd_log_item *efdp = EFD_ITEM(done); + struct xfs_rtgroup **rtgp = (struct xfs_rtgroup **)state; + int error = 0; + + trace_xfs_extent_free_deferred(mp, xefi); + + if (xefi->xefi_flags & XFS_EFI_CANCELLED) + goto done; + + if (*rtgp != to_rtg(xefi->xefi_group)) { + unsigned int lock_flags; + + if (xfs_has_zoned(mp)) + lock_flags = XFS_RTGLOCK_RMAP; + else + lock_flags = XFS_RTGLOCK_BITMAP; + + *rtgp = to_rtg(xefi->xefi_group); + xfs_rtgroup_lock(*rtgp, lock_flags); + xfs_rtgroup_trans_join(tp, *rtgp, lock_flags); + } + + if (xfs_has_zoned(mp)) { + error = xfs_zone_free_blocks(tp, *rtgp, xefi->xefi_startblock, + xefi->xefi_blockcount); + } else { + error = xfs_rtfree_blocks(tp, *rtgp, xefi->xefi_startblock, + xefi->xefi_blockcount); + } + + if (error == -EAGAIN) { + xfs_efd_from_efi(efdp); + return error; + } +done: + xfs_efd_add_extent(efdp, xefi); + xfs_extent_free_cancel_item(item); + return error; +} + +const struct xfs_defer_op_type xfs_rtextent_free_defer_type = { + .name = "rtextent_free", + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_rtextent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_rtextent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; +#else +const struct xfs_defer_op_type xfs_rtextent_free_defer_type = { + .name = "rtextent_free", +}; +#endif /* CONFIG_XFS_RT */ + STATIC bool xfs_efi_item_match( struct xfs_log_item *lip, @@ -731,7 +873,7 @@ xlog_recover_efi_commit_pass2( return -EFSCORRUPTED; } - efip = xfs_efi_init(mp, efi_formatp->efi_nextents); + efip = xfs_efi_init(mp, ITEM_TYPE(item), 
efi_formatp->efi_nextents); error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); if (error) { xfs_efi_item_free(efip); @@ -749,6 +891,58 @@ const struct xlog_recover_item_ops xlog_efi_item_ops = { .commit_pass2 = xlog_recover_efi_commit_pass2, }; +#ifdef CONFIG_XFS_RT +STATIC int +xlog_recover_rtefi_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_efi_log_item *efip; + struct xfs_efi_log_format *efi_formatp; + int error; + + efi_formatp = item->ri_buf[0].i_addr; + + if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + efip = xfs_efi_init(mp, ITEM_TYPE(item), efi_formatp->efi_nextents); + error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); + if (error) { + xfs_efi_item_free(efip); + return error; + } + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); + + xlog_recover_intent_item(log, &efip->efi_item, lsn, + &xfs_rtextent_free_defer_type); + return 0; +} +#else +STATIC int +xlog_recover_rtefi_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; +} +#endif + +const struct xlog_recover_item_ops xlog_rtefi_item_ops = { + .item_type = XFS_LI_EFI_RT, + .commit_pass2 = xlog_recover_rtefi_commit_pass2, +}; + /* * This routine is called when an EFD format structure is found in a committed * transaction in the log. Its purpose is to cancel the corresponding EFI if it @@ -791,3 +985,44 @@ const struct xlog_recover_item_ops xlog_efd_item_ops = { .item_type = XFS_LI_EFD, .commit_pass2 = xlog_recover_efd_commit_pass2, }; + +#ifdef CONFIG_XFS_RT +STATIC int +xlog_recover_rtefd_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_efd_log_format *efd_formatp; + int buflen = item->ri_buf[0].i_len; + + efd_formatp = item->ri_buf[0].i_addr; + + if (buflen < sizeof(struct xfs_efd_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + efd_formatp, buflen); + return -EFSCORRUPTED; + } + + if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof( + efd_formatp->efd_nextents) && + item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof( + efd_formatp->efd_nextents)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + efd_formatp, buflen); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_EFI_RT, + efd_formatp->efd_efi_id); + return 0; +} +#else +# define xlog_recover_rtefd_commit_pass2 xlog_recover_rtefi_commit_pass2 +#endif + +const struct xlog_recover_item_ops xlog_rtefd_item_ops = { + .item_type = XFS_LI_EFD_RT, + .commit_pass2 = xlog_recover_rtefd_commit_pass2, +}; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 41b7c4306079..c8402040410b 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp, struct xfs_extent_free_item *xefi, struct xfs_defer_pending **dfpp); +unsigned int xfs_efi_log_space(unsigned int nr); +unsigned int xfs_efd_log_space(unsigned int nr); + #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 
b19916b11fd5..48254a72071b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -25,6 +25,8 @@ #include "xfs_iomap.h" #include "xfs_reflink.h" #include "xfs_file.h" +#include "xfs_aops.h" +#include "xfs_zone_alloc.h" #include <linux/dax.h> #include <linux/falloc.h> @@ -150,7 +152,7 @@ xfs_file_fsync( * ensure newly written file data make it to disk before logging the new * inode size in case of an extending write. */ - if (XFS_IS_REALTIME_INODE(ip)) + if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp) error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); else if (mp->m_logdev_targp != mp->m_ddev_targp) error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); @@ -360,7 +362,8 @@ xfs_file_write_zero_eof( struct iov_iter *from, unsigned int *iolock, size_t count, - bool *drained_dio) + bool *drained_dio, + struct xfs_zone_alloc_ctx *ac) { struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); loff_t isize; @@ -414,7 +417,7 @@ xfs_file_write_zero_eof( trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); return error; @@ -431,7 +434,8 @@ STATIC ssize_t xfs_file_write_checks( struct kiocb *iocb, struct iov_iter *from, - unsigned int *iolock) + unsigned int *iolock, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = iocb->ki_filp->f_mapping->host; size_t count = iov_iter_count(from); @@ -481,7 +485,7 @@ restart: */ if (iocb->ki_pos > i_size_read(inode)) { error = xfs_file_write_zero_eof(iocb, from, iolock, count, - &drained_dio); + &drained_dio, ac); if (error == 1) goto restart; if (error) @@ -491,6 +495,48 @@ restart: return kiocb_modified(iocb); } +static ssize_t +xfs_zoned_write_space_reserve( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from, + unsigned int flags, + struct xfs_zone_alloc_ctx *ac) +{ + loff_t count = iov_iter_count(from); + int error; + + if (iocb->ki_flags & IOCB_NOWAIT) + flags |= XFS_ZR_NOWAIT; + + /* + * Check the rlimit and LFS boundary first so that we don't over-reserve + * by possibly a lot. + * + * The generic write path will redo this check later, and it might have + * changed by then. If it got expanded we'll stick to our earlier + * smaller limit, and if it is decreased the new smaller limit will be + * used and our extra space reservation will be returned after finishing + * the write. + */ + error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count); + if (error) + return error; + + /* + * Sloppily round up count to file system blocks. + * + * This will often reserve an extra block, but that avoids having to look + * at the start offset, which isn't stable for O_APPEND until taking the + * iolock. Also we need to reserve a block each for zeroing the old + * EOF block and the new start block if they are unaligned. + * + * Any remaining block will be returned after the write. 
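The reservation sized just below is plain arithmetic: round the byte count up to filesystem blocks, add one block of slop for the not-yet-stable start offset, and two more for zeroing the old EOF block and the new start block. A sketch of the computation, assuming XFS_B_TO_FSB() behaves as a round-up shift by the block log (names here are illustrative):

    #include <stdio.h>

    /* Round bytes up to filesystem blocks, as XFS_B_TO_FSB() does. */
    static unsigned long long b_to_fsb(unsigned long long bytes,
                                       unsigned int blocklog)
    {
            return (bytes + (1ULL << blocklog) - 1) >> blocklog;
    }

    int main(void)
    {
            unsigned int blocklog = 12;             /* 4k blocks */
            unsigned long long count = 10000;       /* bytes to write */

            /* +1 for the unaligned start, +2 for zeroing the old EOF
             * block and the new start block */
            unsigned long long resblks = b_to_fsb(count, blocklog) + 1 + 2;
            printf("reserve %llu blocks for a %llu byte write\n",
                   resblks, count);
            return 0;
    }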
+ */ + return xfs_zoned_space_reserve(ip, + XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac); +} + static int xfs_dio_write_end_io( struct kiocb *iocb, @@ -503,6 +549,9 @@ xfs_dio_write_end_io( loff_t offset = iocb->ki_pos; unsigned int nofs_flag; + ASSERT(!xfs_is_zoned_inode(ip) || + !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); + trace_xfs_end_io_direct_write(ip, offset, size); if (xfs_is_shutdown(ip->i_mount)) @@ -527,7 +576,10 @@ xfs_dio_write_end_io( nofs_flag = memalloc_nofs_save(); if (flags & IOMAP_DIO_COW) { - error = xfs_reflink_end_cow(ip, offset, size); + if (iocb->ki_flags & IOCB_ATOMIC) + error = xfs_reflink_end_atomic_cow(ip, offset, size); + else + error = xfs_reflink_end_cow(ip, offset, size); if (error) goto out; } @@ -582,14 +634,51 @@ static const struct iomap_dio_ops xfs_dio_write_ops = { .end_io = xfs_dio_write_end_io, }; +static void +xfs_dio_zoned_submit_io( + const struct iomap_iter *iter, + struct bio *bio, + loff_t file_offset) +{ + struct xfs_mount *mp = XFS_I(iter->inode)->i_mount; + struct xfs_zone_alloc_ctx *ac = iter->private; + xfs_filblks_t count_fsb; + struct iomap_ioend *ioend; + + count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size); + if (count_fsb > ac->reserved_blocks) { + xfs_err(mp, +"allocation (%lld) larger than reservation (%lld).", + count_fsb, ac->reserved_blocks); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + bio_io_error(bio); + return; + } + ac->reserved_blocks -= count_fsb; + + bio->bi_end_io = xfs_end_bio; + ioend = iomap_init_ioend(iter->inode, bio, file_offset, + IOMAP_IOEND_DIRECT); + xfs_zone_alloc_and_submit(ioend, &ac->open_zone); +} + +static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { + .bio_set = &iomap_ioend_bioset, + .submit_io = xfs_dio_zoned_submit_io, + .end_io = xfs_dio_write_end_io, +}; + /* - * Handle block aligned direct I/O writes + * Handle block aligned direct I/O writes. */ static noinline ssize_t xfs_file_dio_write_aligned( struct xfs_inode *ip, struct kiocb *iocb, - struct iov_iter *from) + struct iov_iter *from, + const struct iomap_ops *ops, + const struct iomap_dio_ops *dops, + struct xfs_zone_alloc_ctx *ac) { unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; @@ -597,7 +686,7 @@ xfs_file_dio_write_aligned( ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, ac); if (ret) goto out_unlock; @@ -611,8 +700,94 @@ xfs_file_dio_write_aligned( iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(iocb, from); - ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, 0, NULL, 0); + ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0); +out_unlock: + xfs_iunlock(ip, iolock); + return ret; +} + +/* + * Handle block aligned direct I/O writes to zoned devices. 
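xfs_dio_zoned_submit_io() above treats the reservation as a hard budget: no bio may allocate more blocks than remain reserved, and each submission draws its size down from the budget. A reduced model of that accounting (the kernel responds to a breach by shutting down the filesystem; this sketch just fails the I/O):

    #include <errno.h>
    #include <stdio.h>

    struct alloc_ctx { long long reserved_blocks; };

    /* Consume part of the up-front reservation for one I/O, or fail hard. */
    static int consume_reservation(struct alloc_ctx *ac, long long count_fsb)
    {
            if (count_fsb > ac->reserved_blocks) {
                    /* reaching here means the accounting is broken */
                    fprintf(stderr,
                            "allocation (%lld) larger than reservation (%lld)\n",
                            count_fsb, ac->reserved_blocks);
                    return -EIO;
            }
            ac->reserved_blocks -= count_fsb;
            return 0;
    }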
+ */ +static noinline ssize_t +xfs_file_dio_write_zoned( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_zone_alloc_ctx ac = { }; + ssize_t ret; + + ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac); + if (ret < 0) + return ret; + ret = xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_zoned_direct_write_iomap_ops, + &xfs_dio_zoned_write_ops, &ac); + xfs_zoned_space_unreserve(ip, &ac); + return ret; +} + +/* + * Handle block atomic writes + * + * Two methods of atomic writes are supported: + * - REQ_ATOMIC-based, which would typically use some form of HW offload in the + * disk + * - COW-based, which uses a COW fork as a staging extent for data updates + * before atomically updating extent mappings for the range being written + * + */ +static noinline ssize_t +xfs_file_dio_write_atomic( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + unsigned int iolock = XFS_IOLOCK_SHARED; + ssize_t ret, ocount = iov_iter_count(from); + const struct iomap_ops *dops; + + /* + * HW offload should be faster, so try that first if it is already + * known that the write length is not too large. + */ + if (ocount > xfs_inode_buftarg(ip)->bt_bdev_awu_max) + dops = &xfs_atomic_write_cow_iomap_ops; + else + dops = &xfs_direct_write_iomap_ops; + +retry: + ret = xfs_ilock_iocb_for_write(iocb, &iolock); + if (ret) + return ret; + + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); + if (ret) + goto out_unlock; + + /* Demote similar to xfs_file_dio_write_aligned() */ + if (iolock == XFS_IOLOCK_EXCL) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, + 0, NULL, 0); + + /* + * The retry mechanism is based on the ->iomap_begin method returning + * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not + * possible. The REQ_ATOMIC-based method will typically not be possible + * if the write spans multiple extents or the disk blocks are misaligned. + */ + if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) { + xfs_iunlock(ip, iolock); + dops = &xfs_atomic_write_cow_iomap_ops; + goto retry; + } + out_unlock: if (iolock) xfs_iunlock(ip, iolock); @@ -675,7 +850,7 @@ retry_exclusive: goto out_unlock; } - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out_unlock; @@ -721,9 +896,23 @@ xfs_file_dio_write( /* direct I/O must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) + + /* + * For always COW inodes we also must check the alignment of each + * individual iovec segment, as they could end up with different + * I/Os due to the way bio_iov_iter_get_pages works, and we'd + * then overwrite an already written block.
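The atomic-write path above is a two-strategy loop: prefer the hardware REQ_ATOMIC ops when the length can fit, and fall back exactly once to the COW-based ops when ->iomap_begin returns -ENOPROTOOPT. A compact model of that control flow, with stand-in strategy functions:

    #include <errno.h>

    enum strategy { HW_ATOMIC, COW_ATOMIC };

    /* Stand-in for iomap_dio_rw() with one of the two ops tables. */
    static int do_write(enum strategy s)
    {
            if (s == HW_ATOMIC)
                    return -ENOPROTOOPT;    /* e.g. spans multiple extents */
            return 0;                       /* COW staging always works */
    }

    static int atomic_write(unsigned long long len, unsigned long long hw_max)
    {
            /* Prefer HW offload only when the length can possibly fit. */
            enum strategy s = (len > hw_max) ? COW_ATOMIC : HW_ATOMIC;
            int ret;

    retry:
            ret = do_write(s);
            if (ret == -ENOPROTOOPT && s == HW_ATOMIC) {
                    s = COW_ATOMIC;         /* fall back exactly once */
                    goto retry;
            }
            return ret;
    }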
+ */ + if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) || + (xfs_is_always_cow_inode(ip) && + (iov_iter_alignment(from) & ip->i_mount->m_blockmask))) return xfs_file_dio_write_unaligned(ip, iocb, from); - return xfs_file_dio_write_aligned(ip, iocb, from); + if (xfs_is_zoned_inode(ip)) + return xfs_file_dio_write_zoned(ip, iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) + return xfs_file_dio_write_atomic(ip, iocb, from); + return xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL); } static noinline ssize_t @@ -740,7 +929,7 @@ xfs_file_dax_write( ret = xfs_ilock_iocb(iocb, iolock); if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out; @@ -784,7 +973,7 @@ write_retry: if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out; @@ -832,6 +1021,67 @@ out: } STATIC ssize_t +xfs_file_buffered_write_zoned( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + unsigned int iolock = XFS_IOLOCK_EXCL; + bool cleared_space = false; + struct xfs_zone_alloc_ctx ac = { }; + ssize_t ret; + + ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac); + if (ret < 0) + return ret; + + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + goto out_unreserve; + + ret = xfs_file_write_checks(iocb, from, &iolock, &ac); + if (ret) + goto out_unlock; + + /* + * Truncate the iter to the length that we were actually able to + * allocate blocks for. This needs to happen after + * xfs_file_write_checks, because that assigns ki_pos for O_APPEND + * writes. + */ + iov_iter_truncate(from, + XFS_FSB_TO_B(mp, ac.reserved_blocks) - + (iocb->ki_pos & mp->m_blockmask)); + if (!iov_iter_count(from)) + goto out_unlock; + +retry: + trace_xfs_file_buffered_write(iocb, from); + ret = iomap_file_buffered_write(iocb, from, + &xfs_buffered_write_iomap_ops, &ac); + if (ret == -ENOSPC && !cleared_space) { + /* + * Kick off writeback to convert delalloc space and release the + * usually too pessimistic indirect block reservations. 
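The buffered zoned write path here retries a failed write exactly once after kicking writeback, on the theory that flushing delalloc extents releases overly pessimistic indirect-block reservations. The shape of that retry, with stand-ins for the writeback and write calls:

    #include <errno.h>
    #include <stdbool.h>

    static int attempts;

    /* Stand-in for iomap_file_buffered_write(): fails once with -ENOSPC. */
    static int do_buffered_write(void)
    {
            return attempts++ == 0 ? -ENOSPC : 0;
    }

    /* Stand-in for xfs_flush_inodes(). */
    static void flush_inodes(void) { }

    static int buffered_write(void)
    {
            bool cleared_space = false;
            int ret;

    retry:
            ret = do_buffered_write();
            if (ret == -ENOSPC && !cleared_space) {
                    /* writeback converts delalloc and frees pessimistic
                     * reservations; only ever retry once */
                    flush_inodes();
                    cleared_space = true;
                    goto retry;
            }
            return ret;
    }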
+ */ + xfs_flush_inodes(mp); + cleared_space = true; + goto retry; + } + +out_unlock: + xfs_iunlock(ip, iolock); +out_unreserve: + xfs_zoned_space_unreserve(ip, &ac); + if (ret > 0) { + XFS_STATS_ADD(mp, xs_write_bytes, ret); + ret = generic_write_sync(iocb, ret); + } + return ret; +} + +STATIC ssize_t xfs_file_write_iter( struct kiocb *iocb, struct iov_iter *from) @@ -852,6 +1102,18 @@ xfs_file_write_iter( if (IS_DAX(inode)) return xfs_file_dax_write(iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) { + if (ocount < xfs_get_atomic_write_min(ip)) + return -EINVAL; + + if (ocount > xfs_get_atomic_write_max(ip)) + return -EINVAL; + + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered @@ -864,6 +1126,8 @@ xfs_file_write_iter( return ret; } + if (xfs_is_zoned_inode(ip)) + return xfs_file_buffered_write_zoned(iocb, from); return xfs_file_buffered_write(iocb, from); } @@ -918,7 +1182,8 @@ static int xfs_falloc_collapse_range( struct file *file, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); loff_t new_size = i_size_read(inode) - len; @@ -934,7 +1199,7 @@ xfs_falloc_collapse_range( if (offset + len >= i_size_read(inode)) return -EINVAL; - error = xfs_collapse_file_space(XFS_I(inode), offset, len); + error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac); if (error) return error; return xfs_falloc_setsize(file, new_size); @@ -990,7 +1255,8 @@ xfs_falloc_zero_range( struct file *file, int mode, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); unsigned int blksize = i_blocksize(inode); @@ -1003,7 +1269,7 @@ xfs_falloc_zero_range( if (error) return error; - error = xfs_free_file_space(XFS_I(inode), offset, len); + error = xfs_free_file_space(XFS_I(inode), offset, len, ac); if (error) return error; @@ -1074,22 +1340,18 @@ xfs_falloc_allocate_range( FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) STATIC long -xfs_file_fallocate( +__xfs_file_fallocate( struct file *file, int mode, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); struct xfs_inode *ip = XFS_I(inode); long error; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - if (mode & ~XFS_FALLOC_FL_SUPPORTED) - return -EOPNOTSUPP; - xfs_ilock(ip, iolock); error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) @@ -1110,16 +1372,16 @@ xfs_file_fallocate( switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_PUNCH_HOLE: - error = xfs_free_file_space(ip, offset, len); + error = xfs_free_file_space(ip, offset, len, ac); break; case FALLOC_FL_COLLAPSE_RANGE: - error = xfs_falloc_collapse_range(file, offset, len); + error = xfs_falloc_collapse_range(file, offset, len, ac); break; case FALLOC_FL_INSERT_RANGE: error = xfs_falloc_insert_range(file, offset, len); break; case FALLOC_FL_ZERO_RANGE: - error = xfs_falloc_zero_range(file, mode, offset, len); + error = xfs_falloc_zero_range(file, mode, offset, len, ac); break; case FALLOC_FL_UNSHARE_RANGE: error = xfs_falloc_unshare_range(file, mode, offset, len); @@ -1140,6 +1402,54 @@ out_unlock: return error; } +static long +xfs_file_zoned_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct xfs_zone_alloc_ctx ac = { }; + struct xfs_inode *ip = XFS_I(file_inode(file)); + int error; + + 
error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac); + if (error) + return error; + error = __xfs_file_fallocate(file, mode, offset, len, &ac); + xfs_zoned_space_unreserve(ip, &ac); + return error; +} + +static long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (mode & ~XFS_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* + * For zoned file systems, zeroing the first and last block of a hole + * punch requires allocating a new block to rewrite the remaining data + * and new zeroes out of place. Get a reservation for those before + * taking the iolock. Dip into the reserved pool because we are + * expected to be able to punch a hole even on a completely full + * file system. + */ + if (xfs_is_zoned_inode(XFS_I(inode)) && + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_COLLAPSE_RANGE))) + return xfs_file_zoned_fallocate(file, mode, offset, len); + return __xfs_file_fallocate(file, mode, offset, len, NULL); +} + STATIC int xfs_file_fadvise( struct file *file, @@ -1228,6 +1538,14 @@ out_unlock: xfs_iunlock2_remapping(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + /* + * If the caller did not set CAN_SHORTEN, then it is not prepared to + * handle partial results -- either the whole remap succeeds, or we + * must say why it did not. In this case, any error should be returned + * to the caller. + */ + if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return ret; return remapped > 0 ? remapped : ret; } @@ -1239,6 +1557,8 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; + if (xfs_get_atomic_write_min(XFS_I(inode)) > 0) + file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return generic_file_open(inode, file); } @@ -1323,15 +1643,22 @@ xfs_file_release( * blocks. This avoids open/read/close workloads from removing EOF * blocks that other writers depend upon to reduce fragmentation. * + * Inodes on the zoned RT device never have preallocations, so skip + * taking the locks below. + */ + if (!inode->i_nlink || + !(file->f_mode & FMODE_WRITE) || + (ip->i_diflags & XFS_DIFLAG_APPEND) || + xfs_is_zoned_inode(ip)) + return 0; + + /* * If we can't get the iolock just skip truncating the blocks past EOF * because we could deadlock with the mmap_lock otherwise. We'll get * another chance to drop them once the last reference to the inode is * dropped, so we'll never leak blocks permanently. */ - if (inode->i_nlink && - (file->f_mode & FMODE_WRITE) && - !(ip->i_diflags & XFS_DIFLAG_APPEND) && - !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && + if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { if (xfs_can_free_eofblocks(ip) && !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED)) @@ -1425,6 +1752,8 @@ xfs_dax_read_fault( struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); vm_fault_t ret; + trace_xfs_read_fault(ip, order); + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); ret = xfs_dax_fault_locked(vmf, order, false); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); @@ -1432,16 +1761,29 @@ return ret; } +/* + * Locking for serialisation of IO during page faults.
This results in a lock + * ordering of: + * + * mmap_lock (MM) + * sb_start_pagefault(vfs, freeze) + * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) + */ static vm_fault_t -xfs_write_fault( +__xfs_write_fault( struct vm_fault *vmf, - unsigned int order) + unsigned int order, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); unsigned int lock_mode = XFS_MMAPLOCK_SHARED; vm_fault_t ret; + trace_xfs_write_fault(ip, order); + sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); @@ -1460,38 +1802,48 @@ if (IS_DAX(inode)) ret = xfs_dax_fault_locked(vmf, order, true); else - ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops); + ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops, + ac); xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); return ret; } -/* - * Locking for serialisation of IO during page faults. This results in a lock - * ordering of: - * - * mmap_lock (MM) - * sb_start_pagefault(vfs, freeze) - * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) - * page_lock (MM) - * i_lock (XFS - extent map serialisation) - */ static vm_fault_t -__xfs_filemap_fault( +xfs_write_fault_zoned( struct vm_fault *vmf, - unsigned int order, - bool write_fault) + unsigned int order) { - struct inode *inode = file_inode(vmf->vma->vm_file); + struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); + unsigned int len = folio_size(page_folio(vmf->page)); + struct xfs_zone_alloc_ctx ac = { }; + int error; + vm_fault_t ret; - trace_xfs_filemap_fault(XFS_I(inode), order, write_fault); + /* + * This could over-allocate as it doesn't check for truncation. + * + * But as the overallocation is limited to less than a folio and will be + * released instantly, that's just fine. + */ + error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0, + &ac); + if (error < 0) + return vmf_fs_error(error); + ret = __xfs_write_fault(vmf, order, &ac); + xfs_zoned_space_unreserve(ip, &ac); + return ret; +} - if (write_fault) - return xfs_write_fault(vmf, order); - if (IS_DAX(inode)) - return xfs_dax_read_fault(vmf, order); - return filemap_fault(vmf); +static vm_fault_t +xfs_write_fault( + struct vm_fault *vmf, + unsigned int order) +{ + if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file)))) + return xfs_write_fault_zoned(vmf, order); + return __xfs_write_fault(vmf, order, NULL); } static inline bool @@ -1506,10 +1858,17 @@ static vm_fault_t xfs_filemap_fault( struct vm_fault *vmf) { + struct inode *inode = file_inode(vmf->vma->vm_file); + /* DAX can shortcut the normal fault path on write faults! */ - return __xfs_filemap_fault(vmf, 0, - IS_DAX(file_inode(vmf->vma->vm_file)) && - xfs_is_write_fault(vmf)); + if (IS_DAX(inode)) { + if (xfs_is_write_fault(vmf)) + return xfs_write_fault(vmf, 0); + return xfs_dax_read_fault(vmf, 0); + } + + trace_xfs_read_fault(XFS_I(inode), 0); + return filemap_fault(vmf); } static vm_fault_t @@ -1521,15 +1880,16 @@ xfs_filemap_huge_fault( return VM_FAULT_FALLBACK; /* DAX can shortcut the normal fault path on write faults!
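After this refactor the fault entry points reduce to a small decision table: DAX faults split on read versus write, page-cache write faults go through xfs_write_fault() (which adds a space reservation on zoned inodes), and page-cache reads stay on filemap_fault(). A simplified sketch of the dispatch, naming only the first-level handlers:

    #include <stdio.h>
    #include <stdbool.h>

    /* Which handler a fault lands in, per the dispatch above (simplified). */
    static const char *fault_path(bool dax, bool zoned, bool write)
    {
            if (dax)
                    return write ? "xfs_write_fault" : "xfs_dax_read_fault";
            if (write)
                    return zoned ? "xfs_write_fault_zoned" : "__xfs_write_fault";
            return "filemap_fault";
    }

    int main(void)
    {
            printf("%s\n", fault_path(false, true, true));
            return 0;
    }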
*/ - return __xfs_filemap_fault(vmf, order, - xfs_is_write_fault(vmf)); + if (xfs_is_write_fault(vmf)) + return xfs_write_fault(vmf, order); + return xfs_dax_read_fault(vmf, order); } static vm_fault_t xfs_filemap_page_mkwrite( struct vm_fault *vmf) { - return __xfs_filemap_fault(vmf, 0, true); + return xfs_write_fault(vmf, 0); } /* @@ -1541,8 +1901,7 @@ static vm_fault_t xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { - - return __xfs_filemap_fault(vmf, 0, true); + return xfs_write_fault(vmf, 0); } static const struct vm_operations_struct xfs_file_vm_ops = { @@ -1595,7 +1954,8 @@ const struct file_operations xfs_file_operations = { .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | - FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE, + FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE | + FOP_DONTCACHE, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index e3aaa0555597..044918fbae06 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -64,25 +64,31 @@ xfs_filestream_pick_ag( struct xfs_perag *pag; struct xfs_perag *max_pag = NULL; xfs_extlen_t minlen = *longest; - xfs_extlen_t free = 0, minfree, maxfree = 0; + xfs_extlen_t minfree, maxfree = 0; xfs_agnumber_t agno; bool first_pass = true; - int err; /* 2% of an AG's blocks must be free for it to be chosen. */ minfree = mp->m_sb.sb_agblocks / 50; restart: for_each_perag_wrap(mp, start_agno, agno, pag) { + int err; + trace_xfs_filestream_scan(pag, pino); + *longest = 0; err = xfs_bmap_longest_free_extent(pag, NULL, longest); if (err) { - if (err != -EAGAIN) - break; - /* Couldn't lock the AGF, skip this AG. */ - err = 0; - continue; + if (err == -EAGAIN) { + /* Couldn't lock the AGF, skip this AG. */ + err = 0; + continue; + } + xfs_perag_rele(pag); + if (max_pag) + xfs_perag_rele(max_pag); + return err; } /* Keep track of the AG with the most free blocks. */ @@ -90,7 +96,7 @@ restart: maxfree = pag->pagf_freeblks; if (max_pag) xfs_perag_rele(max_pag); - atomic_inc(&pag->pag_active_ref); + atomic_inc(&pag_group(pag)->xg_active_ref); max_pag = pag; } @@ -107,8 +113,9 @@ restart: !(flags & XFS_PICK_USERDATA) || (flags & XFS_PICK_LOWSPACE))) { /* Break out, retaining the reference on the AG. */ - free = pag->pagf_freeblks; - break; + if (max_pag) + xfs_perag_rele(max_pag); + goto done; } } @@ -116,57 +123,47 @@ restart: atomic_dec(&pag->pagf_fstrms); } - if (err) { - xfs_perag_rele(pag); - if (max_pag) - xfs_perag_rele(max_pag); - return err; + /* + * Allow a second pass to give xfs_bmap_longest_free_extent() another + * attempt at locking AGFs that it might have skipped over before we + * fail. + */ + if (first_pass) { + first_pass = false; + goto restart; } - if (!pag) { - /* - * Allow a second pass to give xfs_bmap_longest_free_extent() - * another attempt at locking AGFs that it might have skipped - * over before we fail. - */ - if (first_pass) { - first_pass = false; - goto restart; - } + /* + * We must be low on data space, so run a final lowspace optimised + * selection pass if we haven't already. + */ + if (!(flags & XFS_PICK_LOWSPACE)) { + flags |= XFS_PICK_LOWSPACE; + goto restart; + } - /* - * We must be low on data space, so run a final lowspace - * optimised selection pass if we haven't already. 
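The restructured xfs_filestream_pick_ag() escalates through up to three scans before giving up: the normal pass, a second pass that retries AGFs which could not be trylocked, and a final low-space pass; only then does it fall back to the fullest AG or fail. A condensed model of that escalation (scan_ags() stands in for the per-AG loop):

    #include <errno.h>
    #include <stdbool.h>

    #define PICK_LOWSPACE 0x1

    /* Stand-in: one scan over the AGs, true if a suitable AG was picked. */
    static bool scan_ags(int flags) { (void)flags; return false; }

    static int pick_ag(void)
    {
            bool first_pass = true;
            int flags = 0;

    restart:
            if (scan_ags(flags))
                    return 0;
            if (first_pass) {       /* retry AGFs we failed to trylock */
                    first_pass = false;
                    goto restart;
            }
            if (!(flags & PICK_LOWSPACE)) { /* final low-space pass */
                    flags |= PICK_LOWSPACE;
                    goto restart;
            }
            /* the real code now falls back to the fullest AG instead */
            return -ENOSPC;
    }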
- */ - if (!(flags & XFS_PICK_LOWSPACE)) { - flags |= XFS_PICK_LOWSPACE; - goto restart; + /* + * No unassociated AGs are available, so select the AG with the most + * free space, regardless of whether it's already in use by another + * filestream. If none suit, just use whatever AG we can grab. + */ + if (!max_pag) { + for_each_perag_wrap(args->mp, 0, start_agno, pag) { + max_pag = pag; + break; } - /* - * No unassociated AGs are available, so select the AG with the - * most free space, regardless of whether it's already in use by - * another filestream. It none suit, just use whatever AG we can - * grab. - */ - if (!max_pag) { - for_each_perag_wrap(args->mp, 0, start_agno, args->pag) - break; - atomic_inc(&args->pag->pagf_fstrms); - *longest = 0; - } else { - pag = max_pag; - free = maxfree; - atomic_inc(&pag->pagf_fstrms); - } - } else if (max_pag) { - xfs_perag_rele(max_pag); + /* Bail if there are no AGs at all to select from. */ + if (!max_pag) + return -ENOSPC; } - trace_xfs_filestream_pick(pag, pino, free); + pag = max_pag; + atomic_inc(&pag->pagf_fstrms); +done: + trace_xfs_filestream_pick(pag, pino); args->pag = pag; return 0; - } static struct xfs_inode * @@ -225,12 +222,12 @@ xfs_filestream_lookup_association( * down immediately after we mark the lookup as done. */ pag = container_of(mru, struct xfs_fstrm_item, mru)->pag; - atomic_inc(&pag->pag_active_ref); + atomic_inc(&pag_group(pag)->xg_active_ref); xfs_mru_cache_done(mp->m_filestream); trace_xfs_filestream_lookup(pag, ap->ip->i_ino); - ap->blkno = XFS_AGB_TO_FSB(args->mp, pag->pag_agno, 0); + ap->blkno = xfs_agbno_to_fsb(pag, 0); xfs_bmap_adjacent(ap); /* @@ -278,7 +275,7 @@ xfs_filestream_create_association( struct xfs_fstrm_item *item = container_of(mru, struct xfs_fstrm_item, mru); - agno = (item->pag->pag_agno + 1) % mp->m_sb.sb_agcount; + agno = (pag_agno(item->pag) + 1) % mp->m_sb.sb_agcount; xfs_fstrm_free_func(mp, mru); } else if (xfs_is_inode32(mp)) { xfs_agnumber_t rotorstep = xfs_rotorstep; @@ -307,26 +304,19 @@ xfs_filestream_create_association( * for us, so all we need to do here is take another active reference to * the perag for the cached association. * - * If we fail to store the association, we need to drop the fstrms - * counter as well as drop the perag reference we take here for the - * item. We do not need to return an error for this failure - as long as - * we return a referenced AG, the allocation can still go ahead just - * fine. + * If we fail to store the association, we do not need to return an + * error for this failure - as long as we return a referenced AG, the + * allocation can still go ahead just fine.
*/ item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!item) goto out_put_fstrms; - atomic_inc(&args->pag->pag_active_ref); + atomic_inc(&pag_group(args->pag)->xg_active_ref); item->pag = args->pag; - error = xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru); - if (error) - goto out_free_item; + xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru); return 0; -out_free_item: - xfs_perag_rele(item->pag); - kfree(item); out_put_fstrms: atomic_dec(&args->pag->pagf_fstrms); return 0; @@ -347,7 +337,6 @@ xfs_filestream_select_ag( struct xfs_alloc_arg *args, xfs_extlen_t *longest) { - struct xfs_mount *mp = args->mp; struct xfs_inode *pip; xfs_ino_t ino = 0; int error = 0; @@ -373,7 +362,7 @@ xfs_filestream_select_ag( return error; out_select: - ap->blkno = XFS_AGB_TO_FSB(mp, args->pag->pag_agno, 0); + ap->blkno = xfs_agbno_to_fsb(args->pag, 0); return 0; } diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index ae18ab86e608..414b27a86458 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -25,6 +25,9 @@ #include "xfs_alloc_btree.h" #include "xfs_rtbitmap.h" #include "xfs_ag.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" /* Convert an xfs_fsmap to an fsmap. */ static void @@ -110,18 +113,18 @@ xfs_fsmap_owner_to_rmap( /* Convert an rmapbt owner into an fsmap owner. */ static int -xfs_fsmap_owner_from_rmap( +xfs_fsmap_owner_from_frec( struct xfs_fsmap *dest, - const struct xfs_rmap_irec *src) + const struct xfs_fsmap_irec *frec) { dest->fmr_flags = 0; - if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) { - dest->fmr_owner = src->rm_owner; + if (!XFS_RMAP_NON_INODE_OWNER(frec->owner)) { + dest->fmr_owner = frec->owner; return 0; } dest->fmr_flags |= FMR_OF_SPECIAL_OWNER; - switch (src->rm_owner) { + switch (frec->owner) { case XFS_RMAP_OWN_FS: dest->fmr_owner = XFS_FMR_OWN_FS; break; @@ -158,11 +161,12 @@ struct xfs_getfsmap_info { struct xfs_fsmap_head *head; struct fsmap *fsmap_recs; /* mapping records */ struct xfs_buf *agf_bp; /* AGF, for refcount queries */ - struct xfs_perag *pag; /* AG info, if applicable */ + struct xfs_group *group; /* group info, if applicable */ xfs_daddr_t next_daddr; /* next daddr we expect */ /* daddr of low fsmap key when we're using the rtbitmap */ xfs_daddr_t low_daddr; - xfs_daddr_t end_daddr; /* daddr of high fsmap key */ + /* daddr of high fsmap key, or the last daddr on the device */ + xfs_daddr_t end_daddr; u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ /* @@ -203,28 +207,29 @@ STATIC int xfs_getfsmap_is_shared( struct xfs_trans *tp, struct xfs_getfsmap_info *info, - const struct xfs_rmap_irec *rec, + const struct xfs_fsmap_irec *frec, bool *stat) { struct xfs_mount *mp = tp->t_mountp; struct xfs_btree_cur *cur; xfs_agblock_t fbno; - xfs_extlen_t flen; + xfs_extlen_t flen = 0; int error; *stat = false; - if (!xfs_has_reflink(mp)) - return 0; - /* rt files will have no perag structure */ - if (!info->pag) + if (!xfs_has_reflink(mp) || !info->group) return 0; - /* Are there any shared blocks here? */ - flen = 0; - cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, info->pag); + if (info->group->xg_type == XG_TYPE_RTG) + cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(info->group)); + else + cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, + to_perag(info->group)); - error = xfs_refcount_find_shared(cur, rec->rm_startblock, - rec->rm_blockcount, &fbno, &flen, false); + /* Are there any shared blocks here? 
*/ + error = xfs_refcount_find_shared(cur, frec->rec_key, + XFS_BB_TO_FSBT(mp, frec->len_daddr), &fbno, &flen, + false); xfs_btree_del_cursor(cur, error); if (error) @@ -249,15 +254,22 @@ xfs_getfsmap_format( } static inline bool -xfs_getfsmap_rec_before_start( +xfs_getfsmap_frec_before_start( struct xfs_getfsmap_info *info, - const struct xfs_rmap_irec *rec, - xfs_daddr_t rec_daddr) + const struct xfs_fsmap_irec *frec) { if (info->low_daddr != XFS_BUF_DADDR_NULL) - return rec_daddr < info->low_daddr; - if (info->low.rm_blockcount) - return xfs_rmap_compare(rec, &info->low) < 0; + return frec->start_daddr < info->low_daddr; + if (info->low.rm_blockcount) { + struct xfs_rmap_irec rec = { + .rm_startblock = frec->rec_key, + .rm_owner = frec->owner, + .rm_flags = frec->rm_flags, + }; + + return xfs_rmap_compare(&rec, &info->low) < 0; + } + return false; } @@ -270,61 +282,36 @@ STATIC int xfs_getfsmap_helper( struct xfs_trans *tp, struct xfs_getfsmap_info *info, - const struct xfs_rmap_irec *rec, - xfs_daddr_t rec_daddr, - xfs_daddr_t len_daddr) + const struct xfs_fsmap_irec *frec) { struct xfs_fsmap fmr; struct xfs_mount *mp = tp->t_mountp; bool shared; - int error; + int error = 0; if (fatal_signal_pending(current)) return -EINTR; - if (len_daddr == 0) - len_daddr = XFS_FSB_TO_BB(mp, rec->rm_blockcount); - /* * Filter out records that start before our startpoint, if the * caller requested that. */ - if (xfs_getfsmap_rec_before_start(info, rec, rec_daddr)) { - rec_daddr += len_daddr; - if (info->next_daddr < rec_daddr) - info->next_daddr = rec_daddr; - return 0; - } - - /* - * For an info->last query, we're looking for a gap between the last - * mapping emitted and the high key specified by userspace. If the - * user's query spans less than 1 fsblock, then info->high and - * info->low will have the same rm_startblock, which causes rec_daddr - * and next_daddr to be the same. Therefore, use the end_daddr that - * we calculated from userspace's high key to synthesize the record. - * Note that if the btree query found a mapping, there won't be a gap. - */ - if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) - rec_daddr = info->end_daddr; + if (xfs_getfsmap_frec_before_start(info, frec)) + goto out; /* Are we just counting mappings? */ if (info->head->fmh_count == 0) { if (info->head->fmh_entries == UINT_MAX) return -ECANCELED; - if (rec_daddr > info->next_daddr) + if (frec->start_daddr > info->next_daddr) info->head->fmh_entries++; if (info->last) return 0; info->head->fmh_entries++; - - rec_daddr += len_daddr; - if (info->next_daddr < rec_daddr) - info->next_daddr = rec_daddr; - return 0; + goto out; } /* @@ -332,7 +319,7 @@ xfs_getfsmap_helper( * then we've found a gap. Report the gap as being owned by * whatever the caller specified is the missing owner. */ - if (rec_daddr > info->next_daddr) { + if (frec->start_daddr > info->next_daddr) { if (info->head->fmh_entries >= info->head->fmh_count) return -ECANCELED; @@ -340,7 +327,7 @@ xfs_getfsmap_helper( fmr.fmr_physical = info->next_daddr; fmr.fmr_owner = info->missing_owner; fmr.fmr_offset = 0; - fmr.fmr_length = rec_daddr - info->next_daddr; + fmr.fmr_length = frec->start_daddr - info->next_daddr; fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; xfs_getfsmap_format(mp, &fmr, info); } @@ -353,23 +340,24 @@ xfs_getfsmap_helper( return -ECANCELED; trace_xfs_fsmap_mapping(mp, info->dev, - info->pag ? info->pag->pag_agno : NULLAGNUMBER, rec); + info->group ? 
info->group->xg_gno : NULLAGNUMBER, + frec); fmr.fmr_device = info->dev; - fmr.fmr_physical = rec_daddr; - error = xfs_fsmap_owner_from_rmap(&fmr, rec); + fmr.fmr_physical = frec->start_daddr; + error = xfs_fsmap_owner_from_frec(&fmr, frec); if (error) return error; - fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset); - fmr.fmr_length = len_daddr; - if (rec->rm_flags & XFS_RMAP_UNWRITTEN) + fmr.fmr_offset = XFS_FSB_TO_BB(mp, frec->offset); + fmr.fmr_length = frec->len_daddr; + if (frec->rm_flags & XFS_RMAP_UNWRITTEN) fmr.fmr_flags |= FMR_OF_PREALLOC; - if (rec->rm_flags & XFS_RMAP_ATTR_FORK) + if (frec->rm_flags & XFS_RMAP_ATTR_FORK) fmr.fmr_flags |= FMR_OF_ATTR_FORK; - if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) + if (frec->rm_flags & XFS_RMAP_BMBT_BLOCK) fmr.fmr_flags |= FMR_OF_EXTENT_MAP; if (fmr.fmr_flags == 0) { - error = xfs_getfsmap_is_shared(tp, info, rec, &shared); + error = xfs_getfsmap_is_shared(tp, info, frec, &shared); if (error) return error; if (shared) @@ -378,28 +366,55 @@ xfs_getfsmap_helper( xfs_getfsmap_format(mp, &fmr, info); out: - rec_daddr += len_daddr; - if (info->next_daddr < rec_daddr) - info->next_daddr = rec_daddr; + info->next_daddr = max(info->next_daddr, + frec->start_daddr + frec->len_daddr); return 0; } +static inline int +xfs_getfsmap_group_helper( + struct xfs_getfsmap_info *info, + struct xfs_trans *tp, + struct xfs_group *xg, + xfs_agblock_t startblock, + xfs_extlen_t blockcount, + struct xfs_fsmap_irec *frec) +{ + /* + * For an info->last query, we're looking for a gap between the last + * mapping emitted and the high key specified by userspace. If the + * user's query spans less than 1 fsblock, then info->high and + * info->low will have the same rm_startblock, which causes rec_daddr + * and next_daddr to be the same. Therefore, use the end_daddr that + * we calculated from userspace's high key to synthesize the record. + * Note that if the btree query found a mapping, there won't be a gap. 
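xfs_getfsmap_helper() above reports holes by comparing each record's start against a running cursor: anything between next_daddr and the record start is emitted as a synthetic record for the configured missing owner, then the cursor advances past the record. A small model of that cursor logic:

    #include <stdio.h>

    /* Emit a synthetic record for any hole before the next real mapping. */
    static void report_record(unsigned long long *next_daddr,
                              unsigned long long rec_start,
                              unsigned long long rec_len)
    {
            if (rec_start > *next_daddr)
                    printf("gap: [%llu, %llu) owner=missing\n",
                           *next_daddr, rec_start);
            printf("mapping: [%llu, %llu)\n", rec_start, rec_start + rec_len);

            /* advance the cursor past whatever was just reported */
            if (rec_start + rec_len > *next_daddr)
                    *next_daddr = rec_start + rec_len;
    }

    int main(void)
    {
            unsigned long long cursor = 0;
            report_record(&cursor, 8, 16);  /* emits one gap, one mapping */
            return 0;
    }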
+ */ + if (info->last) + frec->start_daddr = info->end_daddr + 1; + else + frec->start_daddr = xfs_gbno_to_daddr(xg, startblock); + + frec->len_daddr = XFS_FSB_TO_BB(xg->xg_mount, blockcount); + return xfs_getfsmap_helper(tp, info, frec); +} + /* Transform a rmapbt irec into a fsmap */ STATIC int -xfs_getfsmap_datadev_helper( +xfs_getfsmap_rmapbt_helper( struct xfs_btree_cur *cur, const struct xfs_rmap_irec *rec, void *priv) { - struct xfs_mount *mp = cur->bc_mp; + struct xfs_fsmap_irec frec = { + .owner = rec->rm_owner, + .offset = rec->rm_offset, + .rm_flags = rec->rm_flags, + .rec_key = rec->rm_startblock, + }; struct xfs_getfsmap_info *info = priv; - xfs_fsblock_t fsb; - xfs_daddr_t rec_daddr; - - fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock); - rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); - return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr, 0); + return xfs_getfsmap_group_helper(info, cur->bc_tp, cur->bc_group, + rec->rm_startblock, rec->rm_blockcount, &frec); } /* Transform a bnobt irec into a fsmap */ @@ -409,21 +424,14 @@ xfs_getfsmap_datadev_bnobt_helper( const struct xfs_alloc_rec_incore *rec, void *priv) { - struct xfs_mount *mp = cur->bc_mp; + struct xfs_fsmap_irec frec = { + .owner = XFS_RMAP_OWN_NULL, /* "free" */ + .rec_key = rec->ar_startblock, + }; struct xfs_getfsmap_info *info = priv; - struct xfs_rmap_irec irec; - xfs_daddr_t rec_daddr; - - rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.pag->pag_agno, - rec->ar_startblock); - irec.rm_startblock = rec->ar_startblock; - irec.rm_blockcount = rec->ar_blockcount; - irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ - irec.rm_offset = 0; - irec.rm_flags = 0; - - return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr, 0); + return xfs_getfsmap_group_helper(info, cur->bc_tp, cur->bc_group, + rec->ar_startblock, rec->ar_blockcount, &frec); } /* Set rmap flags based on the getfsmap flags */ @@ -467,12 +475,11 @@ __xfs_getfsmap_datadev( void *priv) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; + struct xfs_perag *pag = NULL; struct xfs_btree_cur *bt_cur = NULL; xfs_fsblock_t start_fsb; xfs_fsblock_t end_fsb; - xfs_agnumber_t start_ag; - xfs_agnumber_t end_ag; + xfs_agnumber_t start_ag, end_ag; uint64_t eofs; int error = 0; @@ -520,13 +527,13 @@ __xfs_getfsmap_datadev( start_ag = XFS_FSB_TO_AGNO(mp, start_fsb); end_ag = XFS_FSB_TO_AGNO(mp, end_fsb); - for_each_perag_range(mp, start_ag, end_ag, pag) { + while ((pag = xfs_perag_next_range(mp, pag, start_ag, end_ag))) { /* * Set the AG high key from the fsmap high key if this * is the last AG that we're querying. */ - info->pag = pag; - if (pag->pag_agno == end_ag) { + info->group = pag_group(pag); + if (pag_agno(pag) == end_ag) { info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsb); info->high.rm_offset = XFS_BB_TO_FSBT(mp, @@ -548,9 +555,9 @@ __xfs_getfsmap_datadev( if (error) break; - trace_xfs_fsmap_low_key(mp, info->dev, pag->pag_agno, + trace_xfs_fsmap_low_group_key(mp, info->dev, pag_agno(pag), &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, pag->pag_agno, + trace_xfs_fsmap_high_group_key(mp, info->dev, pag_agno(pag), &info->high); error = query_fn(tp, info, &bt_cur, priv); @@ -561,7 +568,7 @@ __xfs_getfsmap_datadev( * Set the AG low key to the start of the AG prior to * moving on to the next AG. 
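The per-AG walk here follows a keyed-iteration pattern that the realtime variant reuses later: only the first group honours the caller's low key, only the last group is clamped by the user's high key, and every group in between scans its full range. A sketch of that shape, with a stand-in query function:

    #include <stdio.h>

    struct key { unsigned long long startblock; };

    /* Stand-in for one btree query bounded by [lo, hi]. */
    static void query_group(int gno, const struct key *lo,
                            const struct key *hi)
    {
            printf("group %d: query [%llu, %llu]\n", gno,
                   lo->startblock, hi->startblock);
    }

    static void walk_groups(int start, int end, struct key lo,
                            struct key hi_user)
    {
            struct key hi_max = { ~0ULL };

            for (int gno = start; gno <= end; gno++) {
                    query_group(gno, &lo, gno == end ? &hi_user : &hi_max);
                    if (gno == start)
                            lo.startblock = 0; /* later groups scan from 0 */
            }
    }

    int main(void)
    {
            struct key lo = { 100 }, hi = { 5000 };
            walk_groups(2, 4, lo, hi);
            return 0;
    }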
*/ - if (pag->pag_agno == start_ag) + if (pag_agno(pag) == start_ag) memset(&info->low, 0, sizeof(info->low)); /* @@ -569,13 +576,13 @@ __xfs_getfsmap_datadev( * before we drop the reference to the perag when the loop * terminates. */ - if (pag->pag_agno == end_ag) { + if (pag_agno(pag) == end_ag) { info->last = true; error = query_fn(tp, info, &bt_cur, priv); if (error) break; } - info->pag = NULL; + info->group = NULL; } if (bt_cur) @@ -585,9 +592,9 @@ __xfs_getfsmap_datadev( xfs_trans_brelse(tp, info->agf_bp); info->agf_bp = NULL; } - if (info->pag) { - xfs_perag_rele(info->pag); - info->pag = NULL; + if (info->group) { + xfs_perag_rele(pag); + info->group = NULL; } else if (pag) { /* loop termination case */ xfs_perag_rele(pag); @@ -606,13 +613,13 @@ xfs_getfsmap_datadev_rmapbt_query( { /* Report any gap at the end of the last AG. */ if (info->last) - return xfs_getfsmap_datadev_helper(*curpp, &info->high, info); + return xfs_getfsmap_rmapbt_helper(*curpp, &info->high, info); /* Allocate cursor for this AG and query_range it. */ *curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->pag); + to_perag(info->group)); return xfs_rmap_query_range(*curpp, &info->low, &info->high, - xfs_getfsmap_datadev_helper, info); + xfs_getfsmap_rmapbt_helper, info); } /* Execute a getfsmap query against the regular data device rmapbt. */ @@ -643,7 +650,7 @@ xfs_getfsmap_datadev_bnobt_query( /* Allocate cursor for this AG and query_range it. */ *curpp = xfs_bnobt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->pag); + to_perag(info->group)); key->ar_startblock = info->low.rm_startblock; key[1].ar_startblock = info->high.rm_startblock; return xfs_alloc_query_range(*curpp, key, &key[1], @@ -672,9 +679,12 @@ xfs_getfsmap_logdev( const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { + struct xfs_fsmap_irec frec = { + .start_daddr = 0, + .rec_key = 0, + .owner = XFS_RMAP_OWN_LOG, + }; struct xfs_mount *mp = tp->t_mountp; - struct xfs_rmap_irec rmap; - xfs_daddr_t rec_daddr, len_daddr; xfs_fsblock_t start_fsb, end_fsb; uint64_t eofs; @@ -689,51 +699,52 @@ xfs_getfsmap_logdev( if (keys[0].fmr_length > 0) info->low_daddr = XFS_FSB_TO_BB(mp, start_fsb); - trace_xfs_fsmap_low_key_linear(mp, info->dev, start_fsb); - trace_xfs_fsmap_high_key_linear(mp, info->dev, end_fsb); + trace_xfs_fsmap_low_linear_key(mp, info->dev, start_fsb); + trace_xfs_fsmap_high_linear_key(mp, info->dev, end_fsb); if (start_fsb > 0) return 0; /* Fabricate an rmap entry for the external log device. 
*/ - rmap.rm_startblock = 0; - rmap.rm_blockcount = mp->m_sb.sb_logblocks; - rmap.rm_owner = XFS_RMAP_OWN_LOG; - rmap.rm_offset = 0; - rmap.rm_flags = 0; - - rec_daddr = XFS_FSB_TO_BB(mp, rmap.rm_startblock); - len_daddr = XFS_FSB_TO_BB(mp, rmap.rm_blockcount); - return xfs_getfsmap_helper(tp, info, &rmap, rec_daddr, len_daddr); + frec.len_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + return xfs_getfsmap_helper(tp, info, &frec); } #ifdef CONFIG_XFS_RT /* Transform a rtbitmap "record" into a fsmap */ STATIC int xfs_getfsmap_rtdev_rtbitmap_helper( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) { + struct xfs_fsmap_irec frec = { + .owner = XFS_RMAP_OWN_NULL, /* "free" */ + }; + struct xfs_mount *mp = rtg_mount(rtg); struct xfs_getfsmap_info *info = priv; - struct xfs_rmap_irec irec; - xfs_rtblock_t rtbno; - xfs_daddr_t rec_daddr, len_daddr; + xfs_rtblock_t start_rtb = + xfs_rtx_to_rtb(rtg, rec->ar_startext); + uint64_t rtbcount = + xfs_rtbxlen_to_blen(mp, rec->ar_extcount); - rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); - rec_daddr = XFS_FSB_TO_BB(mp, rtbno); - irec.rm_startblock = rtbno; - - rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount); - len_daddr = XFS_FSB_TO_BB(mp, rtbno); - irec.rm_blockcount = rtbno; - - irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ - irec.rm_offset = 0; - irec.rm_flags = 0; + /* + * For an info->last query, we're looking for a gap between the last + * mapping emitted and the high key specified by userspace. If the + * user's query spans less than 1 fsblock, then info->high and + * info->low will have the same rm_startblock, which causes rec_daddr + * and next_daddr to be the same. Therefore, use the end_daddr that + * we calculated from userspace's high key to synthesize the record. + * Note that if the btree query found a mapping, there won't be a gap. + */ + if (info->last) + frec.start_daddr = info->end_daddr + 1; + else + frec.start_daddr = xfs_rtb_to_daddr(mp, start_rtb); - return xfs_getfsmap_helper(tp, info, &irec, rec_daddr, len_daddr); + frec.len_daddr = XFS_FSB_TO_BB(mp, rtbcount); + return xfs_getfsmap_helper(tp, info, &frec); } /* Execute a getfsmap query against the realtime device rtbitmap. */ @@ -743,78 +754,327 @@ xfs_getfsmap_rtdev_rtbitmap( const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { - - struct xfs_rtalloc_rec ahigh = { 0 }; struct xfs_mount *mp = tp->t_mountp; - xfs_rtblock_t start_rtb; - xfs_rtblock_t end_rtb; - xfs_rtxnum_t high; + xfs_rtblock_t start_rtbno, end_rtbno; + xfs_rtxnum_t start_rtx, end_rtx; + xfs_rgnumber_t start_rgno, end_rgno; + struct xfs_rtgroup *rtg = NULL; uint64_t eofs; int error; - eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents)); + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); if (keys[0].fmr_physical >= eofs) return 0; - start_rtb = XFS_BB_TO_FSBT(mp, - keys[0].fmr_physical + keys[0].fmr_length); - end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); info->missing_owner = XFS_FMR_OWN_UNKNOWN; /* Adjust the low key if we are continuing from where we left off. 
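The rtbitmap helper above is mostly unit conversion: free extents are recorded in rt extents, scaled to filesystem blocks by the rt extent size, then to 512-byte daddrs for the fsmap record. A worked sketch of the chain, ignoring rtgroup translation and the zoned rtstart offset:

    #include <stdio.h>

    /* rt extents -> rt blocks -> daddrs: 1 rt extent = rextsize fs blocks,
     * 1 fs block = 1 << (blocklog - 9) sectors. */
    int main(void)
    {
            unsigned int rextsize = 4;      /* fs blocks per rt extent */
            unsigned int blocklog = 12;     /* 4k fs blocks */
            unsigned long long startext = 100, extcount = 8;

            unsigned long long start_rtb = startext * rextsize;
            unsigned long long daddr = start_rtb << (blocklog - 9);
            unsigned long long len_bb =
                    (extcount * rextsize) << (blocklog - 9);

            printf("free extent at daddr %llu, %llu sectors\n",
                   daddr, len_bb);
            return 0;
    }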
*/ + start_rtbno = xfs_daddr_to_rtb(mp, + keys[0].fmr_physical + keys[0].fmr_length); if (keys[0].fmr_length > 0) { - info->low_daddr = XFS_FSB_TO_BB(mp, start_rtb); + info->low_daddr = xfs_rtb_to_daddr(mp, start_rtbno); if (info->low_daddr >= eofs) return 0; } + start_rtx = xfs_rtb_to_rtx(mp, start_rtbno); + start_rgno = xfs_rtb_to_rgno(mp, start_rtbno); + + end_rtbno = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); + end_rgno = xfs_rtb_to_rgno(mp, end_rtbno); + + trace_xfs_fsmap_low_linear_key(mp, info->dev, start_rtbno); + trace_xfs_fsmap_high_linear_key(mp, info->dev, end_rtbno); + + end_rtx = -1ULL; + + while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) { + if (rtg_rgno(rtg) == end_rgno) + end_rtx = xfs_rtb_to_rtx(mp, + end_rtbno + mp->m_sb.sb_rextsize - 1); + + info->group = rtg_group(rtg); + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + error = xfs_rtalloc_query_range(rtg, tp, start_rtx, end_rtx, + xfs_getfsmap_rtdev_rtbitmap_helper, info); + if (error) + break; + + /* + * Report any gaps at the end of the rtbitmap by simulating a + * zero-length free extent starting at the rtx after the end + * of the query range. + */ + if (rtg_rgno(rtg) == end_rgno) { + struct xfs_rtalloc_rec ahigh = { + .ar_startext = min(end_rtx + 1, + rtg->rtg_extents), + }; + + info->last = true; + error = xfs_getfsmap_rtdev_rtbitmap_helper(rtg, tp, + &ahigh, info); + if (error) + break; + } + + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + info->group = NULL; + start_rtx = 0; + } + + /* loop termination case */ + if (rtg) { + if (info->group) { + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + info->group = NULL; + } + xfs_rtgroup_rele(rtg); + } + + return error; +} + +/* Transform a realtime rmapbt record into a fsmap */ +STATIC int +xfs_getfsmap_rtdev_rmapbt_helper( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_fsmap_irec frec = { + .owner = rec->rm_owner, + .offset = rec->rm_offset, + .rm_flags = rec->rm_flags, + .rec_key = rec->rm_startblock, + }; + struct xfs_getfsmap_info *info = priv; + + return xfs_getfsmap_group_helper(info, cur->bc_tp, cur->bc_group, + rec->rm_startblock, rec->rm_blockcount, &frec); +} + +/* Actually query the rtrmap btree. */ +STATIC int +xfs_getfsmap_rtdev_rmapbt_query( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_btree_cur **curpp) +{ + struct xfs_rtgroup *rtg = to_rtg(info->group); + + /* Query the rtrmapbt */ + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT); + *curpp = xfs_rtrmapbt_init_cursor(tp, rtg); + return xfs_rmap_query_range(*curpp, &info->low, &info->high, + xfs_getfsmap_rtdev_rmapbt_helper, info); +} - trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb); - trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb); +/* Execute a getfsmap query against the realtime device rmapbt. 
*/ +STATIC int +xfs_getfsmap_rtdev_rmapbt( + struct xfs_trans *tp, + const struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + struct xfs_fsmap key0 = *keys; /* struct copy */ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtgroup *rtg = NULL; + struct xfs_btree_cur *bt_cur = NULL; + xfs_daddr_t rtstart_daddr; + xfs_rtblock_t start_rtb; + xfs_rtblock_t end_rtb; + xfs_rgnumber_t start_rg, end_rg; + uint64_t eofs; + int error = 0; - xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks); + if (key0.fmr_physical >= eofs) + return 0; /* - * Set up query parameters to return free rtextents covering the range - * we want. + * On zoned filesystems with an internal rt volume, the volume comes + * immediately after the end of the data volume. However, the + * xfs_rtblock_t address space is relative to the start of the data + * device, which means that the first @rtstart fsblocks do not actually + * point anywhere. If a fsmap query comes in with the low key starting + * below @rtstart, report it as "owned by filesystem". */ - high = xfs_rtb_to_rtxup(mp, end_rtb); - error = xfs_rtalloc_query_range(mp, tp, xfs_rtb_to_rtx(mp, start_rtb), - high, xfs_getfsmap_rtdev_rtbitmap_helper, info); - if (error) - goto err; + rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart); + if (xfs_has_zoned(mp) && key0.fmr_physical < rtstart_daddr) { + struct xfs_fsmap_irec frec = { + .owner = XFS_RMAP_OWN_FS, + .len_daddr = rtstart_daddr, + }; + + /* + * Adjust the start of the query range if we're picking up from + * a previous round, and only emit the record if we haven't + * already gone past. + */ + key0.fmr_physical += key0.fmr_length; + if (key0.fmr_physical < rtstart_daddr) { + error = xfs_getfsmap_helper(tp, info, &frec); + if (error) + return error; + + key0.fmr_physical = rtstart_daddr; + } + + /* Zero the other fields to avoid further adjustments. */ + key0.fmr_owner = 0; + key0.fmr_offset = 0; + key0.fmr_length = 0; + } + + start_rtb = xfs_daddr_to_rtb(mp, key0.fmr_physical); + end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); + info->missing_owner = XFS_FMR_OWN_FREE; /* - * Report any gaps at the end of the rtbitmap by simulating a null - * rmap starting at the block after the end of the query range. + * Convert the fsmap low/high keys to rtgroup based keys. Initialize + * low to the fsmap low key and max out the high key to the end + * of the rtgroup. */ - info->last = true; - ahigh.ar_startext = min(mp->m_sb.sb_rextents, high); - - error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info); + info->low.rm_offset = XFS_BB_TO_FSBT(mp, key0.fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->low, &key0); if (error) - goto err; -err: - xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + return error; + info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, key0.fmr_length); + xfs_getfsmap_set_irec_flags(&info->low, &key0); + + /* Adjust the low key if we are continuing from where we left off. 
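For zoned filesystems the rmapbt query above has to paper over a quirk of the address space: the first sb_rtstart blocks of an internal rt volume alias the data device and own no rt space, so they are reported as one synthetic owned-by-filesystem record before the real query starts. A sketch of that front matter, assuming daddr units throughout:

    #include <stdio.h>

    /* Report the unaddressable [0, rtstart) range as owned-by-fs, then
     * clamp the low key so the real rmap query starts at rtstart. */
    static unsigned long long
    clamp_low_key(unsigned long long low_daddr,
                  unsigned long long rtstart_daddr)
    {
            if (low_daddr < rtstart_daddr) {
                    printf("synthetic: [0, %llu) owner=FS\n", rtstart_daddr);
                    return rtstart_daddr;
            }
            return low_daddr;
    }

    int main(void)
    {
            printf("query resumes at %llu\n", clamp_low_key(0, 4096));
            return 0;
    }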
*/ + if (info->low.rm_blockcount == 0) { + /* No previous record from which to continue */ + } else if (rmap_not_shareable(mp, &info->low)) { + /* Last record seen was an unshareable extent */ + info->low.rm_owner = 0; + info->low.rm_offset = 0; + + start_rtb += info->low.rm_blockcount; + if (xfs_rtb_to_daddr(mp, start_rtb) >= eofs) + return 0; + } else { + /* Last record seen was a shareable file data extent */ + info->low.rm_offset += info->low.rm_blockcount; + } + info->low.rm_startblock = xfs_rtb_to_rgbno(mp, start_rtb); + + info->high.rm_startblock = -1U; + info->high.rm_owner = ULLONG_MAX; + info->high.rm_offset = ULLONG_MAX; + info->high.rm_blockcount = 0; + info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; + + start_rg = xfs_rtb_to_rgno(mp, start_rtb); + end_rg = xfs_rtb_to_rgno(mp, end_rtb); + + while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rg, end_rg))) { + /* + * Set the rtgroup high key from the fsmap high key if this + * is the last rtgroup that we're querying. + */ + info->group = rtg_group(rtg); + if (rtg_rgno(rtg) == end_rg) { + info->high.rm_startblock = + xfs_rtb_to_rgbno(mp, end_rtb); + info->high.rm_offset = + XFS_BB_TO_FSBT(mp, keys[1].fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); + if (error) + break; + xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); + } + + if (bt_cur) { + xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group), + XFS_RTGLOCK_RMAP | + XFS_RTGLOCK_REFCOUNT); + xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR); + bt_cur = NULL; + } + + trace_xfs_fsmap_low_group_key(mp, info->dev, rtg_rgno(rtg), + &info->low); + trace_xfs_fsmap_high_group_key(mp, info->dev, rtg_rgno(rtg), + &info->high); + + error = xfs_getfsmap_rtdev_rmapbt_query(tp, info, &bt_cur); + if (error) + break; + + /* + * Set the rtgroup low key to the start of the rtgroup prior to + * moving on to the next rtgroup. + */ + if (rtg_rgno(rtg) == start_rg) + memset(&info->low, 0, sizeof(info->low)); + + /* + * If this is the last rtgroup, report any gap at the end of it + * before we drop the reference to the rtgroup when the loop + * terminates. + */ + if (rtg_rgno(rtg) == end_rg) { + info->last = true; + error = xfs_getfsmap_rtdev_rmapbt_helper(bt_cur, + &info->high, info); + if (error) + break; + } + info->group = NULL; + } + + if (bt_cur) { + xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group), + XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT); + xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR : + XFS_BTREE_NOERROR); + } + + /* loop termination case */ + if (rtg) { + info->group = NULL; + xfs_rtgroup_rele(rtg); + } + return error; } #endif /* CONFIG_XFS_RT */ +static uint32_t +xfs_getfsmap_device( + struct xfs_mount *mp, + enum xfs_device dev) +{ + if (mp->m_sb.sb_rtstart) + return dev; + + switch (dev) { + case XFS_DEV_DATA: + return new_encode_dev(mp->m_ddev_targp->bt_dev); + case XFS_DEV_LOG: + return new_encode_dev(mp->m_logdev_targp->bt_dev); + case XFS_DEV_RT: + if (!mp->m_rtdev_targp) + break; + return new_encode_dev(mp->m_rtdev_targp->bt_dev); + } + + return -1; +} + /* Do we recognize the device? 
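Within the function above, the three-way branch on info->low.rm_blockcount encodes the continuation rules for a resumed query: nothing to skip on a fresh query; an unshareable extent is skipped wholesale because only one mapping can exist; shared file data may reappear under other owners, so only the owner-relative offset advances. The same logic in isolation, with invented types:

#include <stdbool.h>
#include <stdint.h>

struct rmap_key {
	uint64_t startblock;
	uint64_t owner;
	uint64_t offset;
	uint64_t blockcount;
};

static void advance_low_key(struct rmap_key *low, bool shareable)
{
	if (low->blockcount == 0)
		return;			/* fresh query, nothing to skip */

	if (!shareable) {
		/* only one mapping can exist: skip the whole extent */
		low->owner = 0;
		low->offset = 0;
		low->startblock += low->blockcount;
		return;
	}

	/* the same blocks may be mapped by other owners too, so keep
	 * startblock and advance only within this owner's range */
	low->offset += low->blockcount;
}

int main(void)
{
	struct rmap_key low = { .startblock = 8, .blockcount = 4 };

	advance_low_key(&low, true);
	return low.offset == 4 ? 0 : 1;
}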
*/ STATIC bool xfs_getfsmap_is_valid_device( struct xfs_mount *mp, struct xfs_fsmap *fm) { - if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || - fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev)) - return true; - if (mp->m_logdev_targp && - fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev)) - return true; - if (mp->m_rtdev_targp && - fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev)) - return true; - return false; + return fm->fmr_device == 0 || + fm->fmr_device == UINT_MAX || + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) || + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) || + (mp->m_rtdev_targp && + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT)); } /* Ensure that the low key is less than the high key. */ @@ -898,7 +1158,10 @@ xfs_getfsmap( struct xfs_trans *tp = NULL; struct xfs_fsmap dkeys[2]; /* per-dev keys */ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS]; - struct xfs_getfsmap_info info = { NULL }; + struct xfs_getfsmap_info info = { + .fsmap_recs = fsmap_recs, + .head = head, + }; bool use_rmap; int i; int error = 0; @@ -918,7 +1181,7 @@ xfs_getfsmap( /* Set up our device handlers. */ memset(handlers, 0, sizeof(handlers)); handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); - handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); + handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA); if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else @@ -926,14 +1189,21 @@ xfs_getfsmap( if (mp->m_logdev_targp != mp->m_ddev_targp) { handlers[1].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); - handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); + handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG); handlers[1].fn = xfs_getfsmap_logdev; } #ifdef CONFIG_XFS_RT - if (mp->m_rtdev_targp) { + /* + * For zoned file systems there is no rtbitmap, so only support fsmap + * if the caller is privileged enough to use the full rmap version. + */ + if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) { handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); - handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); - handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; + handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT); + if (use_rmap) + handlers[2].fn = xfs_getfsmap_rtdev_rmapbt; + else + handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; } #endif /* CONFIG_XFS_RT */ @@ -963,9 +1233,6 @@ xfs_getfsmap( info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; - info.end_daddr = XFS_BUF_DADDR_NULL; - info.fsmap_recs = fsmap_recs; - info.head = head; /* For each device we support... */ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) { @@ -978,17 +1245,23 @@ xfs_getfsmap( break; /* - * If this device number matches the high key, we have - * to pass the high key to the handler to limit the - * query results. If the device number exceeds the - * low key, zero out the low key so that we get - * everything from the beginning. + * If this device number matches the high key, we have to pass + * the high key to the handler to limit the query results, and + * set the end_daddr so that we can synthesize records at the + * end of the query range or device. 
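The xfs_getfsmap_device() helper introduced above is what keeps the three logical devices distinguishable on a single-bdev zoned filesystem: with an internal rt volume (sb_rtstart != 0) it reports the enum value itself instead of an encoded dev_t, which is also why the code later skips setting FMH_OF_DEV_T in that case. A rough user-space equivalent, all values illustrative:

#include <stdint.h>
#include <stdio.h>

enum dev_id { DEV_DATA = 0, DEV_LOG = 1, DEV_RT = 2 };

static uint32_t map_device(int internal_rt, enum dev_id dev,
			   uint32_t ddev, uint32_t logdev, uint32_t rtdev)
{
	if (internal_rt)	/* one bdev: report the enum itself */
		return dev;

	switch (dev) {		/* separate bdevs: real dev_t values */
	case DEV_DATA:
		return ddev;
	case DEV_LOG:
		return logdev;
	case DEV_RT:
		if (rtdev)
			return rtdev;
		break;
	}
	return (uint32_t)-1;
}

int main(void)
{
	printf("%u %u\n", map_device(1, DEV_RT, 0x801, 0x801, 0),
			map_device(0, DEV_DATA, 0x801, 0x810, 0x820));
	return 0;
}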
*/ if (handlers[i].dev == head->fmh_keys[1].fmr_device) { dkeys[1] = head->fmh_keys[1]; info.end_daddr = min(handlers[i].nr_sectors - 1, dkeys[1].fmr_physical); + } else { + info.end_daddr = handlers[i].nr_sectors - 1; } + + /* + * If the device number exceeds the low key, zero out the low + * key so that we get everything from the beginning. + */ if (handlers[i].dev > head->fmh_keys[0].fmr_device) memset(&dkeys[0], 0, sizeof(struct xfs_fsmap)); @@ -1003,7 +1276,7 @@ xfs_getfsmap( info.dev = handlers[i].dev; info.last = false; - info.pag = NULL; + info.group = NULL; info.low_daddr = XFS_BUF_DADDR_NULL; info.low.rm_blockcount = 0; error = handlers[i].fn(tp, dkeys, &info); @@ -1016,7 +1289,13 @@ xfs_getfsmap( if (tp) xfs_trans_cancel(tp); - head->fmh_oflags = FMH_OF_DEV_T; + + /* + * For internal RT device we need to report different synthetic devices + * for a single physical device, and thus can't report the actual dev_t. + */ + if (!mp->m_sb.sb_rtstart) + head->fmh_oflags = FMH_OF_DEV_T; return error; } diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h index a0bcc38486a5..06e492fd479d 100644 --- a/fs/xfs/xfs_fsmap.h +++ b/fs/xfs/xfs_fsmap.h @@ -28,6 +28,21 @@ struct xfs_fsmap_head { struct xfs_fsmap fmh_keys[2]; /* low and high keys */ }; +/* internal fsmap record format */ +struct xfs_fsmap_irec { + xfs_daddr_t start_daddr; + xfs_daddr_t len_daddr; + uint64_t owner; /* extent owner */ + uint64_t offset; /* offset within the owner */ + unsigned int rm_flags; /* rmap state flags */ + + /* + * rmapbt startblock corresponding to start_daddr, if the record came + * from an rmap btree. + */ + xfs_agblock_t rec_key; +}; + int xfs_ioc_getfsmap(struct xfs_inode *ip, struct fsmap_head __user *arg); #endif /* __XFS_FSMAP_H__ */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index b247d895c276..0ada73569394 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -21,6 +21,10 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_trace.h" +#include "xfs_rtalloc.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_metafile.h" /* * Write new AG headers to disk. Non-transactional, but need to be @@ -107,12 +111,18 @@ xfs_growfs_data_private( if (nb > mp->m_sb.sb_dblocks) { error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), &bp, NULL); if (error) return error; xfs_buf_relse(bp); } + /* Make sure the new fs size won't cause problems with the log. */ + error = xfs_growfs_check_rtgeom(mp, nb, mp->m_sb.sb_rblocks, + mp->m_sb.sb_rextsize); + if (error) + return error; + nb_div = nb; nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); if (nb_mod && nb_mod >= XFS_MIN_AG_BLOCKS) @@ -162,9 +172,7 @@ xfs_growfs_data_private( error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, delta, last_pag, &lastag_extended); } else { - xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK, - "EXPERIMENTAL online shrink feature in use. Use at your own risk!"); - + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SHRINK); error = xfs_ag_shrink_space(last_pag, &tp, -delta); } xfs_perag_put(last_pag); @@ -222,7 +230,12 @@ xfs_growfs_data_private( error = xfs_fs_reserve_ag_blocks(mp); if (error == -ENOSPC) error = 0; + + /* Compute new maxlevels for rt btrees. 
*/ + xfs_rtrmapbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); } + return error; out_trans_cancel: @@ -288,24 +301,30 @@ xfs_growfs_data( struct xfs_mount *mp, struct xfs_growfs_data *in) { - int error = 0; + int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; + /* we can't grow the data section when an internal RT section exists */ + if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart) { + error = -EINVAL; + goto out_unlock; + } + /* update imaxpct separately to the physical grow of the filesystem */ if (in->imaxpct != mp->m_sb.sb_imax_pct) { error = xfs_growfs_imaxpct(mp, in->imaxpct); if (error) - goto out_error; + goto out_unlock; } if (in->newblocks != mp->m_sb.sb_dblocks) { error = xfs_growfs_data_private(mp, in); if (error) - goto out_error; + goto out_unlock; } /* Post growfs calculations needed to reflect new state in operations */ @@ -319,13 +338,12 @@ xfs_growfs_data( /* Update secondary superblocks now the physical grow has completed */ error = xfs_update_secondary_sbs(mp); -out_error: /* - * Increment the generation unconditionally, the error could be from - * updating the secondary superblocks, in which case the new size - * is live already. + * Increment the generation unconditionally, after trying to update the + * secondary superblocks, as the new size is live already at this point. */ mp->m_generation++; +out_unlock: mutex_unlock(&mp->m_growlock); return error; } @@ -354,6 +372,7 @@ xfs_growfs_log( int xfs_reserve_blocks( struct xfs_mount *mp, + enum xfs_free_counter ctr, uint64_t request) { int64_t lcounter, delta; @@ -361,6 +380,8 @@ xfs_reserve_blocks( int64_t free; int error = 0; + ASSERT(ctr < XC_FREE_NR); + /* * With per-cpu counters, this becomes an interesting problem. we need * to work out if we are freeing or allocation blocks first, then we can @@ -379,16 +400,16 @@ xfs_reserve_blocks( * counters directly since we shouldn't have any problems unreserving * space. */ - if (mp->m_resblks > request) { - lcounter = mp->m_resblks_avail - request; + if (mp->m_free[ctr].res_total > request) { + lcounter = mp->m_free[ctr].res_avail - request; if (lcounter > 0) { /* release unused blocks */ fdblks_delta = lcounter; - mp->m_resblks_avail -= lcounter; + mp->m_free[ctr].res_avail -= lcounter; } - mp->m_resblks = request; + mp->m_free[ctr].res_total = request; if (fdblks_delta) { spin_unlock(&mp->m_sb_lock); - xfs_add_fdblocks(mp, fdblks_delta); + xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } @@ -397,7 +418,7 @@ xfs_reserve_blocks( /* * If the request is larger than the current reservation, reserve the - * blocks before we update the reserve counters. Sample m_fdblocks and + * blocks before we update the reserve counters. Sample m_free and * perform a partial reservation if the request exceeds free space. * * The code below estimates how many blocks it can request from @@ -407,10 +428,10 @@ xfs_reserve_blocks( * space to fill it because mod_fdblocks will refill an undersized * reserve when it can. 
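The comment above (closing just below) is implemented by the free/delta block that follows: the reservation target is raised unconditionally, the pool grows by at most what is free right now, and an undersized pool is topped up later as space is freed. A worked model of that arithmetic with invented numbers and no locking:

#include <stdint.h>
#include <stdio.h>

static int64_t min64(int64_t a, int64_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	int64_t free = 100;	/* usable free blocks sampled right now */
	int64_t res_total = 50;	/* current reservation target */
	int64_t res_avail = 50;	/* blocks actually held in the pool */
	int64_t request = 500;	/* new, larger reservation target */

	int64_t delta = request - res_total;

	res_total = request;	/* the target is raised unconditionally */
	if (delta > 0 && free > 0) {
		/* take only what the free counter can give us today */
		int64_t got = min64(free, delta);

		free -= got;
		res_avail += got;
	}
	printf("total=%lld avail=%lld free=%lld\n", (long long)res_total,
			(long long)res_avail, (long long)free);
	return 0;
}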
*/ - free = percpu_counter_sum(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp); - delta = request - mp->m_resblks; - mp->m_resblks = request; + free = xfs_sum_freecounter_raw(mp, ctr) - + xfs_freecounter_unavailable(mp, ctr); + delta = request - mp->m_free[ctr].res_total; + mp->m_free[ctr].res_total = request; if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block @@ -424,9 +445,9 @@ xfs_reserve_blocks( */ fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); - error = xfs_dec_fdblocks(mp, fdblks_delta, 0); + error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0); if (!error) - xfs_add_fdblocks(mp, fdblks_delta); + xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } out: @@ -528,13 +549,12 @@ int xfs_fs_reserve_ag_blocks( struct xfs_mount *mp) { - xfs_agnumber_t agno; - struct xfs_perag *pag; + struct xfs_perag *pag = NULL; int error = 0; int err2; mp->m_finobt_nores = false; - for_each_perag(mp, agno, pag) { + while ((pag = xfs_perag_next(mp, pag))) { err2 = xfs_ag_resv_init(pag, NULL); if (err2 && !error) error = err2; @@ -544,6 +564,17 @@ xfs_fs_reserve_ag_blocks( xfs_warn(mp, "Error %d reserving per-AG metadata reserve pool.", error); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + + err2 = xfs_metafile_resv_init(mp); + if (err2 && err2 != -ENOSPC) { + xfs_warn(mp, + "Error %d reserving realtime metadata reserve pool.", err2); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + + if (!error) + error = err2; } return error; @@ -556,9 +587,9 @@ void xfs_fs_unreserve_ag_blocks( struct xfs_mount *mp) { - xfs_agnumber_t agno; - struct xfs_perag *pag; + struct xfs_perag *pag = NULL; - for_each_perag(mp, agno, pag) + xfs_metafile_resv_free(mp); + while ((pag = xfs_perag_next(mp, pag))) xfs_ag_resv_free(pag); } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 3e2f73bcf831..9d23c361ef56 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -8,7 +8,8 @@ int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); +int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt, + uint64_t request); int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f18fec0adf66..f6f628c01feb 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -23,8 +23,6 @@ xfs_param_t xfs_params = { .inherit_sync = { 0, 1, 1 }, .inherit_nodump = { 0, 1, 1 }, .inherit_noatim = { 0, 1, 1 }, - .xfs_buf_timer = { 100/2, 1*100, 30*100 }, - .xfs_buf_age = { 1*100, 15*100, 7200*100}, .inherit_nosym = { 0, 0, 1 }, .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c index 49e5e5f04e60..f19fce557354 100644 --- a/fs/xfs/xfs_handle.c +++ b/fs/xfs/xfs_handle.c @@ -85,22 +85,23 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct fd f = EMPTY_FD; struct path path; int error; struct xfs_inode *ip; if (cmd == XFS_IOC_FD_TO_HANDLE) { - f = fdget(hreq->fd); - if (!fd_file(f)) + CLASS(fd, f)(hreq->fd); + + if (fd_empty(f)) return -EBADF; - inode = file_inode(fd_file(f)); + path = fd_file(f)->f_path; + path_get(&path); } else { error = user_path_at(AT_FDCWD, hreq->path, 0, &path); if (error) return error; - inode = d_inode(path.dentry); } + inode = d_inode(path.dentry); 
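The xfs_find_handle() hunk just above swaps a manual fdget()/fdput() pair for a scoped CLASS(fd, f) variable, which is why the cleanup labels disappear here (and again in xfs_ioc_swapext() further down). The same shape is available in plain C via the GCC/Clang cleanup attribute; a freestanding sketch, not the kernel's cleanup.h:

#include <stdio.h>

struct fd { int num; };

static void put_fd(struct fd *f)
{
	if (f->num >= 0)
		printf("released fd %d\n", f->num);
}

/* rough stand-in for the kernel's CLASS(fd, name)(n) */
#define SCOPED_FD(name, n) \
	struct fd name __attribute__((cleanup(put_fd))) = { (n) }

static int use_fd(int n)
{
	SCOPED_FD(f, n);

	if (f.num < 0)
		return -1;	/* fd released automatically here */
	printf("using fd %d\n", f.num);
	return 0;		/* ...and here */
}

int main(void)
{
	return use_fd(3);
}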
ip = XFS_I(inode); /* @@ -134,10 +135,7 @@ xfs_find_handle( error = 0; out_put: - if (cmd == XFS_IOC_FD_TO_HANDLE) - fdput(f); - else - path_put(&path); + path_put(&path); return error; } diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 10f116d093a2..7c541fb373d5 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -18,6 +18,22 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_quota_defs.h" +#include "xfs_rtgroup.h" + +static void +xfs_health_unmount_group( + struct xfs_group *xg, + bool *warn) +{ + unsigned int sick = 0; + unsigned int checked = 0; + + xfs_group_measure_sickness(xg, &sick, &checked); + if (sick) { + trace_xfs_group_unfixed_corruption(xg, sick); + *warn = true; + } +} /* * Warn about metadata corruption that we detected but haven't fixed, and @@ -28,8 +44,8 @@ void xfs_health_unmount( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; unsigned int sick = 0; unsigned int checked = 0; bool warn = false; @@ -38,20 +54,12 @@ xfs_health_unmount( return; /* Measure AG corruption levels. */ - for_each_perag(mp, agno, pag) { - xfs_ag_measure_sickness(pag, &sick, &checked); - if (sick) { - trace_xfs_ag_unfixed_corruption(mp, agno, sick); - warn = true; - } - } + while ((pag = xfs_perag_next(mp, pag))) + xfs_health_unmount_group(pag_group(pag), &warn); - /* Measure realtime volume corruption levels. */ - xfs_rt_measure_sickness(mp, &sick, &checked); - if (sick) { - trace_xfs_rt_unfixed_corruption(mp, sick); - warn = true; - } + /* Measure realtime group corruption levels. */ + while ((rtg = xfs_rtgroup_next(mp, rtg))) + xfs_health_unmount_group(rtg_group(rtg), &warn); /* * Measure fs corruption and keep the sample around for the warning. @@ -150,65 +158,6 @@ xfs_fs_measure_sickness( spin_unlock(&mp->m_sb_lock); } -/* Mark unhealthy realtime metadata. */ -void -xfs_rt_mark_sick( - struct xfs_mount *mp, - unsigned int mask) -{ - ASSERT(!(mask & ~XFS_SICK_RT_ALL)); - trace_xfs_rt_mark_sick(mp, mask); - - spin_lock(&mp->m_sb_lock); - mp->m_rt_sick |= mask; - spin_unlock(&mp->m_sb_lock); -} - -/* Mark realtime metadata as having been checked and found unhealthy by fsck. */ -void -xfs_rt_mark_corrupt( - struct xfs_mount *mp, - unsigned int mask) -{ - ASSERT(!(mask & ~XFS_SICK_RT_ALL)); - trace_xfs_rt_mark_corrupt(mp, mask); - - spin_lock(&mp->m_sb_lock); - mp->m_rt_sick |= mask; - mp->m_rt_checked |= mask; - spin_unlock(&mp->m_sb_lock); -} - -/* Mark a realtime metadata healed. */ -void -xfs_rt_mark_healthy( - struct xfs_mount *mp, - unsigned int mask) -{ - ASSERT(!(mask & ~XFS_SICK_RT_ALL)); - trace_xfs_rt_mark_healthy(mp, mask); - - spin_lock(&mp->m_sb_lock); - mp->m_rt_sick &= ~mask; - if (!(mp->m_rt_sick & XFS_SICK_RT_PRIMARY)) - mp->m_rt_sick &= ~XFS_SICK_RT_SECONDARY; - mp->m_rt_checked |= mask; - spin_unlock(&mp->m_sb_lock); -} - -/* Sample which realtime metadata are unhealthy. */ -void -xfs_rt_measure_sickness( - struct xfs_mount *mp, - unsigned int *sick, - unsigned int *checked) -{ - spin_lock(&mp->m_sb_lock); - *sick = mp->m_rt_sick; - *checked = mp->m_rt_checked; - spin_unlock(&mp->m_sb_lock); -} - /* Mark unhealthy per-ag metadata given a raw AG number. 
*/ void xfs_agno_mark_sick( @@ -226,63 +175,95 @@ xfs_agno_mark_sick( xfs_perag_put(pag); } +static inline void +xfs_group_check_mask( + struct xfs_group *xg, + unsigned int mask) +{ + if (xg->xg_type == XG_TYPE_AG) + ASSERT(!(mask & ~XFS_SICK_AG_ALL)); + else + ASSERT(!(mask & ~XFS_SICK_RG_ALL)); +} + /* Mark unhealthy per-ag metadata. */ void -xfs_ag_mark_sick( - struct xfs_perag *pag, +xfs_group_mark_sick( + struct xfs_group *xg, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_AG_ALL)); - trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask); + xfs_group_check_mask(xg, mask); + trace_xfs_group_mark_sick(xg, mask); - spin_lock(&pag->pag_state_lock); - pag->pag_sick |= mask; - spin_unlock(&pag->pag_state_lock); + spin_lock(&xg->xg_state_lock); + xg->xg_sick |= mask; + spin_unlock(&xg->xg_state_lock); } -/* Mark per-ag metadata as having been checked and found unhealthy by fsck. */ +/* + * Mark per-group metadata as having been checked and found unhealthy by fsck. + */ void -xfs_ag_mark_corrupt( - struct xfs_perag *pag, +xfs_group_mark_corrupt( + struct xfs_group *xg, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_AG_ALL)); - trace_xfs_ag_mark_corrupt(pag->pag_mount, pag->pag_agno, mask); + xfs_group_check_mask(xg, mask); + trace_xfs_group_mark_corrupt(xg, mask); - spin_lock(&pag->pag_state_lock); - pag->pag_sick |= mask; - pag->pag_checked |= mask; - spin_unlock(&pag->pag_state_lock); + spin_lock(&xg->xg_state_lock); + xg->xg_sick |= mask; + xg->xg_checked |= mask; + spin_unlock(&xg->xg_state_lock); } -/* Mark per-ag metadata ok. */ +/* + * Mark per-group metadata ok. + */ void -xfs_ag_mark_healthy( - struct xfs_perag *pag, +xfs_group_mark_healthy( + struct xfs_group *xg, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_AG_ALL)); - trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask); - - spin_lock(&pag->pag_state_lock); - pag->pag_sick &= ~mask; - if (!(pag->pag_sick & XFS_SICK_AG_PRIMARY)) - pag->pag_sick &= ~XFS_SICK_AG_SECONDARY; - pag->pag_checked |= mask; - spin_unlock(&pag->pag_state_lock); + xfs_group_check_mask(xg, mask); + trace_xfs_group_mark_healthy(xg, mask); + + spin_lock(&xg->xg_state_lock); + xg->xg_sick &= ~mask; + if (!(xg->xg_sick & XFS_SICK_AG_PRIMARY)) + xg->xg_sick &= ~XFS_SICK_AG_SECONDARY; + xg->xg_checked |= mask; + spin_unlock(&xg->xg_state_lock); } /* Sample which per-ag metadata are unhealthy. */ void -xfs_ag_measure_sickness( - struct xfs_perag *pag, +xfs_group_measure_sickness( + struct xfs_group *xg, unsigned int *sick, unsigned int *checked) { - spin_lock(&pag->pag_state_lock); - *sick = pag->pag_sick; - *checked = pag->pag_checked; - spin_unlock(&pag->pag_state_lock); + spin_lock(&xg->xg_state_lock); + *sick = xg->xg_sick; + *checked = xg->xg_checked; + spin_unlock(&xg->xg_state_lock); +} + +/* Mark unhealthy per-rtgroup metadata given a raw rt group number. */ +void +xfs_rgno_mark_sick( + struct xfs_mount *mp, + xfs_rgnumber_t rgno, + unsigned int mask) +{ + struct xfs_rtgroup *rtg = xfs_rtgroup_get(mp, rgno); + + /* per-rtgroup structure not set up yet? */ + if (!rtg) + return; + + xfs_group_mark_sick(rtg_group(rtg), mask); + xfs_rtgroup_put(rtg); } /* Mark the unhealthy parts of an inode. 
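The helpers above fold the old per-AG and per-rt health tracking into one xfs_group state machine: "sick" and "checked" are independent bit sets kept under a spinlock, and healing the last sick primary structure also clears the derived secondary state. A compact model of those transitions; the bit values are made up:

#include <stdio.h>

#define SICK_PRIMARY	0x0f
#define SICK_SECONDARY	0x30

struct group_health { unsigned sick, checked; };

static void mark_corrupt(struct group_health *g, unsigned mask)
{
	g->sick |= mask;
	g->checked |= mask;
}

static void mark_healthy(struct group_health *g, unsigned mask)
{
	g->sick &= ~mask;
	if (!(g->sick & SICK_PRIMARY))	/* nothing primary left sick */
		g->sick &= ~SICK_SECONDARY;
	g->checked |= mask;
}

int main(void)
{
	struct group_health g = { 0, 0 };

	mark_corrupt(&g, 0x01 | 0x10);
	mark_healthy(&g, 0x01);	/* secondary 0x10 is cleared too */
	printf("sick=%#x checked=%#x\n", g.sick, g.checked);
	return 0;
}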
*/ @@ -369,6 +350,9 @@ struct ioctl_sick_map { unsigned int ioctl_mask; }; +#define for_each_sick_map(map, m) \ + for ((m) = (map); (m) < (map) + ARRAY_SIZE(map); (m)++) + static const struct ioctl_sick_map fs_map[] = { { XFS_SICK_FS_COUNTERS, XFS_FSOP_GEOM_SICK_COUNTERS}, { XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA }, @@ -376,13 +360,13 @@ static const struct ioctl_sick_map fs_map[] = { { XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA }, { XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK }, { XFS_SICK_FS_NLINKS, XFS_FSOP_GEOM_SICK_NLINKS }, - { 0, 0 }, + { XFS_SICK_FS_METADIR, XFS_FSOP_GEOM_SICK_METADIR }, + { XFS_SICK_FS_METAPATH, XFS_FSOP_GEOM_SICK_METAPATH }, }; static const struct ioctl_sick_map rt_map[] = { - { XFS_SICK_RT_BITMAP, XFS_FSOP_GEOM_SICK_RT_BITMAP }, - { XFS_SICK_RT_SUMMARY, XFS_FSOP_GEOM_SICK_RT_SUMMARY }, - { 0, 0 }, + { XFS_SICK_RG_BITMAP, XFS_FSOP_GEOM_SICK_RT_BITMAP }, + { XFS_SICK_RG_SUMMARY, XFS_FSOP_GEOM_SICK_RT_SUMMARY }, }; static inline void @@ -404,6 +388,7 @@ xfs_fsop_geom_health( struct xfs_mount *mp, struct xfs_fsop_geom *geo) { + struct xfs_rtgroup *rtg = NULL; const struct ioctl_sick_map *m; unsigned int sick; unsigned int checked; @@ -412,12 +397,14 @@ xfs_fsop_geom_health( geo->checked = 0; xfs_fs_measure_sickness(mp, &sick, &checked); - for (m = fs_map; m->sick_mask; m++) + for_each_sick_map(fs_map, m) xfgeo_health_tick(geo, sick, checked, m); - xfs_rt_measure_sickness(mp, &sick, &checked); - for (m = rt_map; m->sick_mask; m++) - xfgeo_health_tick(geo, sick, checked, m); + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + xfs_group_measure_sickness(rtg_group(rtg), &sick, &checked); + for_each_sick_map(rt_map, m) + xfgeo_health_tick(geo, sick, checked, m); + } } static const struct ioctl_sick_map ag_map[] = { @@ -432,7 +419,6 @@ static const struct ioctl_sick_map ag_map[] = { { XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT }, { XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT }, { XFS_SICK_AG_INODES, XFS_AG_GEOM_SICK_INODES }, - { 0, 0 }, }; /* Fill out ag geometry health info. */ @@ -448,8 +434,8 @@ xfs_ag_geom_health( ageo->ag_sick = 0; ageo->ag_checked = 0; - xfs_ag_measure_sickness(pag, &sick, &checked); - for (m = ag_map; m->sick_mask; m++) { + xfs_group_measure_sickness(pag_group(pag), &sick, &checked); + for_each_sick_map(ag_map, m) { if (checked & m->sick_mask) ageo->ag_checked |= m->ioctl_mask; if (sick & m->sick_mask) @@ -457,6 +443,36 @@ xfs_ag_geom_health( } } +static const struct ioctl_sick_map rtgroup_map[] = { + { XFS_SICK_RG_SUPER, XFS_RTGROUP_GEOM_SICK_SUPER }, + { XFS_SICK_RG_BITMAP, XFS_RTGROUP_GEOM_SICK_BITMAP }, + { XFS_SICK_RG_SUMMARY, XFS_RTGROUP_GEOM_SICK_SUMMARY }, + { XFS_SICK_RG_RMAPBT, XFS_RTGROUP_GEOM_SICK_RMAPBT }, + { XFS_SICK_RG_REFCNTBT, XFS_RTGROUP_GEOM_SICK_REFCNTBT }, +}; + +/* Fill out rtgroup geometry health info. 
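The for_each_sick_map() macro above is why the { 0, 0 } sentinel rows vanish from fs_map, rt_map, and the other tables: iteration is bounded by the array length instead of a terminator entry. A user-space rendering of the same pattern:

#include <stdio.h>

#define ARRAY_SIZE(x)	(sizeof(x) / sizeof((x)[0]))
#define for_each_map(map, m) \
	for ((m) = (map); (m) < (map) + ARRAY_SIZE(map); (m)++)

struct sick_map { unsigned sick_mask, ioctl_mask; };

static const struct sick_map demo_map[] = {
	{ 0x1, 0x100 },
	{ 0x2, 0x200 },		/* no terminator entry needed */
};

int main(void)
{
	const struct sick_map *m;

	for_each_map(demo_map, m)
		printf("%#x -> %#x\n", m->sick_mask, m->ioctl_mask);
	return 0;
}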
*/ +void +xfs_rtgroup_geom_health( + struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo) +{ + const struct ioctl_sick_map *m; + unsigned int sick; + unsigned int checked; + + rgeo->rg_sick = 0; + rgeo->rg_checked = 0; + + xfs_group_measure_sickness(rtg_group(rtg), &sick, &checked); + for_each_sick_map(rtgroup_map, m) { + if (checked & m->sick_mask) + rgeo->rg_checked |= m->ioctl_mask; + if (sick & m->sick_mask) + rgeo->rg_sick |= m->ioctl_mask; + } +} + static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_CORE, XFS_BS_SICK_INODE }, { XFS_SICK_INO_BMBTD, XFS_BS_SICK_BMBTD }, @@ -471,7 +487,6 @@ static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR }, { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK }, { XFS_SICK_INO_DIRTREE, XFS_BS_SICK_DIRTREE }, - { 0, 0 }, }; /* Fill out bulkstat health info. */ @@ -488,7 +503,7 @@ xfs_bulkstat_health( bs->bs_checked = 0; xfs_inode_measure_sickness(ip, &sick, &checked); - for (m = ino_map; m->sick_mask; m++) { + for_each_sick_map(ino_map, m) { if (checked & m->sick_mask) bs->bs_checked |= m->ioctl_mask; if (sick & m->sick_mask) @@ -527,24 +542,13 @@ void xfs_btree_mark_sick( struct xfs_btree_cur *cur) { - switch (cur->bc_ops->type) { - case XFS_BTREE_TYPE_MEM: - /* no health state tracking for ephemeral btrees */ - return; - case XFS_BTREE_TYPE_AG: + if (xfs_btree_is_bmap(cur->bc_ops)) { + xfs_bmap_mark_sick(cur->bc_ino.ip, cur->bc_ino.whichfork); + /* no health state tracking for ephemeral btrees */ + } else if (cur->bc_ops->type != XFS_BTREE_TYPE_MEM) { + ASSERT(cur->bc_group); ASSERT(cur->bc_ops->sick_mask); - xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask); - return; - case XFS_BTREE_TYPE_INODE: - if (xfs_btree_is_bmap(cur->bc_ops)) { - xfs_bmap_mark_sick(cur->bc_ino.ip, - cur->bc_ino.whichfork); - return; - } - fallthrough; - default: - ASSERT(0); - return; + xfs_group_mark_sick(cur->bc_group, cur->bc_ops->sick_mask); } } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 6b119a7a324f..726e29b837e6 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -25,6 +25,9 @@ #include "xfs_ag.h" #include "xfs_log_priv.h" #include "xfs_health.h" +#include "xfs_da_format.h" +#include "xfs_dir2.h" +#include "xfs_metafile.h" #include <linux/iversion.h> @@ -204,7 +207,7 @@ xfs_reclaim_work_queue( { rcu_read_lock(); - if (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) { + if (xfs_group_marked(mp, XG_TYPE_AG, XFS_PERAG_RECLAIM_MARK)) { queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); } @@ -219,16 +222,15 @@ static inline void xfs_blockgc_queue( struct xfs_perag *pag) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); if (!xfs_is_blockgc_enabled(mp)) return; rcu_read_lock(); if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(pag->pag_mount->m_blockgc_wq, - &pag->pag_blockgc_work, - msecs_to_jiffies(xfs_blockgc_secs * 1000)); + queue_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work, + secs_to_jiffies(xfs_blockgc_secs)); rcu_read_unlock(); } @@ -239,7 +241,6 @@ xfs_perag_set_inode_tag( xfs_agino_t agino, unsigned int tag) { - struct xfs_mount *mp = pag->pag_mount; bool was_tagged; lockdep_assert_held(&pag->pag_ici_lock); @@ -253,13 +254,13 @@ xfs_perag_set_inode_tag( if (was_tagged) return; - /* propagate the tag up into the perag radix tree */ - xa_set_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag)); + /* propagate the tag up into the 
pag xarray tree */ + xfs_group_set_mark(pag_group(pag), ici_tag_to_mark(tag)); /* start background work */ switch (tag) { case XFS_ICI_RECLAIM_TAG: - xfs_reclaim_work_queue(mp); + xfs_reclaim_work_queue(pag_mount(pag)); break; case XFS_ICI_BLOCKGC_TAG: xfs_blockgc_queue(pag); @@ -276,8 +277,6 @@ xfs_perag_clear_inode_tag( xfs_agino_t agino, unsigned int tag) { - struct xfs_mount *mp = pag->pag_mount; - lockdep_assert_held(&pag->pag_ici_lock); /* @@ -295,9 +294,8 @@ xfs_perag_clear_inode_tag( if (radix_tree_tagged(&pag->pag_ici_root, tag)) return; - /* clear the tag from the perag radix tree */ - xa_clear_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag)); - + /* clear the tag from the pag xarray */ + xfs_group_clear_mark(pag_group(pag), ici_tag_to_mark(tag)); trace_xfs_perag_clear_inode_tag(pag, _RET_IP_); } @@ -310,22 +308,9 @@ xfs_perag_grab_next_tag( struct xfs_perag *pag, int tag) { - unsigned long index = 0; - - if (pag) { - index = pag->pag_agno + 1; - xfs_perag_rele(pag); - } - - rcu_read_lock(); - pag = xa_find(&mp->m_perags, &index, ULONG_MAX, ici_tag_to_mark(tag)); - if (pag) { - trace_xfs_perag_grab_next_tag(pag, _RET_IP_); - if (!atomic_inc_not_zero(&pag->pag_active_ref)) - pag = NULL; - } - rcu_read_unlock(); - return pag; + return to_perag(xfs_group_grab_next_mark(mp, + pag ? pag_group(pag) : NULL, + ici_tag_to_mark(tag), XG_TYPE_AG)); } /* @@ -847,6 +832,77 @@ out_error_or_again: } /* + * Get a metadata inode. + * + * The metafile type must match the file mode exactly, and for files in the + * metadata directory tree, it must match the inode's metatype exactly. + */ +int +xfs_trans_metafile_iget( + struct xfs_trans *tp, + xfs_ino_t ino, + enum xfs_metafile_type metafile_type, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip; + umode_t mode; + int error; + + error = xfs_iget(mp, tp, ino, 0, 0, &ip); + if (error == -EFSCORRUPTED || error == -EINVAL) + goto whine; + if (error) + return error; + + if (VFS_I(ip)->i_nlink == 0) + goto bad_rele; + + if (metafile_type == XFS_METAFILE_DIR) + mode = S_IFDIR; + else + mode = S_IFREG; + if (inode_wrong_type(VFS_I(ip), mode)) + goto bad_rele; + if (xfs_has_metadir(mp)) { + if (!xfs_is_metadir_inode(ip)) + goto bad_rele; + if (metafile_type != ip->i_metatype) + goto bad_rele; + } + + *ipp = ip; + return 0; +bad_rele: + xfs_irele(ip); +whine: + xfs_err(mp, "metadata inode 0x%llx type %u is corrupt", ino, + metafile_type); + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; +} + +/* Grab a metadata file if the caller doesn't already have a transaction. */ +int +xfs_metafile_iget( + struct xfs_mount *mp, + xfs_ino_t ino, + enum xfs_metafile_type metafile_type, + struct xfs_inode **ipp) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + error = xfs_trans_metafile_iget(tp, ino, metafile_type, ipp); + xfs_trans_cancel(tp); + return error; +} + +/* * Grab the inode for reclaim exclusively. 
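xfs_trans_metafile_iget() above is a validation ladder: any mismatch (unlinked inode, wrong file mode, wrong metadir state or metatype) releases the inode and reports the metadata directory as sick. The shape of those checks, reduced to stand-in types:

#include <stdbool.h>

enum metafile_type { METAFILE_DIR, METAFILE_RTRMAP };

struct inode_stub {
	unsigned nlink;
	bool is_dir;
	bool in_metadir;
	enum metafile_type metatype;
};

static int check_metafile(const struct inode_stub *ip,
			  enum metafile_type want, bool has_metadir)
{
	if (ip->nlink == 0)
		return -1;		/* unlinked: corrupt */
	if (ip->is_dir != (want == METAFILE_DIR))
		return -1;		/* wrong file mode */
	if (has_metadir &&
	    (!ip->in_metadir || ip->metatype != want))
		return -1;		/* wrong metadata type */
	return 0;
}

int main(void)
{
	struct inode_stub ip = { 1, false, true, METAFILE_RTRMAP };

	return check_metafile(&ip, METAFILE_RTRMAP, true);
}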
* * We have found this inode via a lookup under RCU, so the inode may have @@ -1014,7 +1070,7 @@ xfs_reclaim_inodes( if (xfs_want_reclaim_sick(mp)) icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; - while (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) { + while (xfs_group_marked(mp, XG_TYPE_AG, XFS_PERAG_RECLAIM_MARK)) { xfs_ail_push_all_sync(mp->m_ail); xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); } @@ -1056,7 +1112,7 @@ long xfs_reclaim_inodes_count( struct xfs_mount *mp) { - XA_STATE (xas, &mp->m_perags, 0); + XA_STATE (xas, &mp->m_groups[XG_TYPE_AG].xa, 0); long reclaimable = 0; struct xfs_perag *pag; @@ -1401,13 +1457,12 @@ void xfs_blockgc_stop( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; if (!xfs_clear_blockgc_enabled(mp)) return; - for_each_perag(mp, agno, pag) + while ((pag = xfs_perag_next(mp, pag))) cancel_delayed_work_sync(&pag->pag_blockgc_work); trace_xfs_blockgc_stop(mp, __return_address); } @@ -1499,7 +1554,7 @@ xfs_blockgc_worker( { struct xfs_perag *pag = container_of(to_delayed_work(work), struct xfs_perag, pag_blockgc_work); - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); int error; trace_xfs_blockgc_worker(mp, __return_address); @@ -1507,7 +1562,7 @@ xfs_blockgc_worker( error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", - pag->pag_agno, error); + pag_agno(pag), error); xfs_blockgc_queue(pag); } @@ -1548,8 +1603,7 @@ xfs_blockgc_flush_all( * queued, it will not be requeued. Then flush whatever is left. */ while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG))) - mod_delayed_work(pag->pag_mount->m_blockgc_wq, - &pag->pag_blockgc_work, 0); + mod_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work, 0); while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG))) flush_delayed_work(&pag->pag_blockgc_work); @@ -1688,7 +1742,7 @@ xfs_icwalk_ag( enum xfs_icwalk_goal goal, struct xfs_icwalk *icw) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); uint32_t first_index; int last_error = 0; int skipped; @@ -1741,7 +1795,7 @@ restart: * us to see this inode, so another lookup from the * same index will not find it again. 
*/ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag)) continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) @@ -2019,10 +2073,10 @@ xfs_inodegc_want_queue_rt_file( { struct xfs_mount *mp = ip->i_mount; - if (!XFS_IS_REALTIME_INODE(ip)) + if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp)) return false; - if (__percpu_counter_compare(&mp->m_frextents, + if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_low_rtexts[XFS_LOWSP_5_PCNT], XFS_FDBLOCKS_BATCH) < 0) return true; @@ -2050,7 +2104,7 @@ xfs_inodegc_want_queue_work( if (items > mp->m_ino_geo.inodes_per_cluster) return true; - if (__percpu_counter_compare(&mp->m_fdblocks, + if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS, mp->m_low_space[XFS_LOWSP_5_PCNT], XFS_FDBLOCKS_BATCH) < 0) return true; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bcc277fc0a83..ee3e0f284287 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -43,6 +43,7 @@ #include "xfs_parent.h" #include "xfs_xattr.h" #include "xfs_inode_util.h" +#include "xfs_metafile.h" struct kmem_cache *xfs_inode_cache; @@ -341,8 +342,7 @@ xfs_lock_inumorder( { uint class = 0; - ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | - XFS_ILOCK_RTSUM))); + ASSERT(!(lock_mode & XFS_ILOCK_PARENT)); ASSERT(xfs_lockdep_subclass_ok(subclass)); if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { @@ -554,8 +554,20 @@ xfs_lookup( if (error) goto out_free_name; + /* + * Fail if a directory entry in the regular directory tree points to + * a metadata file. + */ + if (XFS_IS_CORRUPT(dp->i_mount, xfs_is_metadir_inode(*ipp))) { + xfs_fs_mark_sick(dp->i_mount, XFS_SICK_FS_METADIR); + error = -EFSCORRUPTED; + goto out_irele; + } + return 0; +out_irele: + xfs_irele(*ipp); out_free_name: if (ci_name) kfree(ci_name->name); @@ -1295,7 +1307,7 @@ xfs_inode_needs_inactive( return false; /* Metadata inodes require explicit resource cleanup. */ - if (xfs_is_metadata_inode(ip)) + if (xfs_is_internal_inode(ip)) return false; /* Want to clean out the cow blocks if there are any. */ @@ -1388,12 +1400,15 @@ xfs_inactive( goto out; /* Metadata inodes require explicit resource cleanup. */ - if (xfs_is_metadata_inode(ip)) + if (xfs_is_internal_inode(ip)) goto out; /* Try to clean out the cow blocks if there are any. */ - if (xfs_inode_has_cow_data(ip)) - xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (xfs_inode_has_cow_data(ip)) { + error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (error) + goto out; + } if (VFS_I(ip)->i_nlink != 0) { /* @@ -1409,7 +1424,7 @@ xfs_inactive( if (S_ISREG(VFS_I(ip)->i_mode) && (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 || - ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) + xfs_inode_has_filedata(ip))) truncate = 1; if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { @@ -1514,9 +1529,8 @@ xfs_iunlink_reload_next( xfs_agino_t next_agino) { struct xfs_perag *pag = agibp->b_pag; - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_inode *next_ip = NULL; - xfs_ino_t ino; int error; ASSERT(next_agino != NULLAGINO); @@ -1530,7 +1544,7 @@ xfs_iunlink_reload_next( xfs_info_ratelimited(mp, "Found unrecovered unlinked inode 0x%x in AG 0x%x. 
Initiating recovery.", - next_agino, pag->pag_agno); + next_agino, pag_agno(pag)); /* * Use an untrusted lookup just to be cautious in case the AGI has been @@ -1538,8 +1552,8 @@ xfs_iunlink_reload_next( * but we'd rather shut down now since we're already running in a weird * situation. */ - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); - error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip); + error = xfs_iget(mp, tp, xfs_agino_to_ino(pag, next_agino), + XFS_IGET_UNTRUSTED, 0, &next_ip); if (error) { xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); return error; @@ -1573,7 +1587,7 @@ xfs_ifree_mark_inode_stale( struct xfs_inode *free_ip, xfs_ino_t inum) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_inode_log_item *iip; struct xfs_inode *ip; @@ -1707,8 +1721,7 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * igeo->blocks_per_cluster, - XBF_UNMAPPED, &bp); + mp->m_bsize * igeo->blocks_per_cluster, 0, &bp); if (error) return error; @@ -2371,7 +2384,16 @@ xfs_iflush( __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto flush_out; } - if (S_ISREG(VFS_I(ip)->i_mode)) { + if (ip->i_df.if_format == XFS_DINODE_FMT_META_BTREE) { + if (!S_ISREG(VFS_I(ip)->i_mode) || + !(ip->i_diflags2 & XFS_DIFLAG2_METADATA)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad %s meta btree inode %Lu, ptr "PTR_FMT, + __func__, xfs_metafile_type_str(ip->i_metatype), + ip->i_ino, ip); + goto flush_out; + } + } else if (S_ISREG(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && ip->i_df.if_format != XFS_DINODE_FMT_BTREE, @@ -2411,6 +2433,14 @@ xfs_iflush( goto flush_out; } + if (xfs_inode_has_attr_fork(ip) && + ip->i_af.if_format == XFS_DINODE_FMT_META_BTREE) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: meta btree in inode %Lu attr fork, ptr "PTR_FMT, + __func__, ip->i_ino, ip); + goto flush_out; + } + /* * Inode item log recovery for v2 inodes are dependent on the flushiter * count for correct sequencing. We bump the flush iteration count so @@ -2704,21 +2734,16 @@ xfs_mmaplock_two_inodes_and_break_dax_layout( struct xfs_inode *ip2) { int error; - bool retry; - struct page *page; if (ip1->i_ino > ip2->i_ino) swap(ip1, ip2); again: - retry = false; /* Lock the first inode */ xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); - error = xfs_break_dax_layouts(VFS_I(ip1), &retry); - if (error || retry) { + error = xfs_break_dax_layouts(VFS_I(ip1)); + if (error) { xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); - if (error == 0 && retry) - goto again; return error; } @@ -2732,8 +2757,8 @@ again: * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable * for this nested lock case. 
*/ - page = dax_layout_busy_page(VFS_I(ip2)->i_mapping); - if (page && page_ref_count(page) != 1) { + error = dax_break_layout(VFS_I(ip2), 0, -1, NULL); + if (error) { xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); goto again; @@ -2977,21 +3002,11 @@ xfs_wait_dax_page( int xfs_break_dax_layouts( - struct inode *inode, - bool *retry) + struct inode *inode) { - struct page *page; - xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL); - page = dax_layout_busy_page(inode->i_mapping); - if (!page) - return 0; - - *retry = true; - return ___wait_var_event(&page->_refcount, - atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, - 0, 0, xfs_wait_dax_page(inode)); + return dax_break_layout_inode(inode, xfs_wait_dax_page); } int @@ -3009,8 +3024,8 @@ xfs_break_layouts( retry = false; switch (reason) { case BREAK_UNMAP: - error = xfs_break_dax_layouts(inode, &retry); - if (error || retry) + error = xfs_break_dax_layouts(inode); + if (error) break; fallthrough; case BREAK_WRITE: @@ -3041,7 +3056,8 @@ xfs_inode_alloc_unitsize( /* Should we always be using copy on write for file writes? */ bool xfs_is_always_cow_inode( - struct xfs_inode *ip) + const struct xfs_inode *ip) { - return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount); + return xfs_is_zoned_inode(ip) || + (ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount)); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 97ed912306fd..d7e2b902ef5c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -59,12 +59,18 @@ typedef struct xfs_inode { xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */ prid_t i_projid; /* owner's project id */ xfs_extlen_t i_extsize; /* basic/minimum extent size */ - /* cowextsize is only used for v3 inodes, flushiter for v1/2 */ + /* + * i_used_blocks is used for zoned rtrmap inodes, + * i_cowextsize is used for other v3 inodes, + * i_flushiter for v1/2 inodes + */ union { + uint32_t i_used_blocks; /* used blocks in RTG */ xfs_extlen_t i_cowextsize; /* basic cow extent size */ uint16_t i_flushiter; /* incremented on flush */ }; uint8_t i_forkoff; /* attr fork offset >> 3 */ + enum xfs_metafile_type i_metatype; /* XFS_METAFILE_* */ uint16_t i_diflags; /* XFS_DIFLAG_... */ uint64_t i_diflags2; /* XFS_DIFLAG2_... */ struct timespec64 i_crtime; /* time created */ @@ -100,7 +106,7 @@ static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip) return ip->i_prev_unlinked != 0; } -static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip) +static inline bool xfs_inode_has_attr_fork(const struct xfs_inode *ip) { return ip->i_forkoff > 0; } @@ -230,7 +236,7 @@ xfs_iflags_clear(xfs_inode_t *ip, unsigned long flags) } static inline int -__xfs_iflags_test(xfs_inode_t *ip, unsigned long flags) +__xfs_iflags_test(const struct xfs_inode *ip, unsigned long flags) { return (ip->i_flags & flags); } @@ -271,42 +277,65 @@ xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned long flags) return ret; } -static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) +static inline bool xfs_is_reflink_inode(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_REFLINK; } -static inline bool xfs_is_metadata_inode(const struct xfs_inode *ip) +static inline bool xfs_is_metadir_inode(const struct xfs_inode *ip) +{ + return ip->i_diflags2 & XFS_DIFLAG2_METADATA; +} + +static inline bool xfs_is_internal_inode(const struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; + /* Any file in the metadata directory tree is a metadata inode. 
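The restructuring above moves the waiting into the helper: breaking the first inode's DAX layout can simply block, while the second break happens while both locks are held and therefore forces a full unlock-and-retry when it cannot complete. A toy model of that ordering discipline, with stubs standing in for the DAX calls:

#include <stdio.h>

struct ino { unsigned long num; int busy; };

/* first inode: the helper waits internally until the layout is gone */
static void break_layout_waiting(struct ino *i)
{
	i->busy = 0;
}

/* second inode: we already hold the first lock and must not wait,
 * so report busy and let the caller drop both locks and retry */
static int try_break_layout(struct ino *i)
{
	if (i->busy) {
		i->busy = 0;	/* pretend the reference went away */
		return -1;
	}
	return 0;
}

static void lock_two(struct ino *a, struct ino *b)
{
	if (a->num > b->num) {	/* always lock in ascending ino order */
		struct ino *t = a; a = b; b = t;
	}
again:
	break_layout_waiting(a);	/* lock a, wait out its users */
	if (try_break_layout(b))	/* lock b, no waiting allowed */
		goto again;		/* dropped both locks: restart */
	printf("locked %lu then %lu\n", a->num, b->num);
}

int main(void)
{
	struct ino x = { 42, 1 }, y = { 7, 0 };

	lock_two(&x, &y);
	return 0;
}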
*/ + if (xfs_has_metadir(mp)) + return xfs_is_metadir_inode(ip); + + /* + * Before metadata directories, the only metadata inodes were the + * three quota files, the realtime bitmap, and the realtime summary. + */ return ip->i_ino == mp->m_sb.sb_rbmino || ip->i_ino == mp->m_sb.sb_rsumino || xfs_is_quota_inode(&mp->m_sb, ip->i_ino); } -bool xfs_is_always_cow_inode(struct xfs_inode *ip); +static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip) +{ + return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip); +} + +bool xfs_is_always_cow_inode(const struct xfs_inode *ip); -static inline bool xfs_is_cow_inode(struct xfs_inode *ip) +static inline bool xfs_is_cow_inode(const struct xfs_inode *ip) { return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); } +static inline bool xfs_inode_has_filedata(const struct xfs_inode *ip) +{ + return ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0; +} + /* * Check if an inode has any data in the COW fork. This might be often false * even for inodes with the reflink flag when there is no pending COW operation. */ -static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) +static inline bool xfs_inode_has_cow_data(const struct xfs_inode *ip) { return ip->i_cowfp && ip->i_cowfp->if_bytes; } -static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) +static inline bool xfs_inode_has_bigtime(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME; } -static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) +static inline bool xfs_inode_has_large_extent_counts(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; } @@ -315,7 +344,7 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) * Decide if this file is a realtime file whose data allocation unit is larger * than a single filesystem block. */ -static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) +static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip) { return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1; } @@ -327,6 +356,11 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) (XFS_IS_REALTIME_INODE(ip) ? \ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) +static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip) +{ + return xfs_inode_buftarg(ip)->bt_bdev_awu_max > 0; +} + /* * In-core inode flags. */ @@ -429,9 +463,8 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) * However, MAX_LOCKDEP_SUBCLASSES == 8, which means we are greatly * limited to the subclasses we can represent via nesting. We need at least * 5 inodes nest depth for the ILOCK through rename, and we also have to support - * XFS_ILOCK_PARENT, which gives 6 subclasses. Then we have XFS_ILOCK_RTBITMAP - * and XFS_ILOCK_RTSUM, which are another 2 unique subclasses, so that's all - * 8 subclasses supported by lockdep. + * XFS_ILOCK_PARENT, which gives 6 subclasses. That's 6 of the 8 subclasses + * supported by lockdep. 
* * This also means we have to number the sub-classes in the lowest bits of * the mask we keep, and we have to ensure we never exceed 3 bits of lockdep @@ -457,8 +490,8 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) * ILOCK values * 0-4 subclass values * 5 PARENT subclass (not nestable) - * 6 RTBITMAP subclass (not nestable) - * 7 RTSUM subclass (not nestable) + * 6 unused + * 7 unused * */ #define XFS_IOLOCK_SHIFT 16 @@ -473,12 +506,8 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) #define XFS_ILOCK_SHIFT 24 #define XFS_ILOCK_PARENT_VAL 5u #define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1) -#define XFS_ILOCK_RTBITMAP_VAL 6u -#define XFS_ILOCK_RTSUM_VAL 7u #define XFS_ILOCK_DEP_MASK 0xff000000u #define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT) -#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT) -#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT) #define XFS_LOCK_SUBCLASS_MASK (XFS_IOLOCK_DEP_MASK | \ XFS_MMAPLOCK_DEP_MASK | \ @@ -564,7 +593,7 @@ xfs_itruncate_extents( return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0); } -int xfs_break_dax_layouts(struct inode *inode, bool *retry); +int xfs_break_dax_layouts(struct inode *inode); int xfs_break_layouts(struct inode *inode, uint *iolock, enum layout_break_reason reason); @@ -620,9 +649,9 @@ void xfs_sort_inodes(struct xfs_inode **i_tab, unsigned int num_inodes); static inline bool xfs_inode_unlinked_incomplete( - struct xfs_inode *ip) + const struct xfs_inode *ip) { - return VFS_I(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip); + return VFS_IC(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip); } int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); int xfs_inode_reload_unlinked(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b509cbd191f4..c6cb0b6b9e46 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -157,6 +157,20 @@ xfs_inode_item_precommit( if (flags & XFS_ILOG_IVERSION) flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + /* + * Inode verifiers do not check that the CoW extent size hint is an + * integer multiple of the rt extent size on a directory with both + * rtinherit and cowextsize flags set. If we're logging a directory + * that is misconfigured in this way, clear the hint. 
+ */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + flags |= XFS_ILOG_CORE; + } + if (!iip->ili_item.li_buf) { struct xfs_buf *bp; int error; @@ -185,7 +199,7 @@ xfs_inode_item_precommit( xfs_buf_hold(bp); spin_lock(&iip->ili_lock); iip->ili_item.li_buf = bp; - bp->b_flags |= _XBF_INODES; + bp->b_iodone = xfs_buf_inode_iodone; list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); xfs_trans_brelse(tp, bp); } @@ -242,6 +256,7 @@ xfs_inode_item_data_fork_size( } break; case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_META_BTREE: if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { *nbytes += ip->i_df.if_broot_bytes; @@ -362,6 +377,7 @@ xfs_inode_item_format_data_fork( } break; case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_META_BTREE: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV); @@ -556,7 +572,6 @@ xfs_inode_to_log_dinode( to->di_projid_lo = ip->i_projid & 0xffff; to->di_projid_hi = ip->i_projid >> 16; - memset(to->di_pad3, 0, sizeof(to->di_pad3)); to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode)); to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode)); to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode)); @@ -581,6 +596,7 @@ xfs_inode_to_log_dinode( to->di_changecount = inode_peek_iversion(inode); to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime); to->di_flags2 = ip->i_diflags2; + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = ip->i_cowextsize; to->di_ino = ip->i_ino; to->di_lsn = lsn; @@ -590,10 +606,16 @@ xfs_inode_to_log_dinode( /* dummy value for initialisation */ to->di_crc = 0; + + if (xfs_is_metadir_inode(ip)) + to->di_metatype = ip->i_metatype; + else + to->di_metatype = 0; } else { to->di_version = 2; to->di_flushiter = ip->i_flushiter; memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); + to->di_metatype = 0; } xfs_inode_to_log_dinode_iext_counters(ip, to); @@ -1018,18 +1040,6 @@ xfs_buf_inode_iodone( list_splice_tail(&flushed_inodes, &bp->b_li_list); } -void -xfs_buf_inode_io_fail( - struct xfs_buf *bp) -{ - struct xfs_log_item *lip; - - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - set_bit(XFS_LI_FAILED, &lip->li_flags); - clear_bit(XFS_LI_FLUSHING, &lip->li_flags); - } -} - /* * Clear the inode logging fields so no more flushes are attempted. If we are * on a buffer list, it is now safe to remove it because the buffer is @@ -1079,13 +1089,7 @@ xfs_iflush_abort( * state. Whilst the inode is in the AIL, it should have a valid buffer * pointer for push operations to access - it is only safe to remove the * inode from the buffer once it has been removed from the AIL. - * - * We also clear the failed bit before removing the item from the AIL - * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer - * references the inode item owns and needs to hold until we've fully - * aborted the inode log item and detached it from the buffer. 
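The precommit fixup at the top of the hunk above quietly repairs a misconfigured directory: with both rtinherit and cowextsize set, a CoW hint that is not a whole number of rt extents is dropped before the inode core is logged. The check in isolation, with illustrative flag values:

#include <stdio.h>

#define DIFLAG_RTINHERIT	0x1
#define DIFLAG2_COWEXTSIZE	0x2
#define REXTSIZE		16	/* fs blocks per rt extent */

struct di { unsigned flags, flags2, cowextsize; };

static int fixup_cowextsize(struct di *d)
{
	if ((d->flags & DIFLAG_RTINHERIT) &&
	    (d->flags2 & DIFLAG2_COWEXTSIZE) &&
	    d->cowextsize % REXTSIZE != 0) {
		d->flags2 &= ~DIFLAG2_COWEXTSIZE;
		d->cowextsize = 0;
		return 1;		/* core needs relogging */
	}
	return 0;
}

int main(void)
{
	struct di d = { DIFLAG_RTINHERIT, DIFLAG2_COWEXTSIZE, 20 };

	printf("fixed=%d hint=%u\n", fixup_cowextsize(&d), d.cowextsize);
	return 0;
}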
*/ - clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); xfs_trans_ail_delete(&iip->ili_item, 0); /* diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index dbdab4ce7c44..7205fd14f6b3 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -22,6 +22,8 @@ #include "xfs_log_recover.h" #include "xfs_icache.h" #include "xfs_bmap_btree.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" STATIC void xlog_recover_inode_ra_pass2( @@ -175,7 +177,7 @@ xfs_log_dinode_to_disk( to->di_mode = cpu_to_be16(from->di_mode); to->di_version = from->di_version; to->di_format = from->di_format; - to->di_onlink = 0; + to->di_metatype = cpu_to_be16(from->di_metatype); to->di_uid = cpu_to_be32(from->di_uid); to->di_gid = cpu_to_be32(from->di_gid); to->di_nlink = cpu_to_be32(from->di_nlink); @@ -201,6 +203,7 @@ xfs_log_dinode_to_disk( to->di_crtime = xfs_log_dinode_to_disk_ts(from, from->di_crtime); to->di_flags2 = cpu_to_be64(from->di_flags2); + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(lsn); @@ -266,6 +269,41 @@ xlog_dinode_verify_extent_counts( return 0; } +static inline int +xlog_recover_inode_dbroot( + struct xfs_mount *mp, + void *src, + unsigned int len, + struct xfs_dinode *dip) +{ + void *dfork = XFS_DFORK_DPTR(dip); + unsigned int dsize = XFS_DFORK_DSIZE(dip, mp); + + switch (dip->di_format) { + case XFS_DINODE_FMT_BTREE: + xfs_bmbt_to_bmdr(mp, src, len, dfork, dsize); + break; + case XFS_DINODE_FMT_META_BTREE: + switch (be16_to_cpu(dip->di_metatype)) { + case XFS_METAFILE_RTRMAP: + xfs_rtrmapbt_to_disk(mp, src, len, dfork, dsize); + return 0; + case XFS_METAFILE_RTREFCOUNT: + xfs_rtrefcountbt_to_disk(mp, src, len, dfork, dsize); + return 0; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + + return 0; +} + STATIC int xlog_recover_inode_commit_pass2( struct xlog *log, @@ -393,8 +431,9 @@ xlog_recover_inode_commit_pass2( if (unlikely(S_ISREG(ldip->di_mode))) { - if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && - (ldip->di_format != XFS_DINODE_FMT_BTREE)) { + if (ldip->di_format != XFS_DINODE_FMT_EXTENTS && + ldip->di_format != XFS_DINODE_FMT_BTREE && + ldip->di_format != XFS_DINODE_FMT_META_BTREE) { XFS_CORRUPTION_ERROR( "Bad log dinode data fork format for regular file", XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); @@ -475,9 +514,9 @@ xlog_recover_inode_commit_pass2( break; case XFS_ILOG_DBROOT: - xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, - (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip), - XFS_DFORK_DSIZE(dip, mp)); + error = xlog_recover_inode_dbroot(mp, src, len, dip); + if (error) + goto out_release; break; default: diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a20d426ef021..d250f7f74e3b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -40,6 +40,7 @@ #include "xfs_file.h" #include "xfs_exchrange.h" #include "xfs_handle.h" +#include "xfs_rtgroup.h" #include <linux/mount.h> #include <linux/fileattr.h> @@ -233,6 +234,10 @@ xfs_bulk_ireq_setup( if (hdr->flags & XFS_BULK_IREQ_NREXT64) breq->flags |= XFS_IBULK_NREXT64; + /* Caller wants to see metadata directories in bulkstat output. 
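xlog_recover_inode_dbroot() above adds a second dispatch level when recovering a logged data-fork root: plain bmbt roots decode as before, META_BTREE roots are decoded by the inode's metatype, and anything else is corruption. The dispatch shape as a sketch:

#include <stdio.h>

enum fmt { FMT_BTREE, FMT_META_BTREE, FMT_OTHER };
enum metatype { META_RTRMAP, META_RTREFCOUNT, META_NONE };

static int recover_dbroot(enum fmt f, enum metatype t)
{
	switch (f) {
	case FMT_BTREE:
		printf("bmbt root\n");
		return 0;
	case FMT_META_BTREE:
		switch (t) {		/* second-level dispatch */
		case META_RTRMAP:
			printf("rtrmapbt root\n");
			return 0;
		case META_RTREFCOUNT:
			printf("rtrefcountbt root\n");
			return 0;
		default:
			return -1;	/* corrupt metatype */
		}
	default:
		return -1;		/* corrupt format */
	}
}

int main(void)
{
	return recover_dbroot(FMT_META_BTREE, META_RTRMAP);
}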
*/ + if (hdr->flags & XFS_BULK_IREQ_METADIR) + breq->flags |= XFS_IBULK_METADIR; + return 0; } @@ -323,6 +328,9 @@ xfs_ioc_inumbers( if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr))) return -EFAULT; + if (hdr.flags & XFS_BULK_IREQ_METADIR) + return -EINVAL; + error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers); if (error == -ECANCELED) goto out_teardown; @@ -396,6 +404,38 @@ xfs_ioc_ag_geometry( return 0; } +STATIC int +xfs_ioc_rtgroup_geometry( + struct xfs_mount *mp, + void __user *arg) +{ + struct xfs_rtgroup *rtg; + struct xfs_rtgroup_geometry rgeo; + int error; + + if (copy_from_user(&rgeo, arg, sizeof(rgeo))) + return -EFAULT; + if (rgeo.rg_flags) + return -EINVAL; + if (memchr_inv(&rgeo.rg_reserved, 0, sizeof(rgeo.rg_reserved))) + return -EINVAL; + if (!xfs_has_rtgroups(mp)) + return -EINVAL; + + rtg = xfs_rtgroup_get(mp, rgeo.rg_number); + if (!rtg) + return -EINVAL; + + error = xfs_rtgroup_get_geometry(rtg, &rgeo); + xfs_rtgroup_put(rtg); + if (error) + return error; + + if (copy_to_user(arg, &rgeo, sizeof(rgeo))) + return -EFAULT; + return 0; +} + /* * Linux extended inode flags interface. */ @@ -429,8 +469,21 @@ xfs_fill_fsxattr( } } - if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) - fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize); + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { + /* + * Don't let a misaligned CoW extent size hint on a directory + * escape to userspace if it won't pass the setattr checks + * later. + */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + ip->i_cowextsize % mp->m_sb.sb_rextsize > 0) { + fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; + fa->fsx_cowextsize = 0; + } else { + fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize); + } + } + fa->fsx_projid = ip->i_projid; if (ifp && !xfs_need_iread_extents(ifp)) fa->fsx_nextents = xfs_iext_count(ifp); @@ -481,7 +534,7 @@ xfs_ioctl_setattr_xflags( if (rtflag != XFS_IS_REALTIME_INODE(ip)) { /* Can't change realtime flag if any extents are allocated. */ - if (ip->i_df.if_nextents || ip->i_delayed_blks) + if (xfs_inode_has_filedata(ip)) return -EINVAL; /* @@ -501,10 +554,6 @@ xfs_ioctl_setattr_xflags( if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || xfs_extlen_to_rtxmod(mp, ip->i_extsize)) return -EINVAL; - - /* Clear reflink if we are actually able to set the rt flag. */ - if (xfs_is_reflink_inode(ip)) - ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; } /* diflags2 only valid for v3 inodes. 
*/ @@ -602,7 +651,7 @@ xfs_ioctl_setattr_check_extsize( if (!fa->fsx_valid) return 0; - if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && + if (S_ISREG(VFS_I(ip)->i_mode) && xfs_inode_has_filedata(ip) && XFS_FSB_TO_B(mp, ip->i_extsize) != fa->fsx_extsize) return -EINVAL; @@ -881,41 +930,29 @@ xfs_ioc_swapext( xfs_swapext_t *sxp) { xfs_inode_t *ip, *tip; - struct fd f, tmp; - int error = 0; /* Pull information for the target fd */ - f = fdget((int)sxp->sx_fdtarget); - if (!fd_file(f)) { - error = -EINVAL; - goto out; - } + CLASS(fd, f)((int)sxp->sx_fdtarget); + if (fd_empty(f)) + return -EINVAL; if (!(fd_file(f)->f_mode & FMODE_WRITE) || !(fd_file(f)->f_mode & FMODE_READ) || - (fd_file(f)->f_flags & O_APPEND)) { - error = -EBADF; - goto out_put_file; - } + (fd_file(f)->f_flags & O_APPEND)) + return -EBADF; - tmp = fdget((int)sxp->sx_fdtmp); - if (!fd_file(tmp)) { - error = -EINVAL; - goto out_put_file; - } + CLASS(fd, tmp)((int)sxp->sx_fdtmp); + if (fd_empty(tmp)) + return -EINVAL; if (!(fd_file(tmp)->f_mode & FMODE_WRITE) || !(fd_file(tmp)->f_mode & FMODE_READ) || - (fd_file(tmp)->f_flags & O_APPEND)) { - error = -EBADF; - goto out_put_tmp_file; - } + (fd_file(tmp)->f_flags & O_APPEND)) + return -EBADF; if (IS_SWAPFILE(file_inode(fd_file(f))) || - IS_SWAPFILE(file_inode(fd_file(tmp)))) { - error = -EINVAL; - goto out_put_tmp_file; - } + IS_SWAPFILE(file_inode(fd_file(tmp)))) + return -EINVAL; /* * We need to ensure that the fds passed in point to XFS inodes @@ -923,37 +960,22 @@ xfs_ioc_swapext( * control over what the user passes us here. */ if (fd_file(f)->f_op != &xfs_file_operations || - fd_file(tmp)->f_op != &xfs_file_operations) { - error = -EINVAL; - goto out_put_tmp_file; - } + fd_file(tmp)->f_op != &xfs_file_operations) + return -EINVAL; ip = XFS_I(file_inode(fd_file(f))); tip = XFS_I(file_inode(fd_file(tmp))); - if (ip->i_mount != tip->i_mount) { - error = -EINVAL; - goto out_put_tmp_file; - } - - if (ip->i_ino == tip->i_ino) { - error = -EINVAL; - goto out_put_tmp_file; - } + if (ip->i_mount != tip->i_mount) + return -EINVAL; - if (xfs_is_shutdown(ip->i_mount)) { - error = -EIO; - goto out_put_tmp_file; - } + if (ip->i_ino == tip->i_ino) + return -EINVAL; - error = xfs_swap_extents(ip, tip, sxp); + if (xfs_is_shutdown(ip->i_mount)) + return -EIO; - out_put_tmp_file: - fdput(tmp); - out_put_file: - fdput(f); - out: - return error; + return xfs_swap_extents(ip, tip, sxp); } static int @@ -1021,7 +1043,7 @@ xfs_ioc_setlabel( * buffered reads from userspace (i.e. from blkid) are invalidated, * and userspace will see the newly-written label. 
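The swapext rewrite above is a mechanical conversion to the kernel's scope-based cleanup classes: CLASS(fd, f)(...) ties fdput() to the variable's scope, so every early return drops the file reference and all of the out_put_* labels can go. The underlying mechanism is the compiler cleanup attribute; a userspace toy showing the same shape:

#include <fcntl.h>
#include <unistd.h>

static void put_fd(int *fd)
{
	if (*fd >= 0)
		close(*fd);
}

/* toy analogue of the kernel's CLASS(fd, name)(...) declaration */
#define SCOPED_FD(name, path, flags) \
	int name __attribute__((cleanup(put_fd))) = open((path), (flags))

static int first_byte(const char *path)
{
	SCOPED_FD(fd, path, O_RDONLY);
	char c;

	if (fd < 0)
		return -1;	/* nothing opened, nothing to clean up */
	if (read(fd, &c, 1) != 1)
		return -1;	/* fd closed automatically, no label */
	return c;		/* ditto on the success path */
}

int main(void)
{
	return first_byte("/etc/hostname") < 0;
}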
*/ - error = xfs_sync_sb_buf(mp); + error = xfs_sync_sb_buf(mp, true); if (error) goto out; /* @@ -1032,6 +1054,8 @@ xfs_ioc_setlabel( mutex_unlock(&mp->m_growlock); invalidate_bdev(mp->m_ddev_targp->bt_bdev); + if (xfs_has_rtsb(mp) && mp->m_rtdev_targp) + invalidate_bdev(mp->m_rtdev_targp->bt_bdev); out: mnt_drop_write_file(filp); @@ -1107,15 +1131,15 @@ xfs_ioctl_getset_resblocks( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_reserve_blocks(mp, fsop.resblks); + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks); mnt_drop_write_file(filp); if (error) return error; } spin_lock(&mp->m_sb_lock); - fsop.resblks = mp->m_resblks; - fsop.resblks_avail = mp->m_resblks_avail; + fsop.resblks = mp->m_free[XC_FREE_BLOCKS].res_total; + fsop.resblks_avail = mp->m_free[XC_FREE_BLOCKS].res_avail; spin_unlock(&mp->m_sb_lock); if (copy_to_user(arg, &fsop, sizeof(fsop))) @@ -1131,9 +1155,9 @@ xfs_ioctl_fs_counts( struct xfs_fsop_counts out = { .allocino = percpu_counter_read_positive(&mp->m_icount), .freeino = percpu_counter_read_positive(&mp->m_ifree), - .freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp), - .freertx = percpu_counter_read_positive(&mp->m_frextents), + .freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) - + xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS), + .freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS), }; if (copy_to_user(uarg, &out, sizeof(out))) @@ -1189,7 +1213,16 @@ xfs_file_ioctl( struct xfs_buftarg *target = xfs_inode_buftarg(ip); struct dioattr da; - da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; + da.d_mem = target->bt_logical_sectorsize; + + /* + * See xfs_report_dioalign() for an explanation about why this + * reports a value larger than the sector size for COW inodes. + */ + if (xfs_is_cow_inode(ip)) + da.d_miniosz = xfs_inode_alloc_unitsize(ip); + else + da.d_miniosz = target->bt_logical_sectorsize; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) @@ -1216,6 +1249,8 @@ xfs_file_ioctl( case XFS_IOC_AG_GEOMETRY: return xfs_ioc_ag_geometry(mp, arg); + case XFS_IOC_RTGROUP_GEOMETRY: + return xfs_ioc_rtgroup_geometry(mp, arg); case XFS_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *)arg); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 916531d9f83c..ff05e6b1b0bb 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -24,11 +24,14 @@ #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_quota.h" +#include "xfs_rtgroup.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" #include "xfs_health.h" #include "xfs_rtbitmap.h" +#include "xfs_icache.h" +#include "xfs_zone_alloc.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -115,7 +118,9 @@ xfs_bmbt_to_iomap( iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_DELALLOC; } else { - iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock)); + xfs_daddr_t daddr = xfs_fsb_to_db(ip, imap->br_startblock); + + iomap->addr = BBTOB(daddr); if (mapping_flags & IOMAP_DAX) iomap->addr += target->bt_dax_part_off; @@ -124,6 +129,14 @@ xfs_bmbt_to_iomap( else iomap->type = IOMAP_MAPPED; + /* + * Mark iomaps starting at the first sector of a RTG as merge + * boundary so that each I/O completions is contained to a + * single RTG. 
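The delalloc throttling changes just below teach the quota code to pick the realtime or regular block reservation (and its matching watermarks) per inode, but the shift ladder itself is unchanged: the closer the reservation sits to the high watermark, the harder the speculative preallocation is cut. A compact restatement (the final prealloc >>= shift application lives in the caller, xfs_iomap_prealloc_size, and is inferred here):

#include <stdint.h>
#include <stdio.h>

enum { QLOWSP_1_PCNT, QLOWSP_3_PCNT, QLOWSP_5_PCNT };

/* mirrors the shift accumulation in xfs_quota_calc_throttle() */
static int quota_throttle_shift(int64_t reserved, int64_t hi_wmark,
				const int64_t low_space[3])
{
	int64_t freesp = hi_wmark - reserved;
	int shift = 0;

	if (freesp < low_space[QLOWSP_5_PCNT]) {
		shift = 2;
		if (freesp < low_space[QLOWSP_3_PCNT])
			shift += 2;
		if (freesp < low_space[QLOWSP_1_PCNT])
			shift += 2;
	}
	return shift;
}

int main(void)
{
	const int64_t low[3] = { 10, 30, 50 };	/* 1%, 3%, 5% of 1000 */
	int64_t prealloc = 256;

	/* 975 of 1000 blocks reserved: freesp is 25, under the 3% mark */
	prealloc >>= quota_throttle_shift(975, 1000, low);
	printf("throttled prealloc: %lld blocks\n", (long long)prealloc);
	return 0;
}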
+ */ + if (XFS_IS_REALTIME_INODE(ip) && xfs_has_rtgroups(mp) && + xfs_rtbno_is_group_start(mp, imap->br_startblock)) + iomap->flags |= IOMAP_F_BOUNDARY; } iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); @@ -342,16 +355,26 @@ xfs_quota_need_throttle( xfs_fsblock_t alloc_blocks) { struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + struct xfs_dquot_res *res; + struct xfs_dquot_pre *pre; if (!dq || !xfs_this_quota_on(ip->i_mount, type)) return false; + if (XFS_IS_REALTIME_INODE(ip)) { + res = &dq->q_rtb; + pre = &dq->q_rtb_prealloc; + } else { + res = &dq->q_blk; + pre = &dq->q_blk_prealloc; + } + /* no hi watermark, no throttle */ - if (!dq->q_prealloc_hi_wmark) + if (!pre->q_prealloc_hi_wmark) return false; /* under the lo watermark, no throttle */ - if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark) + if (res->reserved + alloc_blocks < pre->q_prealloc_lo_wmark) return false; return true; @@ -366,22 +389,35 @@ xfs_quota_calc_throttle( int64_t *qfreesp) { struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + struct xfs_dquot_res *res; + struct xfs_dquot_pre *pre; int64_t freesp; int shift = 0; + if (!dq) { + res = NULL; + pre = NULL; + } else if (XFS_IS_REALTIME_INODE(ip)) { + res = &dq->q_rtb; + pre = &dq->q_rtb_prealloc; + } else { + res = &dq->q_blk; + pre = &dq->q_blk_prealloc; + } + /* no dq, or over hi wmark, squash the prealloc completely */ - if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) { + if (!res || res->reserved >= pre->q_prealloc_hi_wmark) { *qblocks = 0; *qfreesp = 0; return; } - freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved; - if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) { + freesp = pre->q_prealloc_hi_wmark - res->reserved; + if (freesp < pre->q_low_space[XFS_QLOWSP_5_PCNT]) { shift = 2; - if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT]) + if (freesp < pre->q_low_space[XFS_QLOWSP_3_PCNT]) shift += 2; - if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT]) + if (freesp < pre->q_low_space[XFS_QLOWSP_1_PCNT]) shift += 2; } @@ -397,13 +433,14 @@ xfs_quota_calc_throttle( static int64_t xfs_iomap_freesp( - struct percpu_counter *counter, + struct xfs_mount *mp, + unsigned int idx, uint64_t low_space[XFS_LOWSP_MAX], int *shift) { int64_t freesp; - freesp = percpu_counter_read_positive(counter); + freesp = xfs_estimate_freecounter(mp, idx); if (freesp < low_space[XFS_LOWSP_5_PCNT]) { *shift = 2; if (freesp < low_space[XFS_LOWSP_4_PCNT]) @@ -501,11 +538,11 @@ xfs_iomap_prealloc_size( alloc_blocks); if (unlikely(XFS_IS_REALTIME_INODE(ip))) - freesp = xfs_rtx_to_rtb(mp, - xfs_iomap_freesp(&mp->m_frextents, + freesp = xfs_rtbxlen_to_blen(mp, + xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS, mp->m_low_rtexts, &shift)); else - freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space, + freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space, &shift); /* @@ -707,7 +744,7 @@ imap_needs_cow( return false; /* when zeroing we don't have to COW holes or unwritten extents */ - if (flags & IOMAP_ZERO) { + if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) { if (!nimaps || imap->br_startblock == HOLESTARTBLOCK || imap->br_state == XFS_EXT_UNWRITTEN) @@ -761,6 +798,38 @@ imap_spans_range( return true; } +static bool +xfs_bmap_hw_atomic_write_possible( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t len = XFS_FSB_TO_B(mp, end_fsb - offset_fsb); + + /* + * atomic writes are required to be naturally aligned 
for disk blocks, + * which ensures that we adhere to block layer rules that we won't + * straddle any boundary or violate write alignment requirement. + */ + if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount)) + return false; + + /* + * Spanning multiple extents would mean that multiple BIOs would be + * issued, and so would lose atomicity required for REQ_ATOMIC-based + * atomics. + */ + if (!imap_spans_range(imap, offset_fsb, end_fsb)) + return false; + + /* + * The ->iomap_begin caller should ensure this, but check anyway. + */ + return len <= xfs_inode_buftarg(ip)->bt_bdev_awu_max; +} + static int xfs_direct_write_iomap_begin( struct inode *inode, @@ -775,9 +844,11 @@ xfs_direct_write_iomap_begin( struct xfs_bmbt_irec imap, cmap; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); + xfs_fileoff_t orig_end_fsb = end_fsb; int nimaps = 1, error = 0; bool shared = false; u16 iomap_flags = 0; + bool needs_alloc; unsigned int lockmode; u64 seq; @@ -794,6 +865,10 @@ xfs_direct_write_iomap_begin( if (offset + length > i_size_read(inode)) iomap_flags |= IOMAP_F_DIRTY; + /* HW-offload atomics are always used in this path */ + if (flags & IOMAP_ATOMIC) + iomap_flags |= IOMAP_F_ATOMIC_BIO; + /* * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. @@ -834,13 +909,37 @@ relock: (flags & IOMAP_DIRECT) || IS_DAX(inode)); if (error) goto out_unlock; - if (shared) + if (shared) { + if ((flags & IOMAP_ATOMIC) && + !xfs_bmap_hw_atomic_write_possible(ip, &cmap, + offset_fsb, end_fsb)) { + error = -ENOPROTOOPT; + goto out_unlock; + } goto out_found_cow; + } end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; } - if (imap_needs_alloc(inode, flags, &imap, nimaps)) + needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps); + + if (flags & IOMAP_ATOMIC) { + error = -ENOPROTOOPT; + /* + * If we allocate less than what is required for the write + * then we may end up with multiple extents, which means that + * REQ_ATOMIC-based cannot be used, so avoid this possibility. + */ + if (needs_alloc && orig_end_fsb - offset_fsb > 1) + goto out_unlock; + + if (!xfs_bmap_hw_atomic_write_possible(ip, &imap, offset_fsb, + orig_end_fsb)) + goto out_unlock; + } + + if (needs_alloc) goto allocate_blocks; /* @@ -928,6 +1027,187 @@ const struct iomap_ops xfs_direct_write_iomap_ops = { .iomap_begin = xfs_direct_write_iomap_begin, }; +#ifdef CONFIG_XFS_RT +/* + * This is really simple. The space has already been reserved before taking the + * IOLOCK, the actual block allocation is done just before submitting the bio + * and only recorded in the extent map on I/O completion. + */ +static int +xfs_zoned_direct_write_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct xfs_inode *ip = XFS_I(inode); + int error; + + ASSERT(!(flags & IOMAP_OVERWRITE_ONLY)); + + /* + * Needs to be pushed down into the allocator so that only writes into + * a single zone can be supported. + */ + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + + /* + * Ensure the extent list is in memory in so that we don't have to do + * read it from the I/O completion handler. 
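The two gates above for issuing a hardware REQ_ATOMIC write, a naturally aligned start address and a single mapping covering the whole range, can be restated in isolation. Note the alignment test relies on atomic write sizes being powers of two, the same assumption IS_ALIGNED() makes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct mapping {
	uint64_t startblock;	/* disk address, in fs blocks */
	uint64_t startoff;	/* file offset, in fs blocks */
	uint64_t blockcount;
};

/* restates the checks in xfs_bmap_hw_atomic_write_possible() */
static bool hw_atomic_possible(const struct mapping *m,
			       uint64_t offset_fsb, uint64_t end_fsb)
{
	/* natural alignment: start is a multiple of the (pow2) length */
	if (m->startblock & (m->blockcount - 1))
		return false;
	/* one mapping must span the whole write or we'd need two bios */
	return m->startoff <= offset_fsb &&
	       m->startoff + m->blockcount >= end_fsb;
}

int main(void)
{
	struct mapping m = { .startblock = 64, .startoff = 0,
			     .blockcount = 16 };

	printf("%d\n", hw_atomic_possible(&m, 0, 16));	/* 1 */
	m.startblock = 65;				/* misaligned */
	printf("%d\n", hw_atomic_possible(&m, 0, 16));	/* 0 */
	return 0;
}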
+ */ + if (xfs_need_iread_extents(&ip->i_df)) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + } + + iomap->type = IOMAP_MAPPED; + iomap->flags = IOMAP_F_DIRTY; + iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev; + iomap->offset = offset; + iomap->length = length; + iomap->flags = IOMAP_F_ANON_WRITE; + return 0; +} + +const struct iomap_ops xfs_zoned_direct_write_iomap_ops = { + .iomap_begin = xfs_zoned_direct_write_iomap_begin, +}; +#endif /* CONFIG_XFS_RT */ + +static int +xfs_atomic_write_cow_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); + xfs_filblks_t count_fsb = end_fsb - offset_fsb; + int nmaps = 1; + xfs_filblks_t resaligned; + struct xfs_bmbt_irec cmap; + struct xfs_iext_cursor icur; + struct xfs_trans *tp; + unsigned int dblocks = 0, rblocks = 0; + int error; + u64 seq; + + ASSERT(flags & IOMAP_WRITE); + ASSERT(flags & IOMAP_DIRECT); + + if (xfs_is_shutdown(mp)) + return -EIO; + + if (!xfs_can_sw_atomic_write(mp)) { + ASSERT(xfs_can_sw_atomic_write(mp)); + return -EINVAL; + } + + /* blocks are always allocated in this path */ + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + + trace_xfs_iomap_atomic_write_cow(ip, offset, length); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cmap.br_startoff = end_fsb; + if (cmap.br_startoff <= offset_fsb) { + xfs_trim_extent(&cmap, offset_fsb, count_fsb); + goto found; + } + + end_fsb = cmap.br_startoff; + count_fsb = end_fsb - offset_fsb; + + resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, + xfs_get_cowextsz_hint(ip)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + if (XFS_IS_REALTIME_INODE(ip)) { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resaligned; + } else { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + rblocks = 0; + } + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, + rblocks, false, &tp); + if (error) + return error; + + /* extent layout could have changed since the unlock, so check again */ + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cmap.br_startoff = end_fsb; + if (cmap.br_startoff <= offset_fsb) { + xfs_trim_extent(&cmap, offset_fsb, count_fsb); + xfs_trans_cancel(tp); + goto found; + } + + /* + * Allocate the entire reservation as unwritten blocks. + * + * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to + * extszhint, such that there will be a greater chance that future + * atomic writes to that same range will be aligned (and don't require + * this COW-based method). 
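This CoW path is the software fallback backing userspace torn-write protection. For reference, such a write is submitted with pwritev2() and RWF_ATOMIC on an O_DIRECT file descriptor, sized and aligned within the limits that statx() advertises (a sketch; RWF_ATOMIC's value is from the uapi and defined locally in case the libc headers lack it):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* from linux/fs.h uapi */
#endif

int main(int argc, char **argv)
{
	struct iovec iov;
	void *buf;
	int fd = open(argv[1], O_WRONLY | O_DIRECT);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;
	iov.iov_base = buf;
	iov.iov_len = 4096;	/* must fit the statx atomic write limits */
	/* all or nothing: the kernel fails the write rather than tear it */
	if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) != 4096)
		return 1;
	return close(fd);
}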
+ */ + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC | + XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps); + if (error) { + xfs_trans_cancel(tp); + goto out_unlock; + } + + xfs_inode_set_cowblocks_tag(ip); + error = xfs_trans_commit(tp); + if (error) + goto out_unlock; + +found: + if (cmap.br_state != XFS_EXT_NORM) { + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, + count_fsb); + if (error) + goto out_unlock; + cmap.br_state = XFS_EXT_NORM; + } + + length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); + trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +const struct iomap_ops xfs_atomic_write_cow_iomap_ops = { + .iomap_begin = xfs_atomic_write_cow_iomap_begin, +}; + static int xfs_dax_write_iomap_end( struct inode *inode, @@ -942,10 +1222,8 @@ xfs_dax_write_iomap_end( if (!xfs_is_cow_inode(ip)) return 0; - if (!written) { - xfs_reflink_cancel_cow_range(ip, pos, length, true); - return 0; - } + if (!written) + return xfs_reflink_cancel_cow_range(ip, pos, length, true); return xfs_reflink_end_cow(ip, pos, written); } @@ -955,6 +1233,455 @@ const struct iomap_ops xfs_dax_write_iomap_ops = { .iomap_end = xfs_dax_write_iomap_end, }; +/* + * Convert a hole to a delayed allocation. + */ +static void +xfs_bmap_add_extent_hole_delay( + struct xfs_inode *ip, /* incore inode pointer */ + int whichfork, + struct xfs_iext_cursor *icur, + struct xfs_bmbt_irec *new) /* new data to add to file extents */ +{ + struct xfs_ifork *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_filblks_t newlen=0; /* new indirect size */ + xfs_filblks_t oldlen=0; /* old indirect size */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + uint32_t state = xfs_bmap_fork_to_state(whichfork); + xfs_filblks_t temp; /* temp for indirect calculations */ + + ifp = xfs_ifork_ptr(ip, whichfork); + ASSERT(isnullstartblock(new->br_startblock)); + + /* + * Check and set flags if this segment has a left neighbor + */ + if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { + state |= BMAP_LEFT_VALID; + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + /* + * Check and set flags if the current (right) segment exists. + * If it doesn't exist, we're converting the hole at end-of-file. + */ + if (xfs_iext_get_extent(ifp, icur, &right)) { + state |= BMAP_RIGHT_VALID; + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + /* + * Set contiguity flags on the left and right neighbors. + * Don't let extents get too large, even if the pieces are contiguous. + */ + if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + (left.br_blockcount + new->br_blockcount + + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) + state |= BMAP_RIGHT_CONTIG; + + /* + * Switch out based on the contiguity flags. 
+ */ + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with delayed allocations + * on the left and on the right. + * Merge all three into a single extent record. + */ + temp = left.br_blockcount + new->br_blockcount + + right.br_blockcount; + + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + left.br_startblock = nullstartblock(newlen); + left.br_blockcount = temp; + + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); + break; + + case BMAP_LEFT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + temp = left.br_blockcount + new->br_blockcount; + + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + left.br_blockcount = temp; + left.br_startblock = nullstartblock(newlen); + + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); + break; + + case BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + temp = new->br_blockcount + right.br_blockcount; + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + right.br_startoff = new->br_startoff; + right.br_startblock = nullstartblock(newlen); + right.br_blockcount = temp; + xfs_iext_update_extent(ip, state, icur, &right); + break; + + case 0: + /* + * New allocation is not contiguous with another + * delayed allocation. + * Insert a new entry. + */ + oldlen = newlen = 0; + xfs_iext_insert(ip, icur, new, state); + break; + } + if (oldlen != newlen) { + ASSERT(oldlen > newlen); + xfs_add_fdblocks(ip->i_mount, oldlen - newlen); + + /* + * Nothing to do for disk quota accounting here. + */ + xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); + } +} + +/* + * Add a delayed allocation extent to an inode. Blocks are reserved from the + * global pool and the extent inserted into the inode in-core extent tree. + * + * On entry, got refers to the first extent beyond the offset of the extent to + * allocate or eof is specified if no such extent exists. On return, got refers + * to the extent record that was inserted to the inode fork. + * + * Note that the allocated extent may have been merged with contiguous extents + * during insertion into the inode fork. Thus, got does not reflect the current + * state of the inode fork on return. If necessary, the caller can use lastx to + * look up the updated record in the inode fork. + */ +static int +xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t off, + xfs_filblks_t len, + xfs_filblks_t prealloc, + struct xfs_bmbt_irec *got, + struct xfs_iext_cursor *icur, + int eof) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + xfs_extlen_t alen; + xfs_extlen_t indlen; + uint64_t fdblocks; + int error; + xfs_fileoff_t aoff; + bool use_cowextszhint = + whichfork == XFS_COW_FORK && !prealloc; + +retry: + /* + * Cap the alloc length. Keep track of prealloc so we know whether to + * tag the inode before we return. 
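Each delalloc extent carries a worst-case indirect-block reservation (indlen) encoded in its startblock. The worst case for a merged extent never exceeds the sum of the parts, and the oldlen/newlen bookkeeping above returns the difference to the free-space counters. A toy illustration, with a made-up stand-in for xfs_bmap_worst_indlen():

#include <stdint.h>
#include <stdio.h>

/* stand-in only: monotone and concave, like a btree block estimate */
static uint64_t worst_indlen(uint64_t len)
{
	return len / 64 + 1;
}

int main(void)
{
	uint64_t left = 200, mid = 100, right = 84;	/* extent lengths */
	uint64_t oldlen = worst_indlen(left) + worst_indlen(mid) +
			  worst_indlen(right);
	uint64_t newlen = worst_indlen(left + mid + right);

	if (newlen > oldlen)
		newlen = oldlen;	/* XFS_FILBLKS_MIN() in the code */
	/* the merged extent keeps newlen; the rest goes back to fdblocks */
	printf("reservation returned by merge: %llu blocks\n",
	       (unsigned long long)(oldlen - newlen));
	return 0;
}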
+ */ + aoff = off; + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + if (prealloc && alen >= len) + prealloc = alen - len; + + /* + * If we're targetting the COW fork but aren't creating a speculative + * posteof preallocation, try to expand the reservation to align with + * the COW extent size hint if there's sufficient free space. + * + * Unlike the data fork, the CoW cancellation functions will free all + * the reservations at inactivation, so we don't require that every + * delalloc reservation have a dirty pagecache. + */ + if (use_cowextszhint) { + struct xfs_bmbt_irec prev; + xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); + + if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) + prev.br_startoff = NULLFILEOFF; + + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, + 1, 0, &aoff, &alen); + ASSERT(!error); + } + + /* + * Make a transaction-less quota reservation for delayed allocation + * blocks. This number gets adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_quota_reserve_blkres(ip, alen); + if (error) + goto out; + + /* + * Split changing sb for alen and indlen since they could be coming + * from different places. + */ + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + fdblocks = indlen; + if (XFS_IS_REALTIME_INODE(ip)) { + ASSERT(!xfs_is_zoned_inode(ip)); + error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); + if (error) + goto out_unreserve_quota; + } else { + fdblocks += alen; + } + + error = xfs_dec_fdblocks(mp, fdblocks, false); + if (error) + goto out_unreserve_frextents; + + ip->i_delayed_blks += alen; + xfs_mod_delalloc(ip, alen, indlen); + + got->br_startoff = aoff; + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; + + xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); + + /* + * Tag the inode if blocks were preallocated. Note that COW fork + * preallocation can occur at the start or end of the extent, even when + * prealloc == 0, so we must also check the aligned offset and length. 
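On -ENOSPC or -EDQUOT the error path a little further down retries exactly once with all speculation stripped (no posteof preallocation, no CoW extent size alignment), so only a genuinely exhausted pool fails the write. Reduced to its control-flow shape:

#include <errno.h>
#include <stdio.h>

static int reserve(unsigned int len)	/* toy 100-block pool */
{
	static unsigned int pool = 100;

	if (len > pool)
		return -ENOSPC;
	pool -= len;
	return 0;
}

static int reserve_delalloc(unsigned int len, unsigned int prealloc)
{
	int error;

retry:
	error = reserve(len + prealloc);
	if ((error == -ENOSPC || error == -EDQUOT) && prealloc) {
		prealloc = 0;	/* drop all speculation, try once more */
		goto retry;
	}
	return error;
}

int main(void)
{
	/* 80 + 64 overcommits the pool; the bare 80 still succeeds */
	printf("%d\n", reserve_delalloc(80, 64));
	return 0;
}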
+ */ + if (whichfork == XFS_DATA_FORK && prealloc) + xfs_inode_set_eofblocks_tag(ip); + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) + xfs_inode_set_cowblocks_tag(ip); + + return 0; + +out_unreserve_frextents: + if (XFS_IS_REALTIME_INODE(ip)) + xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); +out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_quota_unreserve_blkres(ip, alen); +out: + if (error == -ENOSPC || error == -EDQUOT) { + trace_xfs_delalloc_enospc(ip, off, len); + + if (prealloc || use_cowextszhint) { + /* retry without any preallocation */ + use_cowextszhint = false; + prealloc = 0; + goto retry; + } + } + return error; +} + +static int +xfs_zoned_buffered_write_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t count, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct iomap_iter *iter = + container_of(iomap, struct iomap_iter, iomap); + struct xfs_zone_alloc_ctx *ac = iter->private; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); + u16 iomap_flags = IOMAP_F_SHARED; + unsigned int lockmode = XFS_ILOCK_EXCL; + xfs_filblks_t count_fsb; + xfs_extlen_t indlen; + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + int error = 0; + + ASSERT(!xfs_get_extsz_hint(ip)); + ASSERT(!(flags & IOMAP_UNSHARE)); + ASSERT(ac); + + if (xfs_is_shutdown(mp)) + return -EIO; + + error = xfs_qm_dqattach(ip); + if (error) + return error; + + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; + + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); + error = -EFSCORRUPTED; + goto out_unlock; + } + + XFS_STATS_INC(mp, xs_blk_mapw); + + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + + /* + * For zeroing operations check if there is any data to zero first. + * + * For regular writes we always need to allocate new blocks, but need to + * provide the source mapping when the range is unaligned to support + * read-modify-write of the whole block in the page cache. + * + * In either case we need to limit the reported range to the boundaries + * of the source map in the data fork. + */ + if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) || + !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) || + (flags & IOMAP_ZERO)) { + struct xfs_bmbt_irec smap; + struct xfs_iext_cursor scur; + + if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur, + &smap)) + smap.br_startoff = end_fsb; /* fake hole until EOF */ + if (smap.br_startoff > offset_fsb) { + /* + * We never need to allocate blocks for zeroing a hole. 
+ */ + if (flags & IOMAP_ZERO) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, + smap.br_startoff); + goto out_unlock; + } + end_fsb = min(end_fsb, smap.br_startoff); + } else { + end_fsb = min(end_fsb, + smap.br_startoff + smap.br_blockcount); + xfs_trim_extent(&smap, offset_fsb, + end_fsb - offset_fsb); + error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0, + xfs_iomap_inode_sequence(ip, 0)); + if (error) + goto out_unlock; + } + } + + if (!ip->i_cowfp) + xfs_ifork_init_cow(ip); + + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) + got.br_startoff = end_fsb; + if (got.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &got); + goto done; + } + + /* + * Cap the maximum length to keep the chunks of work done here somewhat + * symmetric with the work writeback does. + */ + end_fsb = min(end_fsb, got.br_startoff); + count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN, + XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE)); + + /* + * The block reservation is supposed to cover all blocks that the + * operation could possible write, but there is a nasty corner case + * where blocks could be stolen from underneath us: + * + * 1) while this thread iterates over a larger buffered write, + * 2) another thread is causing a write fault that calls into + * ->page_mkwrite in range this thread writes to, using up the + * delalloc reservation created by a previous call to this function. + * 3) another thread does direct I/O on the range that the write fault + * happened on, which causes writeback of the dirty data. + * 4) this then set the stale flag, which cuts the current iomap + * iteration short, causing the new call to ->iomap_begin that gets + * us here again, but now without a sufficient reservation. + * + * This is a very unusual I/O pattern, and nothing but generic/095 is + * known to hit it. There's not really much we can do here, so turn this + * into a short write. 
+ */ + if (count_fsb > ac->reserved_blocks) { + xfs_warn_ratelimited(mp, +"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O", + ip->i_ino, current->comm); + count_fsb = ac->reserved_blocks; + if (!count_fsb) { + error = -EIO; + goto out_unlock; + } + } + + error = xfs_quota_reserve_blkres(ip, count_fsb); + if (error) + goto out_unlock; + + indlen = xfs_bmap_worst_indlen(ip, count_fsb); + error = xfs_dec_fdblocks(mp, indlen, false); + if (error) + goto out_unlock; + ip->i_delayed_blks += count_fsb; + xfs_mod_delalloc(ip, count_fsb, indlen); + + got.br_startoff = offset_fsb; + got.br_startblock = nullstartblock(indlen); + got.br_blockcount = count_fsb; + got.br_state = XFS_EXT_NORM; + xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got); + ac->reserved_blocks -= count_fsb; + iomap_flags |= IOMAP_F_NEW; + + trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb), + XFS_COW_FORK, &got); +done: + error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags, + xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED)); +out_unlock: + xfs_iunlock(ip, lockmode); + return error; +} + static int xfs_buffered_write_iomap_begin( struct inode *inode, @@ -981,6 +1708,10 @@ xfs_buffered_write_iomap_begin( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_is_zoned_inode(ip)) + return xfs_zoned_buffered_write_iomap_begin(inode, offset, + count, flags, iomap, srcmap); + /* we can't use delayed allocations when using extent size hints */ if (xfs_get_extsz_hint(ip)) return xfs_direct_write_iomap_begin(inode, offset, count, @@ -1213,10 +1944,13 @@ xfs_buffered_write_delalloc_punch( loff_t length, struct iomap *iomap) { + struct iomap_iter *iter = + container_of(iomap, struct iomap_iter, iomap); + xfs_bmap_punch_delalloc_range(XFS_I(inode), (iomap->flags & IOMAP_F_SHARED) ? XFS_COW_FORK : XFS_DATA_FORK, - offset, offset + length); + offset, offset + length, iter->private); } static int @@ -1234,6 +1968,14 @@ xfs_buffered_write_iomap_end( if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW)) return 0; + /* + * iomap_page_mkwrite() will never fail in a way that requires delalloc + * extents that it allocated to be revoked. Hence never try to release + * them here. + */ + if (flags & IOMAP_FAULT) + return 0; + /* Nothing to do if we've written the entire delalloc extent */ start_byte = iomap_last_written_block(inode, offset, written); end_byte = round_up(offset + length, i_blocksize(inode)); @@ -1260,15 +2002,6 @@ const struct iomap_ops xfs_buffered_write_iomap_ops = { .iomap_end = xfs_buffered_write_iomap_end, }; -/* - * iomap_page_mkwrite() will never fail in a way that requires delalloc extents - * that it allocated to be revoked. Hence we do not need an .iomap_end method - * for this operation. 
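The punch callback above recovers the enclosing iomap_iter (and with it the zone allocation context stashed in iter->private) purely from the address of the embedded iomap. That is standard container_of() pointer arithmetic, shown here self-contained:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct iomap { long flags; };

struct iomap_iter {
	void *private;		/* e.g. a zone allocation context */
	struct iomap iomap;	/* embedded; handed to callbacks */
};

static void punch_cb(struct iomap *iomap)
{
	struct iomap_iter *iter =
		container_of(iomap, struct iomap_iter, iomap);

	printf("private = %s\n", (const char *)iter->private);
}

int main(void)
{
	struct iomap_iter iter = { .private = "zone ctx" };

	punch_cb(&iter.iomap);	/* only the inner pointer crosses the API */
	return 0;
}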
- */ -const struct iomap_ops xfs_page_mkwrite_iomap_ops = { - .iomap_begin = xfs_buffered_write_iomap_begin, -}; - static int xfs_read_iomap_begin( struct inode *inode, @@ -1454,6 +2187,7 @@ xfs_zero_range( struct xfs_inode *ip, loff_t pos, loff_t len, + struct xfs_zone_alloc_ctx *ac, bool *did_zero) { struct inode *inode = VFS_I(ip); @@ -1464,13 +2198,14 @@ xfs_zero_range( return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, ac); } int xfs_truncate_page( struct xfs_inode *ip, loff_t pos, + struct xfs_zone_alloc_ctx *ac, bool *did_zero) { struct inode *inode = VFS_I(ip); @@ -1479,5 +2214,5 @@ xfs_truncate_page( return dax_truncate_page(inode, pos, did_zero, &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, ac); } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 4da13440bae9..674f8ac1b9bd 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -10,6 +10,7 @@ struct xfs_inode; struct xfs_bmbt_irec; +struct xfs_zone_alloc_ctx; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, @@ -24,8 +25,9 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, u16 iomap_flags, u64 sequence_cookie); int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, - bool *did_zero); -int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero); + struct xfs_zone_alloc_ctx *ac, bool *did_zero); +int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, + struct xfs_zone_alloc_ctx *ac, bool *did_zero); static inline xfs_filblks_t xfs_aligned_fsb_count( @@ -48,11 +50,12 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_buffered_write_iomap_ops; -extern const struct iomap_ops xfs_page_mkwrite_iomap_ops; extern const struct iomap_ops xfs_direct_write_iomap_ops; +extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; extern const struct iomap_ops xfs_dax_write_iomap_ops; +extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ee79cf161312..8cddbb7c149b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -29,6 +29,7 @@ #include "xfs_xattr.h" #include "xfs_file.h" #include "xfs_bmap.h" +#include "xfs_zone_alloc.h" #include <linux/posix_acl.h> #include <linux/security.h> @@ -42,7 +43,9 @@ * held. For regular files, the lock order is the other way around - the * mmap_lock is taken during the page fault, and then we lock the ilock to do * block mapping. Hence we need a different class for the directory ilock so - * that lockdep can tell them apart. + * that lockdep can tell them apart. Directories in the metadata directory + * tree get a separate class so that lockdep reports will warn us if someone + * ever tries to lock regular directories after locking metadata directories. 
*/ static struct lock_class_key xfs_nondir_ilock_class; static struct lock_class_key xfs_dir_ilock_class; @@ -296,14 +299,14 @@ xfs_vn_create( return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL); } -STATIC int +STATIC struct dentry * xfs_vn_mkdir( struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL); + return ERR_PTR(xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL)); } STATIC struct dentry * @@ -570,6 +573,112 @@ xfs_stat_blksize( return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize); } +static void +xfs_report_dioalign( + struct xfs_inode *ip, + struct kstat *stat) +{ + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct block_device *bdev = target->bt_bdev; + + stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN; + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + + /* + * For COW inodes, we can only perform out of place writes of entire + * allocation units (blocks or RT extents). + * For writes smaller than the allocation unit, we must fall back to + * buffered I/O to perform read-modify-write cycles. At best this is + * highly inefficient; at worst it leads to page cache invalidation + * races. Tell applications to avoid this by reporting the larger write + * alignment in dio_offset_align, and the smaller read alignment in + * dio_read_offset_align. + */ + stat->dio_read_offset_align = bdev_logical_block_size(bdev); + if (xfs_is_cow_inode(ip)) + stat->dio_offset_align = xfs_inode_alloc_unitsize(ip); + else + stat->dio_offset_align = stat->dio_read_offset_align; +} + +unsigned int +xfs_get_atomic_write_min( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a minimum size of one fsblock. Without this + * mechanism, we can only guarantee atomic writes up to a single LBA. + * + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (xfs_inode_can_hw_atomic_write(ip) || xfs_can_sw_atomic_write(mp)) + return mp->m_sb.sb_blocksize; + + return 0; +} + +unsigned int +xfs_get_atomic_write_max( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (!xfs_can_sw_atomic_write(mp)) { + if (xfs_inode_can_hw_atomic_write(ip)) + return mp->m_sb.sb_blocksize; + return 0; + } + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a maximum size of whatever we can complete through + * that means. Hardware support is reported via max_opt, not here. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max); + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max); +} + +unsigned int +xfs_get_atomic_write_max_opt( + struct xfs_inode *ip) +{ + unsigned int awu_max = xfs_get_atomic_write_max(ip); + + /* if the max is 1x block, then just keep behaviour that opt is 0 */ + if (awu_max <= ip->i_mount->m_sb.sb_blocksize) + return 0; + + /* + * Advertise the maximum size of an atomic write that we can tell the + * block device to perform for us. In general the bdev limit will be + * less than our out of place write limit, but we don't want to exceed + * the awu_max. 
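From userspace, the split between write and read DIO alignment shows up through statx(). A sketch against sufficiently new kernel headers (STATX_DIOALIGN is existing uapi; the STATX_DIO_READ_ALIGN mask and stx_dio_read_offset_align field are introduced by this series, so they are the assumption here):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx))
		return 1;
	/*
	 * On a CoW/reflinked file this patch reports the allocation unit
	 * as the write alignment; the smaller sector-sized read limit
	 * moves to the new STATX_DIO_READ_ALIGN result.
	 */
	printf("dio mem align:    %u\n", stx.stx_dio_mem_align);
	printf("dio offset align: %u\n", stx.stx_dio_offset_align);
	return 0;
}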
+ */ + return min(awu_max, xfs_inode_buftarg(ip)->bt_bdev_awu_max); +} + +static void +xfs_report_atomic_write( + struct xfs_inode *ip, + struct kstat *stat) +{ + generic_fill_statx_atomic_writes(stat, + xfs_get_atomic_write_min(ip), + xfs_get_atomic_write_max(ip), + xfs_get_atomic_write_max_opt(ip)); +} + STATIC int xfs_vn_getattr( struct mnt_idmap *idmap, @@ -597,8 +706,9 @@ xfs_vn_getattr( stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode_get_atime(inode); - stat->mtime = inode_get_mtime(inode); - stat->ctime = inode_get_ctime(inode); + + fill_mg_cmtime(stat, request_mask, inode); + stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); if (xfs_has_v3inodes(mp)) { @@ -608,11 +718,6 @@ xfs_vn_getattr( } } - if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { - stat->change_cookie = inode_query_iversion(inode); - stat->result_mask |= STATX_CHANGE_COOKIE; - } - /* * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. @@ -635,14 +740,10 @@ xfs_vn_getattr( stat->rdev = inode->i_rdev; break; case S_IFREG: - if (request_mask & STATX_DIOALIGN) { - struct xfs_buftarg *target = xfs_inode_buftarg(ip); - struct block_device *bdev = target->bt_bdev; - - stat->result_mask |= STATX_DIOALIGN; - stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; - stat->dio_offset_align = bdev_logical_block_size(bdev); - } + if (request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) + xfs_report_dioalign(ip, stat); + if (request_mask & STATX_WRITE_ATOMIC) + xfs_report_atomic_write(ip, stat); fallthrough; default: stat->blksize = xfs_stat_blksize(ip); @@ -820,6 +921,7 @@ xfs_setattr_size( uint lock_flags = 0; uint resblks = 0; bool did_zeroing = false; + struct xfs_zone_alloc_ctx ac = { }; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); ASSERT(S_ISREG(inode->i_mode)); @@ -856,6 +958,28 @@ xfs_setattr_size( inode_dio_wait(inode); /* + * Normally xfs_zoned_space_reserve is supposed to be called outside the + * IOLOCK. For truncate we can't do that since ->setattr is called with + * it already held by the VFS. So for now chicken out and try to + * allocate space under it. + * + * To avoid deadlocks this means we can't block waiting for space, which + * can lead to spurious -ENOSPC if there are no directly available + * blocks. We mitigate this a bit by allowing zeroing to dip into the + * reserved pool, but eventually the VFS calling convention needs to + * change. + */ + if (xfs_is_zoned_inode(ip)) { + error = xfs_zoned_space_reserve(ip, 1, + XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac); + if (error) { + if (error == -EAGAIN) + return -ENOSPC; + return error; + } + } + + /* * File data changes must be complete before we start the transaction to * modify the inode. 
This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a @@ -868,11 +992,14 @@ xfs_setattr_size( if (newsize > oldsize) { trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); error = xfs_zero_range(ip, oldsize, newsize - oldsize, - &did_zeroing); + &ac, &did_zeroing); } else { - error = xfs_truncate_page(ip, newsize, &did_zeroing); + error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing); } + if (xfs_is_zoned_inode(ip)) + xfs_zoned_space_unreserve(ip, &ac); + if (error) return error; @@ -1289,6 +1416,7 @@ xfs_setup_inode( { struct inode *inode = &ip->i_vnode; gfp_t gfp_mask; + bool is_meta = xfs_is_internal_inode(ip); inode->i_ino = ip->i_ino; inode->i_state |= I_NEW; @@ -1300,6 +1428,16 @@ xfs_setup_inode( i_size_write(inode, ip->i_disk_size); xfs_diflags_to_iflags(ip, true); + /* + * Mark our metadata files as private so that LSMs and the ACL code + * don't try to add their own metadata or reason about these files, + * and users cannot ever obtain file handles to them. + */ + if (is_meta) { + inode->i_flags |= S_PRIVATE; + inode->i_opflags &= ~IOP_XATTR; + } + if (S_ISDIR(inode->i_mode)) { /* * We set the i_rwsem class here to avoid potential races with diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 3c1a2605ffd2..0896f6b8b3b8 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -19,5 +19,8 @@ int xfs_inode_init_security(struct inode *inode, struct inode *dir, extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); +unsigned int xfs_get_atomic_write_min(struct xfs_inode *ip); +unsigned int xfs_get_atomic_write_max(struct xfs_inode *ip); +unsigned int xfs_get_atomic_write_max_opt(struct xfs_inode *ip); #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c0757ab99495..1fa1c0564b0c 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -36,6 +36,14 @@ struct xfs_bstat_chunk { struct xfs_bulkstat *buf; }; +static inline bool +want_metadir_file( + struct xfs_inode *ip, + struct xfs_ibulk *breq) +{ + return xfs_is_metadir_inode(ip) && (breq->flags & XFS_IBULK_METADIR); +} + /* * Fill out the bulkstat info for a single inode and report it somewhere. * @@ -69,9 +77,6 @@ xfs_bulkstat_one_int( vfsuid_t vfsuid; vfsgid_t vfsgid; - if (xfs_internal_inum(mp, ino)) - goto out_advance; - error = xfs_iget(mp, tp, ino, (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED), XFS_ILOCK_SHARED, &ip); @@ -97,8 +102,28 @@ xfs_bulkstat_one_int( vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); + /* + * If caller wants files from the metadata directories, push out the + * bare minimum information for enabling scrub. + */ + if (want_metadir_file(ip, bc->breq)) { + memset(buf, 0, sizeof(*buf)); + buf->bs_ino = ino; + buf->bs_gen = inode->i_generation; + buf->bs_mode = inode->i_mode & S_IFMT; + xfs_bulkstat_health(ip, buf); + buf->bs_version = XFS_BULKSTAT_VERSION_V5; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_irele(ip); + + error = bc->formatter(bc->breq, buf); + if (!error || error == -ECANCELED) + goto out_advance; + goto out; + } + /* If this is a private inode, don't leak its details to userspace. 
*/ - if (IS_PRIVATE(inode)) { + if (IS_PRIVATE(inode) || xfs_is_sb_inum(mp, ino)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); xfs_irele(ip); error = -EINVAL; diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 1659f13f17a8..f10e8f8f2335 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -22,6 +22,9 @@ struct xfs_ibulk { /* Fill out the bs_extents64 field if set. */ #define XFS_IBULK_NREXT64 (1U << 1) +/* Signal that we can return metadata directories. */ +#define XFS_IBULK_METADIR (1U << 2) + /* * Advance the user buffer pointer by one record of the given size. If the * buffer is now full, return the appropriate error code. diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c index 2ddccb172fa0..1fd70a7aed63 100644 --- a/fs/xfs/xfs_iunlink_item.c +++ b/fs/xfs/xfs_iunlink_item.c @@ -52,14 +52,14 @@ xfs_iunlink_log_dinode( struct xfs_trans *tp, struct xfs_iunlink_item *iup) { - struct xfs_mount *mp = tp->t_mountp; struct xfs_inode *ip = iup->ip; struct xfs_dinode *dip; struct xfs_buf *ibp; + xfs_agino_t old_ptr; int offset; int error; - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp); + error = xfs_imap_to_bp(tp->t_mountp, tp, &ip->i_imap, &ibp); if (error) return error; /* @@ -73,22 +73,21 @@ xfs_iunlink_log_dinode( dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); /* Make sure the old pointer isn't garbage. */ - if (be32_to_cpu(dip->di_next_unlinked) != iup->old_agino) { + old_ptr = be32_to_cpu(dip->di_next_unlinked); + if (old_ptr != iup->old_agino) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); error = -EFSCORRUPTED; goto out; } - trace_xfs_iunlink_update_dinode(mp, iup->pag->pag_agno, - XFS_INO_TO_AGINO(mp, ip->i_ino), - be32_to_cpu(dip->di_next_unlinked), iup->next_agino); + trace_xfs_iunlink_update_dinode(iup, old_ptr); dip->di_next_unlinked = cpu_to_be32(iup->next_agino); offset = ip->i_imap.im_boffset + offsetof(struct xfs_dinode, di_next_unlinked); - xfs_dinode_calc_crc(mp, dip); + xfs_dinode_calc_crc(tp->t_mountp, dip); xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); return 0; diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 86f14ec7c31f..7db3ece370b1 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -100,7 +100,6 @@ xfs_iwalk_ichunk_ra( struct xfs_inobt_rec_incore *irec) { struct xfs_ino_geometry *igeo = M_IGEO(mp); - xfs_agnumber_t agno = pag->pag_agno; xfs_agblock_t agbno; struct blk_plug plug; int i; /* inode chunk index */ @@ -114,7 +113,7 @@ xfs_iwalk_ichunk_ra( imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster); if (imask & ~irec->ir_free) { xfs_buf_readahead(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, agbno), + xfs_agbno_to_daddr(pag, agbno), igeo->blocks_per_cluster * mp->m_bsize, &xfs_inode_buf_ops); } @@ -177,20 +176,19 @@ xfs_iwalk_ag_recs( struct xfs_mount *mp = iwag->mp; struct xfs_trans *tp = iwag->tp; struct xfs_perag *pag = iwag->pag; - xfs_ino_t ino; unsigned int i, j; int error; for (i = 0; i < iwag->nr_recs; i++) { struct xfs_inobt_rec_incore *irec = &iwag->recs[i]; - trace_xfs_iwalk_ag_rec(mp, pag->pag_agno, irec); + trace_xfs_iwalk_ag_rec(pag, irec); if (xfs_pwork_want_abort(&iwag->pwork)) return 0; if (iwag->inobt_walk_fn) { - error = iwag->inobt_walk_fn(mp, tp, pag->pag_agno, irec, + error = iwag->inobt_walk_fn(mp, tp, pag_agno(pag), irec, iwag->data); if (error) return error; @@ -208,9 +206,10 @@ xfs_iwalk_ag_recs( continue; /* Otherwise call our function. 
*/ - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, - irec->ir_startino + j); - error = iwag->iwalk_fn(mp, tp, ino, iwag->data); + error = iwag->iwalk_fn(mp, tp, + xfs_agino_to_ino(pag, + irec->ir_startino + j), + iwag->data); if (error) return error; } @@ -305,7 +304,7 @@ xfs_iwalk_ag_start( return -EFSCORRUPTED; } - iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno, + iwag->lastino = xfs_agino_to_ino(pag, irec->ir_startino + XFS_INODES_PER_CHUNK - 1); /* @@ -406,7 +405,7 @@ xfs_iwalk_ag( int error = 0; /* Set up our cursor at the right place in the inode btree. */ - ASSERT(pag->pag_agno == XFS_INO_TO_AGNO(mp, iwag->startino)); + ASSERT(pag_agno(pag) == XFS_INO_TO_AGNO(mp, iwag->startino)); agino = XFS_INO_TO_AGINO(mp, iwag->startino); error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more); @@ -425,7 +424,7 @@ xfs_iwalk_ag( break; /* Make sure that we always move forward. */ - rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino); + rec_fsino = xfs_agino_to_ino(pag, irec->ir_startino); if (iwag->lastino != NULLFSINO && XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) { xfs_btree_mark_sick(cur); @@ -535,6 +534,37 @@ xfs_iwalk_prefetch( return max(inobt_records, 2U); } +static int +xfs_iwalk_args( + struct xfs_iwalk_ag *iwag, + unsigned int flags) +{ + struct xfs_mount *mp = iwag->mp; + xfs_agnumber_t start_agno; + int error; + + start_agno = XFS_INO_TO_AGNO(iwag->mp, iwag->startino); + ASSERT(start_agno < iwag->mp->m_sb.sb_agcount); + ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); + + error = xfs_iwalk_alloc(iwag); + if (error) + return error; + + while ((iwag->pag = xfs_perag_next_from(mp, iwag->pag, start_agno))) { + error = xfs_iwalk_ag(iwag); + if (error || (flags & XFS_IWALK_SAME_AG)) { + xfs_perag_rele(iwag->pag); + break; + } + iwag->startino = + XFS_AGINO_TO_INO(mp, pag_agno(iwag->pag) + 1, 0); + } + + xfs_iwalk_free(iwag); + return error; +} + /* * Walk all inodes in the filesystem starting from @startino. The @iwalk_fn * will be called for each allocated inode, being passed the inode's number and @@ -563,32 +593,8 @@ xfs_iwalk( .pwork = XFS_PWORK_SINGLE_THREADED, .lastino = NULLFSINO, }; - struct xfs_perag *pag; - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); - int error; - - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); - - error = xfs_iwalk_alloc(&iwag); - if (error) - return error; - - for_each_perag_from(mp, agno, pag) { - iwag.pag = pag; - error = xfs_iwalk_ag(&iwag); - if (error) - break; - iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); - if (flags & XFS_INOBT_WALK_SAME_AG) - break; - iwag.pag = NULL; - } - if (iwag.pag) - xfs_perag_rele(pag); - xfs_iwalk_free(&iwag); - return error; + return xfs_iwalk_args(&iwag, flags); } /* Run per-thread iwalk work. 
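xfs_iwalk_args() above consolidates three nearly identical loops around the new pass-the-previous-reference iterator: xfs_perag_next_from() drops the reference it was handed and returns the next group, so one while loop handles iteration, and early exit needs just a single explicit release. The shape of that pattern, modelled in userspace:

#include <stdio.h>

struct group { int refcount; };

static struct group groups[4];

/* put the previous group, get the next one, NULL at the end */
static struct group *group_next_from(struct group *prev, int start)
{
	int idx = prev ? (int)(prev - groups) + 1 : start;

	if (prev)
		prev->refcount--;
	if (idx >= 4)
		return NULL;
	groups[idx].refcount++;
	return &groups[idx];
}

int main(void)
{
	struct group *g = NULL;

	while ((g = group_next_from(g, 0))) {
		printf("visiting group %d\n", (int)(g - groups));
		if (g - groups == 2) {
			g->refcount--;	/* early exit: one explicit put */
			break;
		}
	}
	return 0;
}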
*/ @@ -640,19 +646,19 @@ xfs_iwalk_threaded( bool polled, void *data) { + xfs_agnumber_t start_agno = XFS_INO_TO_AGNO(mp, startino); struct xfs_pwork_ctl pctl; - struct xfs_perag *pag; - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); + struct xfs_perag *pag = NULL; int error; - ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(start_agno < mp->m_sb.sb_agcount); ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk"); if (error) return error; - for_each_perag_from(mp, agno, pag) { + while ((pag = xfs_perag_next_from(mp, pag, start_agno))) { struct xfs_iwalk_ag *iwag; if (xfs_pwork_ctl_want_abort(&pctl)) @@ -673,8 +679,8 @@ xfs_iwalk_threaded( iwag->sz_recs = xfs_iwalk_prefetch(inode_records); iwag->lastino = NULLFSINO; xfs_pwork_queue(&pctl, &iwag->pwork); - startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0); - if (flags & XFS_INOBT_WALK_SAME_AG) + startino = XFS_AGINO_TO_INO(mp, pag_agno(pag) + 1, 0); + if (flags & XFS_IWALK_SAME_AG) break; } if (pag) @@ -748,30 +754,6 @@ xfs_inobt_walk( .pwork = XFS_PWORK_SINGLE_THREADED, .lastino = NULLFSINO, }; - struct xfs_perag *pag; - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); - int error; - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL)); - - error = xfs_iwalk_alloc(&iwag); - if (error) - return error; - - for_each_perag_from(mp, agno, pag) { - iwag.pag = pag; - error = xfs_iwalk_ag(&iwag); - if (error) - break; - iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0); - if (flags & XFS_INOBT_WALK_SAME_AG) - break; - iwag.pag = NULL; - } - - if (iwag.pag) - xfs_perag_rele(pag); - xfs_iwalk_free(&iwag); - return error; + return xfs_iwalk_args(&iwag, flags); } diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h index 83699089755e..17a5a2c6debb 100644 --- a/fs/xfs/xfs_iwalk.h +++ b/fs/xfs/xfs_iwalk.h @@ -25,7 +25,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino, unsigned int flags, xfs_iwalk_fn iwalk_fn, unsigned int inode_records, bool poll, void *data); -/* Only iterate inodes within the same AG as @startino. */ +/* Only iterate within the same AG as @startino. */ #define XFS_IWALK_SAME_AG (1U << 0) #define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG) @@ -41,9 +41,4 @@ int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_inobt_walk_fn inobt_walk_fn, unsigned int inobt_records, void *data); -/* Only iterate inobt records within the same AG as @startino. 
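In the xfs_log.c hunk a little further down, the open-coded xlog_map_iclog_data() loop is deleted in favour of bio_add_vmalloc(), which has to do the same work internally: a vmalloc'd iclog buffer is only virtually contiguous, so it must be added to the bio one page-sized chunk at a time. The chunking arithmetic of the removed loop, as a standalone model:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* split a virtually contiguous range into per-page (offset, len) chunks */
static void map_chunks(unsigned long data, unsigned long count)
{
	while (count) {
		unsigned long off = data & (PAGE_SIZE - 1);
		unsigned long len = count < PAGE_SIZE - off ?
				    count : PAGE_SIZE - off;

		printf("page %#lx off %lu len %lu\n",
		       data & ~(PAGE_SIZE - 1), off, len);
		data += len;
		count -= len;
	}
}

int main(void)
{
	map_chunks(0x1f00, 8000);	/* 256 + 4096 + 3648: three pages */
	return 0;
}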
*/ -#define XFS_INOBT_WALK_SAME_AG (XFS_IWALK_SAME_AG) - -#define XFS_INOBT_WALK_FLAGS_ALL (XFS_INOBT_WALK_SAME_AG) - #endif /* __XFS_IWALK_H__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 26b2f5887b88..793468b4d30d 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -20,6 +20,7 @@ #include "xfs_sysfs.h" #include "xfs_sb.h" #include "xfs_health.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_log_ticket_cache; @@ -1606,27 +1607,6 @@ xlog_bio_end_io( &iclog->ic_end_io_work); } -static int -xlog_map_iclog_data( - struct bio *bio, - void *data, - size_t count) -{ - do { - struct page *page = kmem_to_page(data); - unsigned int off = offset_in_page(data); - size_t len = min_t(size_t, count, PAGE_SIZE - off); - - if (bio_add_page(bio, page, len, off) != len) - return -EIO; - - data += len; - count -= len; - } while (count); - - return 0; -} - STATIC void xlog_write_iclog( struct xlog *log, @@ -1692,11 +1672,12 @@ xlog_write_iclog( iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) - goto shutdown; - - if (is_vmalloc_addr(iclog->ic_data)) - flush_kernel_vmap_range(iclog->ic_data, count); + if (is_vmalloc_addr(iclog->ic_data)) { + if (!bio_add_vmalloc(&iclog->ic_bio, iclog->ic_data, count)) + goto shutdown; + } else { + bio_add_virt_nofail(&iclog->ic_bio, iclog->ic_data, count); + } /* * If this log buffer would straddle the end of the log we will have @@ -2744,8 +2725,6 @@ xfs_log_ticket_regrant( if (!ticket->t_cnt) { xlog_grant_add_space(&log->l_reserve_head, ticket->t_unit_res); trace_xfs_log_ticket_regrant_exit(log, ticket); - - ticket->t_curr_res = ticket->t_unit_res; } xfs_log_ticket_put(ticket); @@ -2889,7 +2868,7 @@ xlog_force_and_check_iclog( * * 1. the current iclog is active and has no data; the previous iclog * is in the active or dirty state. - * 2. the current iclog is drity, and the previous iclog is in the + * 2. the current iclog is dirty, and the previous iclog is in the * active or dirty state. * * We may sleep if: @@ -3456,6 +3435,16 @@ xlog_force_shutdown( return false; /* + * Ensure that there is only ever one log shutdown being processed. + * If we allow the log force below on a second pass after shutting + * down the log, we risk deadlocking the CIL push as it may require + * locks on objects the current shutdown context holds (e.g. taking + * buffer locks to abort buffers on last unpin of buf log items). + */ + if (test_and_set_bit(XLOG_SHUTDOWN_STARTED, &log->l_opstate)) + return false; + + /* * Flush all the completed transactions to disk before marking the log * being shut down. 
We need to do this first as shutting down the log * before the force will prevent the log force from flushing the iclogs @@ -3487,6 +3476,7 @@ xlog_force_shutdown( spin_lock(&log->l_icloglock); if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) { spin_unlock(&log->l_icloglock); + ASSERT(0); return false; } spin_unlock(&log->l_icloglock); @@ -3531,6 +3521,9 @@ xlog_force_shutdown( spin_unlock(&log->l_icloglock); wake_up_var(&log->l_opstate); + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp)) + xfs_zoned_wake_all(log->l_mp); + return log_error; } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 80da0cf87d7a..f66d2d430e4f 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -171,11 +171,8 @@ xlog_cil_insert_pcp_aggregate( */ for_each_cpu(cpu, &ctx->cil_pcpmask) { struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); - int old = READ_ONCE(cilpcp->space_used); - while (!try_cmpxchg(&cilpcp->space_used, &old, 0)) - ; - count += old; + count += xchg(&cilpcp->space_used, 0); } atomic_add(count, &ctx->space_used); } @@ -312,9 +309,7 @@ xlog_cil_alloc_shadow_bufs( * Then round nbytes up to 64-bit alignment so that the initial * buffer alignment is easy to calculate and verify. */ - nbytes += niovecs * - (sizeof(uint64_t) + sizeof(struct xlog_op_header)); - nbytes = round_up(nbytes, sizeof(uint64_t)); + nbytes = xlog_item_space(niovecs, nbytes); /* * The data buffer needs to start 64-bit aligned, so round up @@ -907,7 +902,7 @@ xlog_cil_committed( xlog_cil_ail_insert(ctx, abort); xfs_extent_busy_sort(&ctx->busy_extents.extent_list); - xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list, + xfs_extent_busy_clear(&ctx->busy_extents.extent_list, xfs_has_discard(mp) && !abort); spin_lock(&ctx->cil->xc_push_lock); @@ -917,7 +912,6 @@ xlog_cil_committed( xlog_cil_free_logvec(&ctx->lv_chain); if (!list_empty(&ctx->busy_extents.extent_list)) { - ctx->busy_extents.mount = mp; ctx->busy_extents.owner = ctx; xfs_discard_extents(mp, &ctx->busy_extents); return; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b8778a4fd6b6..39a102cc1b43 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -458,6 +458,7 @@ struct xlog { #define XLOG_IO_ERROR 2 /* log hit an I/O error, and being shutdown */ #define XLOG_TAIL_WARN 3 /* log tail verify warning issued */ +#define XLOG_SHUTDOWN_STARTED 4 /* xlog_force_shutdown() exclusion */ static inline bool xlog_recovery_needed(struct xlog *log) @@ -697,4 +698,17 @@ xlog_kvmalloc( return p; } +/* + * Given a count of iovecs and space for a log item, compute the space we need + * in the log to store that data plus the log headers. 
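+ *
+ * Worked example (editorial illustration, not part of this patch): an item
+ * with niovecs = 2 and nbytes = 100 is charged
+ * 100 + 2 * (sizeof(uint64_t) + sizeof(struct xlog_op_header)) bytes,
+ * rounded up to the next sizeof(uint64_t) boundary; this is the same sizing
+ * that xlog_cil_alloc_shadow_bufs() now obtains from this helper.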
+ */ +static inline unsigned int +xlog_item_space( + unsigned int niovecs, + unsigned int nbytes) +{ + nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header)); + return round_up(nbytes, sizeof(uint64_t)); +} + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 704aaadb61cf..2f76531842f8 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1818,6 +1818,12 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { &xlog_attrd_item_ops, &xlog_xmi_item_ops, &xlog_xmd_item_ops, + &xlog_rtefi_item_ops, + &xlog_rtefd_item_ops, + &xlog_rtrui_item_ops, + &xlog_rtrud_item_ops, + &xlog_rtcui_item_ops, + &xlog_rtcud_item_ops, }; static const struct xlog_recover_item_ops * @@ -2677,7 +2683,7 @@ xlog_recover_clear_agi_bucket( struct xfs_perag *pag, int bucket) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_trans *tp; struct xfs_agi *agi; struct xfs_buf *agibp; @@ -2708,7 +2714,7 @@ out_abort: xfs_trans_cancel(tp); out_error: xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, - pag->pag_agno); + pag_agno(pag)); return; } @@ -2718,7 +2724,7 @@ xlog_recover_iunlink_bucket( struct xfs_agi *agi, int bucket) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_inode *prev_ip = NULL; struct xfs_inode *ip; xfs_agino_t prev_agino, agino; @@ -2726,9 +2732,8 @@ xlog_recover_iunlink_bucket( agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { - error = xfs_iget(mp, NULL, - XFS_AGINO_TO_INO(mp, pag->pag_agno, agino), - 0, 0, &ip); + error = xfs_iget(mp, NULL, xfs_agino_to_ino(pag, agino), 0, 0, + &ip); if (error) break; @@ -2846,10 +2851,9 @@ static void xlog_recover_process_iunlinks( struct xlog *log) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; - for_each_perag(log->l_mp, agno, pag) + while ((pag = xfs_perag_next(log->l_mp, pag))) xlog_recover_iunlink_ag(pag); } @@ -3376,7 +3380,7 @@ xlog_do_recover( */ xfs_buf_lock(bp); xfs_buf_hold(bp); - error = _xfs_buf_read(bp, XBF_READ); + error = _xfs_buf_read(bp); if (error) { if (!xlog_is_shutdown(log)) { xfs_buf_ioerror_alert(bp, __this_address); diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 8f495cc23903..19aba2c3d525 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -131,3 +131,42 @@ xfs_buf_alert_ratelimited( __xfs_printk(KERN_ALERT, mp, &vaf); va_end(args); } + +void +xfs_warn_experimental( + struct xfs_mount *mp, + enum xfs_experimental_feat feat) +{ + static const struct { + const char *name; + long opstate; + } features[] = { + [XFS_EXPERIMENTAL_SHRINK] = { + .opstate = XFS_OPSTATE_WARNED_SHRINK, + .name = "online shrink", + }, + [XFS_EXPERIMENTAL_LARP] = { + .opstate = XFS_OPSTATE_WARNED_LARP, + .name = "logged extended attributes", + }, + [XFS_EXPERIMENTAL_LBS] = { + .opstate = XFS_OPSTATE_WARNED_LBS, + .name = "large block size", + }, + [XFS_EXPERIMENTAL_METADIR] = { + .opstate = XFS_OPSTATE_WARNED_METADIR, + .name = "metadata directory tree", + }, + [XFS_EXPERIMENTAL_ZONED] = { + .opstate = XFS_OPSTATE_WARNED_ZONED, + .name = "zoned RT device", + }, + }; + ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX); + BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX); + + if (xfs_should_warn(mp, features[feat].opstate)) + xfs_warn(mp, + "EXPERIMENTAL %s feature enabled. 
Use at your own risk!", + features[feat].name); +} diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index cc323775a12c..d68e72379f9d 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -75,12 +75,6 @@ do { \ #define xfs_debug_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) -#define xfs_warn_mount(mp, warntag, fmt, ...) \ -do { \ - if (xfs_should_warn((mp), (warntag))) \ - xfs_warn((mp), (fmt), ##__VA_ARGS__); \ -} while (0) - #define xfs_warn_once(dev, fmt, ...) \ xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__) #define xfs_notice_once(dev, fmt, ...) \ @@ -96,4 +90,15 @@ extern void xfs_hex_dump(const void *p, int length); void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg, const char *fmt, ...); +enum xfs_experimental_feat { + XFS_EXPERIMENTAL_SHRINK, + XFS_EXPERIMENTAL_LARP, + XFS_EXPERIMENTAL_LBS, + XFS_EXPERIMENTAL_METADIR, + XFS_EXPERIMENTAL_ZONED, + + XFS_EXPERIMENTAL_MAX, +}; +void xfs_warn_experimental(struct xfs_mount *mp, enum xfs_experimental_feat f); + #endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 25bbcc3f4ee0..29276fe60df9 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -35,7 +35,12 @@ #include "xfs_trace.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" +#include "xfs_metafile.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/stats.h" +#include "xfs_zone_alloc.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -177,14 +182,11 @@ xfs_readsb( /* * Allocate a (locked) buffer to hold the superblock. This will be kept - * around at all times to optimize access to the superblock. Therefore, - * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count - * elevated. + * around at all times to optimize access to the superblock. */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), XBF_NO_IOACCT, &bp, - buf_ops); + BTOBB(sector_size), &bp, buf_ops); if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); @@ -412,7 +414,7 @@ xfs_check_sizes( } error = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), &bp, NULL); if (error) { xfs_warn(mp, "last sector read failed"); return error; @@ -429,7 +431,7 @@ xfs_check_sizes( } error = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSB_TO_BB(mp, 1), &bp, NULL); if (error) { xfs_warn(mp, "log device read failed"); return error; @@ -460,22 +462,38 @@ xfs_mount_reset_sbqflags( return xfs_sync_sb(mp, false); } +static const char *const xfs_free_pool_name[] = { + [XC_FREE_BLOCKS] = "free blocks", + [XC_FREE_RTEXTENTS] = "free rt extents", + [XC_FREE_RTAVAILABLE] = "available rt extents", +}; + uint64_t -xfs_default_resblks(xfs_mount_t *mp) +xfs_default_resblks( + struct xfs_mount *mp, + enum xfs_free_counter ctr) { - uint64_t resblks; - - /* - * We default to 5% or 8192 fsbs of space reserved, whichever is - * smaller. This is intended to cover concurrent allocation - * transactions when we initially hit enospc. These each require a 4 - * block reservation. Hence by default we cover roughly 2000 concurrent - * allocation reservations. 
- */ - resblks = mp->m_sb.sb_dblocks; - do_div(resblks, 20); - resblks = min_t(uint64_t, resblks, 8192); - return resblks; + switch (ctr) { + case XC_FREE_BLOCKS: + /* + * Default to 5% or 8192 FSBs of space reserved, whichever is + * smaller. + * + * This is intended to cover concurrent allocation transactions + * when we initially hit ENOSPC. These each require a 4 block + * reservation. Hence by default we cover roughly 2000 + * concurrent allocation reservations. + */ + return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL); + case XC_FREE_RTEXTENTS: + case XC_FREE_RTAVAILABLE: + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) + return xfs_zoned_default_resblks(mp, ctr); + return 0; + default: + ASSERT(0); + return 0; + } } /* Ensure the summary counts are correct. */ @@ -542,7 +560,7 @@ xfs_check_summary_counts( * If we're mounting the rt volume after recovering the log, recompute * frextents from the rtbitmap file to fix the inconsistency. */ - if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { + if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) { error = xfs_rtalloc_reinit_frextents(mp); if (error) return error; @@ -620,6 +638,22 @@ xfs_mount_setup_inode_geom( xfs_ialloc_setup_geometry(mp); } +/* Mount the metadata directory tree root. */ +STATIC int +xfs_mount_setup_metadir( + struct xfs_mount *mp) +{ + int error; + + /* Load the metadata directory root inode into memory. */ + error = xfs_metafile_iget(mp, mp->m_sb.sb_metadirino, XFS_METAFILE_DIR, + &mp->m_metadirip); + if (error) + xfs_warn(mp, "Failed to load metadir root directory, error %d", + error); + return error; +} + /* Compute maximum possible height for per-AG btree types for this fs. */ static inline void xfs_agbtree_compute_maxlevels( @@ -632,6 +666,167 @@ xfs_agbtree_compute_maxlevels( mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels); } +/* Maximum atomic write IO size that the kernel allows. */ +static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp) +{ + return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT)); +} + +static inline unsigned int max_pow_of_two_factor(const unsigned int nr) +{ + return 1 << (ffs(nr) - 1); +} + +/* + * If the data device advertises atomic write support, limit the size of data + * device atomic writes to the greatest power-of-two factor of the AG size so + * that every atomic write unit aligns with the start of every AG. This is + * required so that the per-AG allocations for an atomic write will always be + * aligned compatibly with the alignment requirements of the storage. + * + * If the data device doesn't advertise atomic writes, then there are no + * alignment restrictions and the largest out-of-place write we can do + * ourselves is the number of blocks that user files can allocate from any AG. + */ +static inline xfs_extlen_t xfs_calc_perag_awu_max(struct xfs_mount *mp) +{ + if (mp->m_ddev_targp->bt_bdev_awu_min > 0) + return max_pow_of_two_factor(mp->m_sb.sb_agblocks); + return rounddown_pow_of_two(mp->m_ag_max_usable); +} + +/* + * Reflink on the realtime device requires rtgroups, and atomic writes require + * reflink. + * + * If the realtime device advertises atomic write support, limit the size of + * data device atomic writes to the greatest power-of-two factor of the rtgroup + * size so that every atomic write unit aligns with the start of every rtgroup. + * This is required so that the per-rtgroup allocations for an atomic write + * will always be aligned compatibly with the alignment requirements of the + * storage. 
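+ *
+ * Editorial illustration (not from this patch): max_pow_of_two_factor()
+ * isolates the lowest set bit of the group size, i.e. the largest power of
+ * two that divides it. For a group of 40960 blocks (2^13 * 5) it returns
+ * 8192, so atomic write units of up to 8192 blocks always line up with a
+ * group boundary.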
+ * + * If the rt device doesn't advertise atomic writes, then there are no + * alignment restrictions and the largest out-of-place write we can do + * ourselves is the number of blocks that user files can allocate from any + * rtgroup. + */ +static inline xfs_extlen_t xfs_calc_rtgroup_awu_max(struct xfs_mount *mp) +{ + struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; + + if (rgs->blocks == 0) + return 0; + if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_bdev_awu_min > 0) + return max_pow_of_two_factor(rgs->blocks); + return rounddown_pow_of_two(rgs->blocks); +} + +/* Compute the maximum atomic write unit size for each section. */ +static inline void +xfs_calc_atomic_write_unit_max( + struct xfs_mount *mp) +{ + struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG]; + struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; + + const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp); + const xfs_extlen_t max_ioend = xfs_reflink_max_atomic_cow(mp); + const xfs_extlen_t max_agsize = xfs_calc_perag_awu_max(mp); + const xfs_extlen_t max_rgsize = xfs_calc_rtgroup_awu_max(mp); + + ags->awu_max = min3(max_write, max_ioend, max_agsize); + rgs->awu_max = min3(max_write, max_ioend, max_rgsize); + + trace_xfs_calc_atomic_write_unit_max(mp, max_write, max_ioend, + max_agsize, max_rgsize); +} + +/* + * Try to set the atomic write maximum to a new value that we got from + * userspace via mount option. + */ +int +xfs_set_max_atomic_write_opt( + struct xfs_mount *mp, + unsigned long long new_max_bytes) +{ + const xfs_filblks_t new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes); + const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp); + const xfs_extlen_t max_group = + max(mp->m_groups[XG_TYPE_AG].blocks, + mp->m_groups[XG_TYPE_RTG].blocks); + const xfs_extlen_t max_group_write = + max(xfs_calc_perag_awu_max(mp), xfs_calc_rtgroup_awu_max(mp)); + int error; + + if (new_max_bytes == 0) + goto set_limit; + + ASSERT(max_write <= U32_MAX); + + /* generic_atomic_write_valid enforces power of two length */ + if (!is_power_of_2(new_max_bytes)) { + xfs_warn(mp, + "max atomic write size of %llu bytes is not a power of 2", + new_max_bytes); + return -EINVAL; + } + + if (new_max_bytes & mp->m_blockmask) { + xfs_warn(mp, + "max atomic write size of %llu bytes not aligned with fsblock", + new_max_bytes); + return -EINVAL; + } + + if (new_max_fsbs > max_write) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than max write size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_write) >> 10); + return -EINVAL; + } + + if (new_max_fsbs > max_group) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than allocation group size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_group) >> 10); + return -EINVAL; + } + + if (new_max_fsbs > max_group_write) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than max allocation group write size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_group_write) >> 10); + return -EINVAL; + } + +set_limit: + error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs); + if (error) { + xfs_warn(mp, + "cannot support completing atomic writes of %lluk", + new_max_bytes >> 10); + return error; + } + + xfs_calc_atomic_write_unit_max(mp); + mp->m_awu_max_bytes = new_max_bytes; + return 0; +} + +/* Compute maximum possible height for realtime btree types for this fs. 
*/ +static inline void +xfs_rtbtree_compute_maxlevels( + struct xfs_mount *mp) +{ + mp->m_rtbtree_maxlevels = max(mp->m_rtrmap_maxlevels, + mp->m_rtrefc_maxlevels); +} + /* * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct @@ -652,6 +847,7 @@ xfs_mountfs( uint quotamount = 0; uint quotaflags = 0; int error = 0; + int i; xfs_sb_mount_common(mp, sbp); @@ -700,9 +896,12 @@ xfs_mountfs( xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); xfs_mount_setup_inode_geom(mp); xfs_rmapbt_compute_maxlevels(mp); + xfs_rtrmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); xfs_agbtree_compute_maxlevels(mp); + xfs_rtbtree_compute_maxlevels(mp); /* * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks @@ -718,27 +917,15 @@ xfs_mountfs( /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; - super_set_sysfs_name_id(mp->m_super); - - error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, - NULL, mp->m_super->s_id); + error = xfs_mount_sysfs_init(mp); if (error) - goto out; - - error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, - &mp->m_kobj, "stats"); - if (error) - goto out_remove_sysfs; + goto out_remove_scrub_stats; xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs); - error = xfs_error_sysfs_init(mp); - if (error) - goto out_remove_scrub_stats; - error = xfs_errortag_init(mp); if (error) - goto out_remove_error_sysfs; + goto out_remove_sysfs; error = xfs_uuid_mount(mp); if (error) @@ -817,10 +1004,17 @@ xfs_mountfs( goto out_free_dir; } + error = xfs_initialize_rtgroups(mp, 0, sbp->sb_rgcount, + mp->m_sb.sb_rextents); + if (error) { + xfs_warn(mp, "Failed rtgroup init: %d", error); + goto out_free_perag; + } + if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) { xfs_warn(mp, "no log defined"); error = -EFSCORRUPTED; - goto out_free_perag; + goto out_free_rtgroup; } error = xfs_inodegc_register_shrinker(mp); @@ -828,6 +1022,13 @@ xfs_mountfs( goto out_fail_wait; /* + * If we're resuming quota status, pick up the preliminary qflags from + * the ondisk superblock so that we know if we should recover dquots. + */ + if (xfs_is_resuming_quotaon(mp)) + xfs_qm_resume_quotaon(mp); + + /* * Log's mount-time initialization. The first part of recovery can place * some items on the AIL, to be handled when recovery is finished or * cancelled. @@ -841,6 +1042,14 @@ xfs_mountfs( } /* + * If we're resuming quota status and recovered the log, re-sample the + * qflags from the ondisk superblock now that we've recovered it, just + * in case someone shut down enforcement just before a crash. + */ + if (xfs_clear_resuming_quotaon(mp) && xlog_recovery_needed(mp->m_log)) + xfs_qm_resume_quotaon(mp); + + /* * If logged xattrs are still enabled after log recovery finishes, then * they'll be available until unmount. Otherwise, turn them off. */ @@ -866,6 +1075,12 @@ xfs_mountfs( mp->m_features |= XFS_FEAT_ATTR2; } + if (xfs_has_metadir(mp)) { + error = xfs_mount_setup_metadir(mp); + if (error) + goto out_free_metadir; + } + /* * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. 
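[Editor's note] The option validation in xfs_set_max_atomic_write_opt() above is a ladder of cheap checks before the expensive transaction-reservation update. A minimal C sketch of the same ladder follows; the function name, parameters, and byte-based limits are hypothetical stand-ins, not the kernel's mount state:

#include <errno.h>
#include <stdint.h>

/*
 * Sketch of the validation order described above: zero resets to the
 * computed defaults, then power-of-two, fsblock alignment, and the size
 * ceilings are checked in turn. All limits are in bytes here.
 */
static int validate_awu_max(uint64_t new_max_bytes, uint64_t blocksize,
			    uint64_t max_write_bytes, uint64_t max_group_bytes,
			    uint64_t max_group_write_bytes)
{
	if (new_max_bytes == 0)
		return 0;			/* reset to computed defaults */
	if (new_max_bytes & (new_max_bytes - 1))
		return -EINVAL;			/* not a power of two */
	if (new_max_bytes & (blocksize - 1))
		return -EINVAL;			/* not fsblock aligned */
	if (new_max_bytes > max_write_bytes)
		return -EINVAL;			/* larger than max write size */
	if (new_max_bytes > max_group_bytes)
		return -EINVAL;			/* larger than any group */
	if (new_max_bytes > max_group_write_bytes)
		return -EINVAL;			/* larger than group write limit */
	return 0;
}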
@@ -876,7 +1091,7 @@ xfs_mountfs( xfs_warn(mp, "Failed to read root inode 0x%llx, error %d", sbp->sb_rootino, -error); - goto out_log_dealloc; + goto out_free_metadir; } ASSERT(rip != NULL); @@ -974,6 +1189,12 @@ xfs_mountfs( if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp)) xfs_log_clean(mp); + if (xfs_has_zoned(mp)) { + error = xfs_mount_zones(mp); + if (error) + goto out_rtunmount; + } + /* * Complete the quota initialisation, post-log-replay component. */ @@ -989,35 +1210,55 @@ xfs_mountfs( * privileged transactions. This is needed so that transaction * space required for critical operations can dip into this pool * when at ENOSPC. This is needed for operations like create with - * attr, unwritten extent conversion at ENOSPC, etc. Data allocations - * are not allowed to use this reserved space. + * attr, unwritten extent conversion at ENOSPC, garbage collection + * etc. Data allocations are not allowed to use this reserved space. * * This may drive us straight to ENOSPC on mount, but that implies * we were already there on the last unmount. Warn if this occurs. */ if (!xfs_is_readonly(mp)) { - error = xfs_reserve_blocks(mp, xfs_default_resblks(mp)); - if (error) - xfs_warn(mp, - "Unable to allocate reserve blocks. Continuing without reserve pool."); + for (i = 0; i < XC_FREE_NR; i++) { + error = xfs_reserve_blocks(mp, i, + xfs_default_resblks(mp, i)); + if (error) + xfs_warn(mp, +"Unable to allocate reserve blocks. Continuing without reserve pool for %s.", + xfs_free_pool_name[i]); + } /* Reserve AG blocks for future btree expansion. */ error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) goto out_agresv; + + xfs_zone_gc_start(mp); } + /* + * Pre-calculate atomic write unit max. This involves computations + * derived from transaction reservations, so we must do this after the + * log is fully initialized. + */ + error = xfs_set_max_atomic_write_opt(mp, mp->m_awu_max_bytes); + if (error) + goto out_agresv; + return 0; out_agresv: xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); + if (xfs_has_zoned(mp)) + xfs_unmount_zones(mp); out_rtunmount: xfs_rtunmount_inodes(mp); out_rele_rip: xfs_irele(rip); /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); + out_free_metadir: + if (mp->m_metadirip) + xfs_irele(mp->m_metadirip); /* * Inactivate all inodes that might still be in memory after a log @@ -1039,7 +1280,6 @@ xfs_mountfs( * quota inodes. 
*/ xfs_unmount_flush_inodes(mp); - out_log_dealloc: xfs_log_mount_cancel(mp); out_inodegc_shrinker: shrinker_free(mp->m_inodegc_shrinker); @@ -1047,6 +1287,8 @@ xfs_mountfs( if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_buftarg_drain(mp->m_logdev_targp); xfs_buftarg_drain(mp->m_ddev_targp); + out_free_rtgroup: + xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); out_free_perag: xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); out_free_dir: @@ -1055,13 +1297,10 @@ xfs_mountfs( xfs_uuid_unmount(mp); out_remove_errortag: xfs_errortag_del(mp); - out_remove_error_sysfs: - xfs_error_sysfs_del(mp); + out_remove_sysfs: + xfs_mount_sysfs_del(mp); out_remove_scrub_stats: xchk_stats_unregister(mp->m_scrub_stats); - xfs_sysfs_del(&mp->m_stats.xs_kobj); - out_remove_sysfs: - xfs_sysfs_del(&mp->m_kobj); out: return error; } @@ -1087,10 +1326,16 @@ xfs_unmountfs( xfs_inodegc_flush(mp); xfs_blockgc_stop(mp); + if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate)) + xfs_zone_gc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); + if (xfs_has_zoned(mp)) + xfs_unmount_zones(mp); xfs_rtunmount_inodes(mp); xfs_irele(mp->m_rootip); + if (mp->m_metadirip) + xfs_irele(mp->m_metadirip); xfs_unmount_flush_inodes(mp); @@ -1110,7 +1355,7 @@ xfs_unmountfs( * we only every apply deltas to the superblock and hence the incore * value does not matter.... */ - error = xfs_reserve_blocks(mp, 0); + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0); if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); @@ -1129,12 +1374,11 @@ xfs_unmountfs( xfs_errortag_clearall(mp); #endif shrinker_free(mp->m_inodegc_shrinker); + xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); xfs_errortag_del(mp); - xfs_error_sysfs_del(mp); xchk_stats_unregister(mp->m_scrub_stats); - xfs_sysfs_del(&mp->m_stats.xs_kobj); - xfs_sysfs_del(&mp->m_kobj); + xfs_mount_sysfs_del(mp); } /* @@ -1156,52 +1400,67 @@ xfs_fs_writable( return true; } +/* + * Estimate the amount of free space that is not available to userspace and is + * not explicitly reserved from the incore fdblocks. This includes: + * + * - The minimum number of blocks needed to support splitting a bmap btree + * - The blocks currently in use by the freespace btrees because they record + * the actual blocks that will fill per-AG metadata space reservations + */ +uint64_t +xfs_freecounter_unavailable( + struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + if (ctr != XC_FREE_BLOCKS) + return 0; + return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); +} + void xfs_add_freecounter( struct xfs_mount *mp, - struct percpu_counter *counter, + enum xfs_free_counter ctr, uint64_t delta) { - bool has_resv_pool = (counter == &mp->m_fdblocks); + struct xfs_freecounter *counter = &mp->m_free[ctr]; uint64_t res_used; /* * If the reserve pool is depleted, put blocks back into it first. * Most of the time the pool is full. 
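 *
 * Worked example (editorial, not part of this patch): with res_total = 8192
 * and res_avail = 8000, freeing delta = 300 blocks refills the 192 missing
 * reserve blocks first and adds only the remaining 108 to the percpu
 * counter.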
*/ - if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) { - percpu_counter_add(counter, delta); + if (likely(counter->res_avail == counter->res_total)) { + percpu_counter_add(&counter->count, delta); return; } spin_lock(&mp->m_sb_lock); - res_used = mp->m_resblks - mp->m_resblks_avail; + res_used = counter->res_total - counter->res_avail; if (res_used > delta) { - mp->m_resblks_avail += delta; + counter->res_avail += delta; } else { delta -= res_used; - mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(counter, delta); + counter->res_avail = counter->res_total; + percpu_counter_add(&counter->count, delta); } spin_unlock(&mp->m_sb_lock); } + +/* Adjust in-core free blocks or RT extents. */ int xfs_dec_freecounter( struct xfs_mount *mp, - struct percpu_counter *counter, + enum xfs_free_counter ctr, uint64_t delta, bool rsvd) { - int64_t lcounter; - uint64_t set_aside = 0; + struct xfs_freecounter *counter = &mp->m_free[ctr]; s32 batch; - bool has_resv_pool; - ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); - has_resv_pool = (counter == &mp->m_fdblocks); - if (rsvd) - ASSERT(has_resv_pool); + ASSERT(ctr < XC_FREE_NR); /* * Taking blocks away, need to be more accurate the closer we @@ -1211,7 +1470,7 @@ xfs_dec_freecounter( * then make everything serialise as we are real close to * ENOSPC. */ - if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH, + if (__percpu_counter_compare(&counter->count, 2 * XFS_FDBLOCKS_BATCH, XFS_FDBLOCKS_BATCH) < 0) batch = 1; else @@ -1228,34 +1487,34 @@ xfs_dec_freecounter( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - if (has_resv_pool) - set_aside = xfs_fdblocks_unavailable(mp); - percpu_counter_add_batch(counter, -((int64_t)delta), batch); - if (__percpu_counter_compare(counter, set_aside, - XFS_FDBLOCKS_BATCH) >= 0) { - /* we had space! */ - return 0; - } - - /* - * lock up the sb for dipping into reserves before releasing the space - * that took us to ENOSPC. - */ - spin_lock(&mp->m_sb_lock); - percpu_counter_add(counter, delta); - if (!has_resv_pool || !rsvd) - goto fdblocks_enospc; - - lcounter = (long long)mp->m_resblks_avail - delta; - if (lcounter >= 0) { - mp->m_resblks_avail = lcounter; + percpu_counter_add_batch(&counter->count, -((int64_t)delta), batch); + if (__percpu_counter_compare(&counter->count, + xfs_freecounter_unavailable(mp, ctr), + XFS_FDBLOCKS_BATCH) < 0) { + /* + * Lock up the sb for dipping into reserves before releasing the + * space that took us to ENOSPC. + */ + spin_lock(&mp->m_sb_lock); + percpu_counter_add(&counter->count, delta); + if (!rsvd) + goto fdblocks_enospc; + if (delta > counter->res_avail) { + if (ctr == XC_FREE_BLOCKS) + xfs_warn_once(mp, +"Reserve blocks depleted! Consider increasing reserve pool size."); + goto fdblocks_enospc; + } + counter->res_avail -= delta; + trace_xfs_freecounter_reserved(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); - return 0; } - xfs_warn_once(mp, -"Reserve blocks depleted! Consider increasing reserve pool size."); + + /* we had space! 
*/ + return 0; fdblocks_enospc: + trace_xfs_freecounter_enospc(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); return -ENOSPC; } @@ -1436,7 +1695,7 @@ xfs_mod_delalloc( if (XFS_IS_REALTIME_INODE(ip)) { percpu_counter_add_batch(&mp->m_delalloc_rtextents, - xfs_rtb_to_rtx(mp, data_delta), + xfs_blen_to_rtbxlen(mp, data_delta), XFS_DELALLOC_BATCH); if (!ind_delta) return; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 96496f39f551..d85084f9f317 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -72,6 +72,76 @@ struct xfs_inodegc { }; /* + * Container for each type of groups, used to look up individual groups and + * describes the geometry. + */ +struct xfs_groups { + struct xarray xa; + + /* + * Maximum capacity of the group in FSBs. + * + * Each group is laid out densely in the daddr space. For the + * degenerate case of a pre-rtgroups filesystem, the incore rtgroup + * pretends to have a zero-block and zero-blklog rtgroup. + */ + uint32_t blocks; + + /* + * Log(2) of the logical size of each group. + * + * Compared to the blocks field above this is rounded up to the next + * power of two, and thus lays out the xfs_fsblock_t/xfs_rtblock_t + * space sparsely with a hole from blocks to (1 << blklog) at the end + * of each group. + */ + uint8_t blklog; + + /* + * Zoned devices can have gaps beyond the usable capacity of a zone and + * the end in the LBA/daddr address space. In other words, the hardware + * equivalent to the RT groups already takes care of the power of 2 + * alignment for us. In this case the sparse FSB/RTB address space maps + * 1:1 to the device address space. + */ + bool has_daddr_gaps; + + /* + * Mask to extract the group-relative block number from a FSB. + * For a pre-rtgroups filesystem we pretend to have one very large + * rtgroup, so this mask must be 64-bit. + */ + uint64_t blkmask; + + /* + * Start of the first group in the device. This is used to support a + * RT device following the data device on the same block device for + * SMR hard drives. + */ + xfs_fsblock_t start_fsb; + + /* + * Maximum length of an atomic write for files stored in this + * collection of allocation groups, in fsblocks. + */ + xfs_extlen_t awu_max; +}; + +struct xfs_freecounter { + /* free blocks for general use: */ + struct percpu_counter count; + + /* total reserved blocks: */ + uint64_t res_total; + + /* available reserved blocks: */ + uint64_t res_avail; + + /* reserved blks @ remount,ro: */ + uint64_t res_saved; +}; + +/* * The struct xfsmount layout is optimised to separate read-mostly variables * from variables that are frequently modified. We put the read-mostly variables * first, then place all the other variables at the end. 
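[Editor's note] The blklog/blkmask fields described above exist so that splitting a sparse block number into a (group, offset) pair is a shift and a mask rather than a 64-bit division. A small C sketch under assumed names (none of these identifiers are the kernel's):

#include <stdint.h>

struct groups_geom {
	uint32_t blocks;	/* usable blocks per group */
	uint8_t  blklog;	/* log2 of the group stride, rounded up */
	uint64_t blkmask;	/* (1ULL << blklog) - 1 */
};

/* Which group a filesystem block number falls in. */
static inline uint32_t fsb_to_gno(const struct groups_geom *g, uint64_t fsb)
{
	return (uint32_t)(fsb >> g->blklog);
}

/* Offset within the group; blocks..(1 << blklog) - 1 is the unused hole. */
static inline uint64_t fsb_to_gbno(const struct groups_geom *g, uint64_t fsb)
{
	return fsb & g->blkmask;
}

The power-of-two stride trades some sparse address space (the per-group hole) for shift-and-mask arithmetic on every block number translation.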
@@ -85,27 +155,20 @@ typedef struct xfs_mount { struct super_block *m_super; struct xfs_ail *m_ail; /* fs active log item list */ struct xfs_buf *m_sb_bp; /* buffer for superblock */ + struct xfs_buf *m_rtsb_bp; /* realtime superblock */ char *m_rtname; /* realtime device name */ char *m_logname; /* external log device name */ struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ struct xlog *m_log; /* log specific stuff */ - struct xfs_inode *m_rbmip; /* pointer to bitmap inode */ - struct xfs_inode *m_rsumip; /* pointer to summary inode */ struct xfs_inode *m_rootip; /* pointer to root directory */ + struct xfs_inode *m_metadirip; /* ptr to metadata directory */ + struct xfs_inode *m_rtdirip; /* ptr to realtime metadir */ struct xfs_quotainfo *m_quotainfo; /* disk quota information */ struct xfs_buftarg *m_ddev_targp; /* data device */ struct xfs_buftarg *m_logdev_targp;/* log device */ struct xfs_buftarg *m_rtdev_targp; /* rt device */ void __percpu *m_inodegc; /* percpu inodegc structures */ - - /* - * Optional cache of rt summary level per bitmap block with the - * invariant that m_rsum_cache[bbno] > the maximum i for which - * rsum[i][bbno] != 0, or 0 if rsum[i][bbno] == 0 for all i. - * Reads and writes are serialized by the rsumip inode lock. - */ - uint8_t *m_rsum_cache; struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_unwritten_workqueue; @@ -120,22 +183,31 @@ typedef struct xfs_mount { uint8_t m_agno_log; /* log #ag's */ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ int8_t m_rtxblklog; /* log2 of rextsize, if possible */ + uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ - uint m_blockwmask; /* blockwsize-1 */ + /* number of rt extents per rt bitmap block if rtgroups enabled */ + unsigned int m_rtx_per_rbmblock; uint m_alloc_mxr[2]; /* max alloc btree records */ uint m_alloc_mnr[2]; /* min alloc btree records */ uint m_bmap_dmxr[2]; /* max bmap btree records */ uint m_bmap_dmnr[2]; /* min bmap btree records */ uint m_rmap_mxr[2]; /* max rmap btree records */ uint m_rmap_mnr[2]; /* min rmap btree records */ + uint m_rtrmap_mxr[2]; /* max rtrmap btree records */ + uint m_rtrmap_mnr[2]; /* min rtrmap btree records */ uint m_refc_mxr[2]; /* max refc btree records */ uint m_refc_mnr[2]; /* min refc btree records */ + uint m_rtrefc_mxr[2]; /* max rtrefc btree records */ + uint m_rtrefc_mnr[2]; /* min rtrefc btree records */ uint m_alloc_maxlevels; /* max alloc btree levels */ uint m_bm_maxlevels[2]; /* max bmap btree levels */ uint m_rmap_maxlevels; /* max rmap btree levels */ + uint m_rtrmap_maxlevels; /* max rtrmap btree level */ uint m_refc_maxlevels; /* max refcount btree level */ + uint m_rtrefc_maxlevels; /* max rtrefc btree level */ unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */ + unsigned int m_rtbtree_maxlevels; /* max level of all rt btrees */ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ uint m_alloc_set_aside; /* space we can't use */ uint m_ag_max_usable; /* max space per AG */ @@ -146,7 +218,7 @@ typedef struct xfs_mount { uint m_allocsize_blocks; /* min write size blocks */ int m_logbufs; /* number of log buffers */ int m_logbsize; /* size of each log buffer */ - uint m_rsumlevels; /* rt summary levels */ + unsigned int m_rsumlevels; /* rt summary levels */ xfs_filblks_t m_rsumblocks; /* size of rt summary, FSBs */ int m_fixedfsid[2]; 
/* unchanged for life of FS */ uint m_qflags; /* quota status flags */ @@ -162,6 +234,12 @@ typedef struct xfs_mount { bool m_fail_unmount; bool m_finobt_nores; /* no per-AG finobt resv. */ bool m_update_sb; /* sb needs update in mount */ + unsigned int m_max_open_zones; + unsigned int m_zonegc_low_space; + struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */ + + /* max_atomic_write mount option value */ + unsigned long long m_awu_max_bytes; /* * Bitsets of per-fs metadata that have been checked and/or are sick. @@ -186,8 +264,8 @@ typedef struct xfs_mount { spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */ struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ - struct percpu_counter m_fdblocks; /* free block counter */ - struct percpu_counter m_frextents; /* free rt extent counter */ + + struct xfs_freecounter m_free[XC_FREE_NR]; /* * Count of data device blocks reserved for delayed allocations, @@ -208,11 +286,9 @@ typedef struct xfs_mount { */ atomic64_t m_allocbt_blks; - struct xarray m_perags; /* per-ag accounting info */ - uint64_t m_resblks; /* total reserved blocks */ - uint64_t m_resblks_avail;/* available reserved blocks */ - uint64_t m_resblks_save; /* reserved blks @ remount,ro */ + struct xfs_groups m_groups[XG_TYPE_MAX]; struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct xfs_zone_info *m_zone_info; /* zone allocator information */ struct dentry *m_debugfs; /* debugfs parent */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; @@ -222,8 +298,15 @@ typedef struct xfs_mount { #ifdef CONFIG_XFS_ONLINE_SCRUB_STATS struct xchk_stats *m_scrub_stats; #endif + struct xfs_kobj m_zoned_kobj; xfs_agnumber_t m_agfrotor; /* last ag where space found */ atomic_t m_agirotor; /* last ag dir inode alloced */ + atomic_t m_rtgrotor; /* last rtgroup rtpicked */ + + struct mutex m_metafile_resv_lock; + uint64_t m_metafile_resv_target; + uint64_t m_metafile_resv_used; + uint64_t m_metafile_resv_avail; /* Memory shrinker to throttle and reprioritize inodegc */ struct shrinker *m_inodegc_shrinker; @@ -298,8 +381,11 @@ typedef struct xfs_mount { #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ #define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */ +#define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */ +#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */ /* Mount features */ +#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ @@ -319,7 +405,7 @@ typedef struct xfs_mount { #define XFS_FEAT_NOUUID (1ULL << 63) /* ignore uuid during mount */ #define __XFS_HAS_FEAT(name, NAME) \ -static inline bool xfs_has_ ## name (struct xfs_mount *mp) \ +static inline bool xfs_has_ ## name (const struct xfs_mount *mp) \ { \ return mp->m_features & XFS_FEAT_ ## NAME; \ } @@ -353,6 +439,45 @@ __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) __XFS_HAS_FEAT(large_extent_counts, NREXT64) __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE) +__XFS_HAS_FEAT(metadir, METADIR) +__XFS_HAS_FEAT(zoned, ZONED) +__XFS_HAS_FEAT(nolifetime, NOLIFETIME) + +static inline bool xfs_has_rtgroups(const struct xfs_mount *mp) +{ + /* all metadir file systems also allow 
rtgroups */ + return xfs_has_metadir(mp); +} + +static inline bool xfs_has_rtsb(const struct xfs_mount *mp) +{ + /* all rtgroups filesystems with an rt section have an rtsb */ + return xfs_has_rtgroups(mp) && + xfs_has_realtime(mp) && + !xfs_has_zoned(mp); +} + +static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp) +{ + return xfs_has_rtgroups(mp) && xfs_has_realtime(mp) && + xfs_has_rmapbt(mp); +} + +static inline bool xfs_has_rtreflink(const struct xfs_mount *mp) +{ + return xfs_has_metadir(mp) && xfs_has_realtime(mp) && + xfs_has_reflink(mp); +} + +static inline bool xfs_has_nonzoned(const struct xfs_mount *mp) +{ + return !xfs_has_zoned(mp); +} + +static inline bool xfs_can_sw_atomic_write(struct xfs_mount *mp) +{ + return xfs_has_reflink(mp); +} /* * Some features are always on for v5 file systems, allow the compiler to @@ -433,18 +558,26 @@ __XFS_HAS_FEAT(nouuid, NOUUID) */ #define XFS_OPSTATE_BLOCKGC_ENABLED 6 -/* Kernel has logged a warning about online fsck being used on this fs. */ -#define XFS_OPSTATE_WARNED_SCRUB 7 /* Kernel has logged a warning about shrink being used on this fs. */ -#define XFS_OPSTATE_WARNED_SHRINK 8 +#define XFS_OPSTATE_WARNED_SHRINK 9 /* Kernel has logged a warning about logged xattr updates being used. */ -#define XFS_OPSTATE_WARNED_LARP 9 +#define XFS_OPSTATE_WARNED_LARP 10 /* Mount time quotacheck is running */ -#define XFS_OPSTATE_QUOTACHECK_RUNNING 10 +#define XFS_OPSTATE_QUOTACHECK_RUNNING 11 /* Do we want to clear log incompat flags? */ -#define XFS_OPSTATE_UNSET_LOG_INCOMPAT 11 +#define XFS_OPSTATE_UNSET_LOG_INCOMPAT 12 /* Filesystem can use logged extended attributes */ -#define XFS_OPSTATE_USE_LARP 12 +#define XFS_OPSTATE_USE_LARP 13 +/* Kernel has logged a warning about blocksize > pagesize on this fs. */ +#define XFS_OPSTATE_WARNED_LBS 14 +/* Kernel has logged a warning about metadata dirs being used on this fs. */ +#define XFS_OPSTATE_WARNED_METADIR 17 +/* Filesystem should use qflags to determine quotaon status */ +#define XFS_OPSTATE_RESUMING_QUOTAON 18 +/* Kernel has logged a warning about zoned RT device being used on this fs. 
*/ +#define XFS_OPSTATE_WARNED_ZONED 19 +/* (Zoned) GC is in progress */ +#define XFS_OPSTATE_ZONEGC_RUNNING 20 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -469,11 +602,27 @@ __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) #ifdef CONFIG_XFS_QUOTA __XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING) +__XFS_IS_OPSTATE(resuming_quotaon, RESUMING_QUOTAON) #else -# define xfs_is_quotacheck_running(mp) (false) -#endif +static inline bool xfs_is_quotacheck_running(struct xfs_mount *mp) +{ + return false; +} +static inline bool xfs_is_resuming_quotaon(struct xfs_mount *mp) +{ + return false; +} +static inline void xfs_set_resuming_quotaon(struct xfs_mount *m) +{ +} +static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp) +{ + return false; +} +#endif /* CONFIG_XFS_QUOTA */ __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT) __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP) +__XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING) static inline bool xfs_should_warn(struct xfs_mount *mp, long nr) @@ -489,7 +638,6 @@ xfs_should_warn(struct xfs_mount *mp, long nr) { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ - { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \ @@ -543,7 +691,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } extern void xfs_uuid_table_free(void); -extern uint64_t xfs_default_resblks(xfs_mount_t *mp); +uint64_t xfs_default_resblks(struct xfs_mount *mp, + enum xfs_free_counter ctr); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); @@ -556,45 +705,74 @@ extern void xfs_unmountfs(xfs_mount_t *); */ #define XFS_FDBLOCKS_BATCH 1024 +uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp, + enum xfs_free_counter ctr); + /* - * Estimate the amount of free space that is not available to userspace and is - * not explicitly reserved from the incore fdblocks. This includes: - * - * - The minimum number of blocks needed to support splitting a bmap btree - * - The blocks currently in use by the freespace btrees because they record - * the actual blocks that will fill per-AG metadata space reservations + * Sum up the freecount, but never return negative values. + */ +static inline s64 xfs_sum_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_sum_positive(&mp->m_free[ctr].count); +} + +/* + * Same as above, but does return negative values. Mostly useful for + * special cases like repair and tracing. + */ +static inline s64 xfs_sum_freecounter_raw(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_sum(&mp->m_free[ctr].count); +} + +/* + * This just provides an estimate without the cpu-local updates; use + * xfs_sum_freecounter for the exact value.
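+ *
+ * (Editorial note: the percpu_counter_read_positive() used here also clamps
+ * a transiently negative count to zero.)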
*/ -static inline uint64_t -xfs_fdblocks_unavailable( - struct xfs_mount *mp) +static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_read_positive(&mp->m_free[ctr].count); +} + +static inline int xfs_compare_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr, s64 rhs, s32 batch) +{ + return __percpu_counter_compare(&mp->m_free[ctr].count, rhs, batch); +} + +static inline void xfs_set_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr, uint64_t val) { - return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); + percpu_counter_set(&mp->m_free[ctr].count, val); } -int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, +int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta, bool rsvd); -void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, +void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta); static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta, bool reserved) { - return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved); + return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved); } static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta) { - xfs_add_freecounter(mp, &mp->m_fdblocks, delta); + xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta); } static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta) { - return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false); + return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false); } static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta) { - xfs_add_freecounter(mp, &mp->m_frextents, delta); + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta); } extern int xfs_readsb(xfs_mount_t *, int); @@ -616,5 +794,12 @@ int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature); bool xfs_clear_incompat_log_features(struct xfs_mount *mp); void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta, int64_t ind_delta); +static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta) +{ + percpu_counter_add(&mp->m_delalloc_blks, delta); +} + +int xfs_set_max_atomic_write_opt(struct xfs_mount *mp, + unsigned long long new_max_bytes); #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index d0f5b403bdbe..08443ceec329 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -414,6 +414,8 @@ xfs_mru_cache_destroy( * To insert an element, call xfs_mru_cache_insert() with the data store, the * element's key and the client data pointer. This function returns 0 on * success or ENOMEM if memory for the data element couldn't be allocated. + * + * The passed in elem is freed through the per-cache free_func on failure. 
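+ *
+ * (Editorial note: ownership of @elem therefore passes to the cache even
+ * when insertion fails; callers must not free or reuse it on error.)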
*/ int xfs_mru_cache_insert( @@ -421,14 +423,15 @@ xfs_mru_cache_insert( unsigned long key, struct xfs_mru_cache_elem *elem) { - int error; + int error = -EINVAL; ASSERT(mru && mru->lists); if (!mru || !mru->lists) - return -EINVAL; + goto out_free; + error = -ENOMEM; if (radix_tree_preload(GFP_KERNEL)) - return -ENOMEM; + goto out_free; INIT_LIST_HEAD(&elem->list_node); elem->key = key; @@ -440,6 +443,12 @@ xfs_mru_cache_insert( _xfs_mru_cache_list_insert(mru, elem); spin_unlock(&mru->lock); + if (error) + goto out_free; + return 0; + +out_free: + mru->free_func(mru->data, elem); return error; } diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index fa50e5308292..3545dc1d953c 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -19,6 +19,9 @@ #include "xfs_rtalloc.h" #include "xfs_trans.h" #include "xfs_ag.h" +#include "xfs_notify_failure.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" #include <linux/mm.h> #include <linux/dax.h> @@ -124,7 +127,7 @@ xfs_dax_notify_failure_freeze( struct super_block *sb = mp->m_super; int error; - error = freeze_super(sb, FREEZE_HOLDER_KERNEL); + error = freeze_super(sb, FREEZE_HOLDER_KERNEL, NULL); if (error) xfs_emerg(mp, "already frozen by kernel, err=%d", error); @@ -140,7 +143,7 @@ xfs_dax_notify_failure_thaw( int error; if (kernel_frozen) { - error = thaw_super(sb, FREEZE_HOLDER_KERNEL); + error = thaw_super(sb, FREEZE_HOLDER_KERNEL, NULL); if (error) xfs_emerg(mp, "still frozen after notify failure, err=%d", error); @@ -150,27 +153,119 @@ xfs_dax_notify_failure_thaw( * Also thaw userspace call anyway because the device is about to be * removed immediately. */ - thaw_super(sb, FREEZE_HOLDER_USERSPACE); + thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL); } static int -xfs_dax_notify_ddev_failure( +xfs_dax_translate_range( + struct xfs_buftarg *btp, + u64 offset, + u64 len, + xfs_daddr_t *daddr, + uint64_t *bblen) +{ + u64 dev_start = btp->bt_dax_part_off; + u64 dev_len = bdev_nr_bytes(btp->bt_bdev); + u64 dev_end = dev_start + dev_len - 1; + + /* Notify failure on the whole device. */ + if (offset == 0 && len == U64_MAX) { + offset = dev_start; + len = dev_len; + } + + /* Ignore the range out of filesystem area */ + if (offset + len - 1 < dev_start) + return -ENXIO; + if (offset > dev_end) + return -ENXIO; + + /* Calculate the real range when it touches the boundary */ + if (offset > dev_start) + offset -= dev_start; + else { + len -= dev_start - offset; + offset = 0; + } + if (offset + len - 1 > dev_end) + len = dev_end - offset + 1; + + *daddr = BTOBB(offset); + *bblen = BTOBB(len); + return 0; +} + +static int +xfs_dax_notify_logdev_failure( struct xfs_mount *mp, - xfs_daddr_t daddr, - xfs_daddr_t bblen, + u64 offset, + u64 len, int mf_flags) { + xfs_daddr_t daddr; + uint64_t bblen; + int error; + + /* + * Return ENXIO instead of shutting down the filesystem if the failed + * region is beyond the end of the log. + */ + error = xfs_dax_translate_range(mp->m_logdev_targp, + offset, len, &daddr, &bblen); + if (error) + return error; + + /* + * In the pre-remove case the failure notification is attempting to + * trigger a force unmount. The expectation is that the device is + * still present, but its removal is in progress and can not be + * cancelled, proceed with accessing the log device. 
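+ *
+ * (Editorial note: returning 0 below tells the memory-failure code that the
+ * notification was handled, so device teardown can continue; every other
+ * path shuts the filesystem down instead.)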
+ */ + if (mf_flags & MF_MEM_PRE_REMOVE) + return 0; + + xfs_err(mp, "ondisk log corrupt, shutting down fs!"); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; +} + +static int +xfs_dax_notify_dev_failure( + struct xfs_mount *mp, + u64 offset, + u64 len, + int mf_flags, + enum xfs_group_type type) +{ struct xfs_failure_info notify = { .mf_flags = mf_flags }; struct xfs_trans *tp = NULL; struct xfs_btree_cur *cur = NULL; - struct xfs_buf *agf_bp = NULL; int error = 0; bool kernel_frozen = false; - xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); - xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, - daddr + bblen - 1); - xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); + uint32_t start_gno, end_gno; + xfs_fsblock_t start_bno, end_bno; + xfs_daddr_t daddr; + uint64_t bblen; + struct xfs_group *xg = NULL; + + if (!xfs_has_rmapbt(mp)) { + xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); + return -EOPNOTSUPP; + } + + error = xfs_dax_translate_range(type == XG_TYPE_RTG ? + mp->m_rtdev_targp : mp->m_ddev_targp, + offset, len, &daddr, &bblen); + if (error) + return error; + + if (type == XG_TYPE_RTG) { + start_bno = xfs_daddr_to_rtb(mp, daddr); + end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1); + } else { + start_bno = XFS_DADDR_TO_FSB(mp, daddr); + end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1); + } if (mf_flags & MF_MEM_PRE_REMOVE) { xfs_info(mp, "Device is about to be removed!"); @@ -189,46 +284,58 @@ xfs_dax_notify_ddev_failure( if (error) goto out; - for (; agno <= end_agno; agno++) { + start_gno = xfs_fsb_to_gno(mp, start_bno, type); + end_gno = xfs_fsb_to_gno(mp, end_bno, type); + while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) { + struct xfs_buf *agf_bp = NULL; + struct xfs_rtgroup *rtg = NULL; struct xfs_rmap_irec ri_low = { }; struct xfs_rmap_irec ri_high; - struct xfs_agf *agf; - struct xfs_perag *pag; - xfs_agblock_t range_agend; - pag = xfs_perag_get(mp, agno); - error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); - if (error) { - xfs_perag_put(pag); - break; - } + if (type == XG_TYPE_AG) { + struct xfs_perag *pag = to_perag(xg); - cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); + error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); + if (error) { + xfs_perag_put(pag); + break; + } + + cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); + } else { + rtg = to_rtg(xg); + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + cur = xfs_rtrmapbt_init_cursor(tp, rtg); + } /* * Set the rmap range from ri_low to ri_high, which represents * a [start, end] where we looking for the files or metadata. 
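 *
 * (Editorial note: ri_low is zeroed, with rm_startblock raised only in the
 * first group, and ri_high starts as all-ones, trimmed back to the end
 * block in the last group; within that window the query therefore matches
 * every possible owner and offset key.)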
*/ memset(&ri_high, 0xFF, sizeof(ri_high)); - ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno); - if (agno == end_agno) - ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); + if (xg->xg_gno == start_gno) + ri_low.rm_startblock = + xfs_fsb_to_gbno(mp, start_bno, type); + if (xg->xg_gno == end_gno) + ri_high.rm_startblock = + xfs_fsb_to_gbno(mp, end_bno, type); - agf = agf_bp->b_addr; - range_agend = min(be32_to_cpu(agf->agf_length) - 1, - ri_high.rm_startblock); notify.startblock = ri_low.rm_startblock; - notify.blockcount = range_agend + 1 - ri_low.rm_startblock; + notify.blockcount = min(xg->xg_block_count, + ri_high.rm_startblock + 1) - + ri_low.rm_startblock; error = xfs_rmap_query_range(cur, &ri_low, &ri_high, xfs_dax_failure_fn, ¬ify); xfs_btree_del_cursor(cur, error); - xfs_trans_brelse(tp, agf_bp); - xfs_perag_put(pag); - if (error) + if (agf_bp) + xfs_trans_brelse(tp, agf_bp); + if (rtg) + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + if (error) { + xfs_group_put(xg); break; - - fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0); + } } xfs_trans_cancel(tp); @@ -263,67 +370,20 @@ xfs_dax_notify_failure( int mf_flags) { struct xfs_mount *mp = dax_holder(dax_dev); - u64 ddev_start; - u64 ddev_end; if (!(mp->m_super->s_flags & SB_BORN)) { xfs_warn(mp, "filesystem is not ready for notify_failure()!"); return -EIO; } - if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) { - xfs_debug(mp, - "notify_failure() not supported on realtime device!"); - return -EOPNOTSUPP; - } - - if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && - mp->m_logdev_targp != mp->m_ddev_targp) { - /* - * In the pre-remove case the failure notification is attempting - * to trigger a force unmount. The expectation is that the - * device is still present, but its removal is in progress and - * can not be cancelled, proceed with accessing the log device. - */ - if (mf_flags & MF_MEM_PRE_REMOVE) - return 0; - xfs_err(mp, "ondisk log corrupt, shutting down fs!"); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); - return -EFSCORRUPTED; - } - - if (!xfs_has_rmapbt(mp)) { - xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); - return -EOPNOTSUPP; - } - - ddev_start = mp->m_ddev_targp->bt_dax_part_off; - ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; - - /* Notify failure on the whole device. */ - if (offset == 0 && len == U64_MAX) { - offset = ddev_start; - len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev); - } - - /* Ignore the range out of filesystem area */ - if (offset + len - 1 < ddev_start) - return -ENXIO; - if (offset > ddev_end) - return -ENXIO; - - /* Calculate the real range when it touches the boundary */ - if (offset > ddev_start) - offset -= ddev_start; - else { - len -= ddev_start - offset; - offset = 0; + if (mp->m_logdev_targp != mp->m_ddev_targp && + mp->m_logdev_targp->bt_daxdev == dax_dev) { + return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags); } - if (offset + len - 1 > ddev_end) - len = ddev_end - offset + 1; - return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len), - mf_flags); + return xfs_dax_notify_dev_failure(mp, offset, len, mf_flags, + (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) ? 
+ XG_TYPE_RTG : XG_TYPE_AG); } const struct dax_holder_operations xfs_dax_holder_operations = { diff --git a/fs/xfs/xfs_notify_failure.h b/fs/xfs/xfs_notify_failure.h new file mode 100644 index 000000000000..8d08ec29dd29 --- /dev/null +++ b/fs/xfs/xfs_notify_failure.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_NOTIFY_FAILURE_H__ +#define __XFS_NOTIFY_FAILURE_H__ + +extern const struct dax_holder_operations xfs_dax_holder_operations; + +#endif /* __XFS_NOTIFY_FAILURE_H__ */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 23d16186e1a3..afe7497012d4 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -58,9 +58,6 @@ xfs_fs_get_uuid( { struct xfs_mount *mp = XFS_M(sb); - xfs_notice_once(mp, -"Using experimental pNFS feature, use at your own risk!"); - if (*len < sizeof(uuid_t)) return -EINVAL; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 7e2307921deb..417439b58785 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -27,6 +27,9 @@ #include "xfs_ialloc.h" #include "xfs_log_priv.h" #include "xfs_health.h" +#include "xfs_da_format.h" +#include "xfs_metafile.h" +#include "xfs_rtgroup.h" /* * The global quota manager. There is only one of these for the entire @@ -37,7 +40,6 @@ STATIC int xfs_qm_init_quotainos(struct xfs_mount *mp); STATIC int xfs_qm_init_quotainfo(struct xfs_mount *mp); -STATIC void xfs_qm_destroy_quotainos(struct xfs_quotainfo *qi); STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); /* * We use the batch lookup interface to iterate over the dquots as it @@ -146,17 +148,29 @@ xfs_qm_dqpurge( * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_use_attached_buf(dqp, &bp); + if (error == -EAGAIN) { + xfs_dqfunlock(dqp); + dqp->q_flags &= ~XFS_DQFLAG_FREEING; + goto out_unlock; + } + if (!bp) + goto out_funlock; + + /* + * dqflush completes dqflock on error, and the bwrite ioend + * does it on success. + */ + error = xfs_qm_dqflush(dqp, bp); if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); - } else if (error == -EAGAIN) { - dqp->q_flags &= ~XFS_DQFLAG_FREEING; - goto out_unlock; } xfs_dqflock(dqp); } + xfs_dquot_detach_buf(dqp); +out_funlock: ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(xlog_is_shutdown(dqp->q_logitem.qli_item.li_log) || !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); @@ -208,6 +222,43 @@ xfs_qm_unmount( } } +static void +xfs_qm_unmount_rt( + struct xfs_mount *mp) +{ + struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, 0); + + if (!rtg) + return; + if (rtg_bitmap(rtg)) + xfs_qm_dqdetach(rtg_bitmap(rtg)); + if (rtg_summary(rtg)) + xfs_qm_dqdetach(rtg_summary(rtg)); + xfs_rtgroup_rele(rtg); +} + +STATIC void +xfs_qm_destroy_quotainos( + struct xfs_quotainfo *qi) +{ + if (qi->qi_uquotaip) { + xfs_irele(qi->qi_uquotaip); + qi->qi_uquotaip = NULL; /* paranoia */ + } + if (qi->qi_gquotaip) { + xfs_irele(qi->qi_gquotaip); + qi->qi_gquotaip = NULL; + } + if (qi->qi_pquotaip) { + xfs_irele(qi->qi_pquotaip); + qi->qi_pquotaip = NULL; + } + if (qi->qi_dirip) { + xfs_irele(qi->qi_dirip); + qi->qi_dirip = NULL; + } +} + /* * Called from the vfsops layer. 
*/ @@ -221,28 +272,19 @@ xfs_qm_unmount_quotas( */ ASSERT(mp->m_rootip); xfs_qm_dqdetach(mp->m_rootip); - if (mp->m_rbmip) - xfs_qm_dqdetach(mp->m_rbmip); - if (mp->m_rsumip) - xfs_qm_dqdetach(mp->m_rsumip); + + /* + * For pre-RTG file systems, the RT inodes have quotas attached, + * detach them now. + */ + if (!xfs_has_rtgroups(mp)) + xfs_qm_unmount_rt(mp); /* * Release the quota inodes. */ - if (mp->m_quotainfo) { - if (mp->m_quotainfo->qi_uquotaip) { - xfs_irele(mp->m_quotainfo->qi_uquotaip); - mp->m_quotainfo->qi_uquotaip = NULL; - } - if (mp->m_quotainfo->qi_gquotaip) { - xfs_irele(mp->m_quotainfo->qi_gquotaip); - mp->m_quotainfo->qi_gquotaip = NULL; - } - if (mp->m_quotainfo->qi_pquotaip) { - xfs_irele(mp->m_quotainfo->qi_pquotaip); - mp->m_quotainfo->qi_pquotaip = NULL; - } - } + if (mp->m_quotainfo) + xfs_qm_destroy_quotainos(mp->m_quotainfo); } STATIC int @@ -302,6 +344,8 @@ xfs_qm_need_dqattach( return false; if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return false; + if (xfs_is_metadir_inode(ip)) + return false; return true; } @@ -324,6 +368,7 @@ xfs_qm_dqattach_locked( return 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + ASSERT(!xfs_is_metadir_inode(ip)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER, @@ -383,6 +428,8 @@ void xfs_qm_dqdetach( xfs_inode_t *ip) { + if (xfs_is_metadir_inode(ip)) + return; if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot)) return; @@ -412,9 +459,8 @@ static enum lru_status xfs_qm_dquot_isolate( struct list_head *item, struct list_lru_one *lru, - spinlock_t *lru_lock, void *arg) - __releases(lru_lock) __acquires(lru_lock) + __releases(&lru->lock) __acquires(&lru->lock) { struct xfs_dquot *dqp = container_of(item, struct xfs_dquot, q_lru); @@ -460,9 +506,19 @@ xfs_qm_dquot_isolate( trace_xfs_dqreclaim_dirty(dqp); /* we have to drop the LRU lock to flush the dquot */ - spin_unlock(lru_lock); + spin_unlock(&lru->lock); + + error = xfs_dquot_use_attached_buf(dqp, &bp); + if (!bp || error == -EAGAIN) { + xfs_dqfunlock(dqp); + goto out_unlock_dirty; + } - error = xfs_qm_dqflush(dqp, &bp); + /* + * dqflush completes dqflock on error, and the delwri ioend + * does it on success. 
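[Editorial note] The isolate-callback hunk above tracks a list_lru API change: the per-node lock moved into struct list_lru_one, so the callback no longer receives a separate spinlock argument, and a LRU_RETRY return is now made with the lock already dropped (note the removed spin_lock before return). A sketch of the resulting shape; the can_reclaim() predicate is hypothetical:

	static enum lru_status example_isolate(struct list_head *item,
			struct list_lru_one *lru, void *arg)
			__releases(&lru->lock) __acquires(&lru->lock)
	{
		if (!can_reclaim(item))		/* hypothetical cheap check */
			return LRU_SKIP;	/* lock still held */

		/* blocking work requires dropping the embedded lock ... */
		spin_unlock(&lru->lock);
		/* ... and, per this series, a retry is reported unlocked */
		return LRU_RETRY;
	}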
+ */ + error = xfs_qm_dqflush(dqp, bp); if (error) goto out_unlock_dirty; @@ -470,6 +526,8 @@ xfs_qm_dquot_isolate( xfs_buf_relse(bp); goto out_unlock_dirty; } + + xfs_dquot_detach_buf(dqp); xfs_dqfunlock(dqp); /* @@ -496,7 +554,6 @@ out_unlock_dirty: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); xfs_dqunlock(dqp); - spin_lock(lru_lock); return LRU_RETRY; } @@ -616,6 +673,163 @@ xfs_qm_init_timelimits( xfs_qm_dqdestroy(dqp); } +static int +xfs_qm_load_metadir_qinos( + struct xfs_mount *mp, + struct xfs_quotainfo *qi) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + error = xfs_dqinode_load_parent(tp, &qi->qi_dirip); + if (error == -ENOENT) { + /* no quota dir directory, but we'll create one later */ + error = 0; + goto out_trans; + } + if (error) + goto out_trans; + + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_USER, + &qi->qi_uquotaip); + if (error && error != -ENOENT) + goto out_trans; + } + + if (XFS_IS_GQUOTA_ON(mp)) { + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_GROUP, + &qi->qi_gquotaip); + if (error && error != -ENOENT) + goto out_trans; + } + + if (XFS_IS_PQUOTA_ON(mp)) { + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_PROJ, + &qi->qi_pquotaip); + if (error && error != -ENOENT) + goto out_trans; + } + + error = 0; +out_trans: + xfs_trans_cancel(tp); + return error; +} + +/* Create quota inodes in the metadata directory tree. */ +STATIC int +xfs_qm_create_metadir_qinos( + struct xfs_mount *mp, + struct xfs_quotainfo *qi) +{ + int error; + + if (!qi->qi_dirip) { + error = xfs_dqinode_mkdir_parent(mp, &qi->qi_dirip); + if (error && error != -EEXIST) + return error; + /* + * If the /quotas dirent points to an inode that isn't + * loadable, qi_dirip will be NULL but mkdir_parent will return + * -EEXIST. In this case the metadir is corrupt, so bail out. + */ + if (XFS_IS_CORRUPT(mp, qi->qi_dirip == NULL)) + return -EFSCORRUPTED; + } + + if (XFS_IS_UQUOTA_ON(mp) && !qi->qi_uquotaip) { + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_USER, &qi->qi_uquotaip); + if (error) + return error; + } + + if (XFS_IS_GQUOTA_ON(mp) && !qi->qi_gquotaip) { + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_GROUP, &qi->qi_gquotaip); + if (error) + return error; + } + + if (XFS_IS_PQUOTA_ON(mp) && !qi->qi_pquotaip) { + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_PROJ, &qi->qi_pquotaip); + if (error) + return error; + } + + return 0; +} + +/* + * Add QUOTABIT to sb_versionnum and initialize qflags in preparation for + * creating quota files on a metadir filesystem. + */ +STATIC int +xfs_qm_prep_metadir_sb( + struct xfs_mount *mp) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp); + if (error) + return error; + + spin_lock(&mp->m_sb_lock); + + xfs_add_quota(mp); + + /* qflags will get updated fully _after_ quotacheck */ + mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT; + + spin_unlock(&mp->m_sb_lock); + xfs_log_sb(tp); + + return xfs_trans_commit(tp); +} + +/* + * Load existing quota inodes or create them. Since this is a V5 filesystem, + * we don't have to deal with the grp/prjquota switcheroo thing from V4. 
+ */ +STATIC int +xfs_qm_init_metadir_qinos( + struct xfs_mount *mp) +{ + struct xfs_quotainfo *qi = mp->m_quotainfo; + int error; + + if (!xfs_has_quota(mp)) { + error = xfs_qm_prep_metadir_sb(mp); + if (error) + return error; + } + + error = xfs_qm_load_metadir_qinos(mp, qi); + if (error) + goto out_err; + + error = xfs_qm_create_metadir_qinos(mp, qi); + if (error) + goto out_err; + + /* The only user of the quota dir inode is online fsck */ +#if !IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) + xfs_irele(qi->qi_dirip); + qi->qi_dirip = NULL; +#endif + return 0; +out_err: + xfs_qm_destroy_quotainos(mp->m_quotainfo); + return error; +} + /* * This initializes all the quota information that's kept in the * mount structure @@ -640,7 +854,10 @@ xfs_qm_init_quotainfo( * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. */ - error = xfs_qm_init_quotainos(mp); + if (xfs_has_metadir(mp)) + error = xfs_qm_init_metadir_qinos(mp); + else + error = xfs_qm_init_quotainos(mp); if (error) goto out_free_lru; @@ -733,6 +950,17 @@ xfs_qm_destroy_quotainfo( mp->m_quotainfo = NULL; } +static inline enum xfs_metafile_type +xfs_qm_metafile_type( + unsigned int flags) +{ + if (flags & XFS_QMOPT_UQUOTA) + return XFS_METAFILE_USRQUOTA; + else if (flags & XFS_QMOPT_GQUOTA) + return XFS_METAFILE_GRPQUOTA; + return XFS_METAFILE_PRJQUOTA; +} + /* * Create an inode and return with a reference already taken, but unlocked * This is how we create quota inodes @@ -744,6 +972,7 @@ xfs_qm_qino_alloc( unsigned int flags) { struct xfs_trans *tp; + enum xfs_metafile_type metafile_type = xfs_qm_metafile_type(flags); int error; bool need_alloc = true; @@ -777,9 +1006,10 @@ xfs_qm_qino_alloc( } } if (ino != NULLFSINO) { - error = xfs_iget(mp, NULL, ino, 0, 0, ipp); + error = xfs_metafile_iget(mp, ino, metafile_type, ipp); if (error) return error; + mp->m_sb.sb_gquotino = NULLFSINO; mp->m_sb.sb_pquotino = NULLFSINO; need_alloc = false; @@ -806,6 +1036,8 @@ xfs_qm_qino_alloc( xfs_trans_cancel(tp); return error; } + if (xfs_has_metadir(mp)) + xfs_metafile_set_iflag(tp, *ipp, metafile_type); } /* @@ -1108,6 +1340,10 @@ xfs_qm_quotacheck_dqadjust( return error; } + error = xfs_dquot_attach_buf(NULL, dqp); + if (error) + return error; + trace_xfs_dqadjust(dqp); /* @@ -1153,8 +1389,8 @@ xfs_qm_dqusage_adjust( void *data) { struct xfs_inode *ip; - xfs_qcnt_t nblks; - xfs_filblks_t rtblks = 0; /* total rt blks */ + xfs_filblks_t nblks, rtblks; + unsigned int lock_mode; int error; ASSERT(XFS_IS_QUOTA_ON(mp)); @@ -1189,20 +1425,23 @@ xfs_qm_dqusage_adjust( } } + /* Metadata directory files are not accounted to user-visible quotas. 
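[Editorial note] One detail in the xfs_qm_qino_alloc hunk above deserves emphasis: legacy quota inodes are now loaded through xfs_metafile_iget() with the expected metafile type, and freshly allocated ones get tagged via xfs_metafile_set_iflag() in the same transaction. A sketch of the load side; the type-mismatch behaviour is an assumption based on the helper's name, not spelled out in this hunk:

	/* Editorial sketch: load a user-quota inode as a typed metadata file. */
	static int example_load_uquota(struct xfs_mount *mp, xfs_ino_t ino,
			struct xfs_inode **ipp)
	{
		/*
		 * Presumably xfs_metafile_iget() verifies that the ondisk
		 * inode really is flagged XFS_METAFILE_USRQUOTA, where the
		 * old plain xfs_iget() accepted any inode at that number.
		 */
		return xfs_metafile_iget(mp, ino, XFS_METAFILE_USRQUOTA, ipp);
	}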
*/ + if (xfs_is_metadir_inode(ip)) + goto error0; + ASSERT(ip->i_delayed_blks == 0); + lock_mode = xfs_ilock_data_map_shared(ip); if (XFS_IS_REALTIME_INODE(ip)) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); - if (error) + if (error) { + xfs_iunlock(ip, lock_mode); goto error0; - - xfs_bmap_count_leaves(ifp, &rtblks); + } } - - nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks; + xfs_inode_count_blocks(tp, ip, &nblks, &rtblks); xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED); + xfs_iunlock(ip, lock_mode); /* * Add the (disk blocks and inode) resources occupied by this @@ -1287,11 +1526,17 @@ xfs_qm_flush_one( goto out_unlock; } - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_use_attached_buf(dqp, &bp); if (error) goto out_unlock; + if (!bp) { + error = -EFSCORRUPTED; + goto out_unlock; + } - xfs_buf_delwri_queue(bp, buffer_list); + error = xfs_qm_dqflush(dqp, bp); + if (!error) + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); out_unlock: xfs_dqunlock(dqp); @@ -1462,10 +1707,12 @@ xfs_qm_mount_quotas( uint sbf; /* - * If quotas on realtime volumes is not supported, we disable - * quotas immediately. + * If quotas on realtime volumes is not supported, disable quotas + * immediately. We only support rtquota if rtgroups are enabled to + * avoid problems with older kernels. */ - if (mp->m_sb.sb_rextents) { + if (mp->m_sb.sb_rextents && + (!xfs_has_rtgroups(mp) || xfs_has_zoned(mp))) { xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); mp->m_qflags = 0; goto write_changes; @@ -1533,7 +1780,7 @@ xfs_qm_mount_quotas( } if (error) { - xfs_warn(mp, "Failed to initialize disk quotas."); + xfs_warn(mp, "Failed to initialize disk quotas, err %d.", error); return; } } @@ -1552,27 +1799,26 @@ xfs_qm_qino_load( xfs_dqtype_t type, struct xfs_inode **ipp) { - xfs_ino_t ino = NULLFSINO; - - switch (type) { - case XFS_DQTYPE_USER: - ino = mp->m_sb.sb_uquotino; - break; - case XFS_DQTYPE_GROUP: - ino = mp->m_sb.sb_gquotino; - break; - case XFS_DQTYPE_PROJ: - ino = mp->m_sb.sb_pquotino; - break; - default: - ASSERT(0); - return -EFSCORRUPTED; - } - - if (ino == NULLFSINO) - return -ENOENT; - - return xfs_iget(mp, NULL, ino, 0, 0, ipp); + struct xfs_trans *tp; + struct xfs_inode *dp = NULL; + int error; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + if (xfs_has_metadir(mp)) { + error = xfs_dqinode_load_parent(tp, &dp); + if (error) + goto out_cancel; + } + + error = xfs_dqinode_load(tp, dp, type, ipp); + if (dp) + xfs_irele(dp); +out_cancel: + xfs_trans_cancel(tp); + return error; } /* @@ -1666,24 +1912,6 @@ error_rele: } STATIC void -xfs_qm_destroy_quotainos( - struct xfs_quotainfo *qi) -{ - if (qi->qi_uquotaip) { - xfs_irele(qi->qi_uquotaip); - qi->qi_uquotaip = NULL; /* paranoia */ - } - if (qi->qi_gquotaip) { - xfs_irele(qi->qi_gquotaip); - qi->qi_gquotaip = NULL; - } - if (qi->qi_pquotaip) { - xfs_irele(qi->qi_pquotaip); - qi->qi_pquotaip = NULL; - } -} - -STATIC void xfs_qm_dqfree_one( struct xfs_dquot *dqp) { @@ -1735,6 +1963,8 @@ xfs_qm_vop_dqalloc( if (!XFS_IS_QUOTA_ON(mp)) return 0; + ASSERT(!xfs_is_metadir_inode(ip)); + lockflags = XFS_ILOCK_EXCL; xfs_ilock(ip, lockflags); @@ -1858,23 +2088,29 @@ xfs_qm_vop_chown( struct xfs_dquot *newdq) { struct xfs_dquot *prevdq; - uint bfield = XFS_IS_REALTIME_INODE(ip) ? 
- XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; - + xfs_filblks_t dblocks, rblocks; + bool isrt = XFS_IS_REALTIME_INODE(ip); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(XFS_IS_QUOTA_ON(ip->i_mount)); + ASSERT(!xfs_is_metadir_inode(ip)); /* old dquot */ prevdq = *IO_olddq; ASSERT(prevdq); ASSERT(prevdq != newdq); - xfs_trans_mod_ino_dquot(tp, ip, prevdq, bfield, -(ip->i_nblocks)); + xfs_inode_count_blocks(tp, ip, &dblocks, &rblocks); + + xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_BCOUNT, + -(xfs_qcnt_t)dblocks); + xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_RTBCOUNT, + -(xfs_qcnt_t)rblocks); xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_ICOUNT, -1); /* the sparkling new dquot */ - xfs_trans_mod_ino_dquot(tp, ip, newdq, bfield, ip->i_nblocks); + xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_BCOUNT, dblocks); + xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_RTBCOUNT, rblocks); xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_ICOUNT, 1); /* @@ -1884,7 +2120,8 @@ xfs_qm_vop_chown( * (having already bumped up the real counter) so that we don't have * any reservation to give back when we commit. */ - xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS, + xfs_trans_mod_dquot(tp, newdq, + isrt ? XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS, -ip->i_delayed_blks); /* @@ -1896,8 +2133,13 @@ xfs_qm_vop_chown( */ tp->t_flags |= XFS_TRANS_DIRTY; xfs_dqlock(prevdq); - ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks); - prevdq->q_blk.reserved -= ip->i_delayed_blks; + if (isrt) { + ASSERT(prevdq->q_rtb.reserved >= ip->i_delayed_blks); + prevdq->q_rtb.reserved -= ip->i_delayed_blks; + } else { + ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks); + prevdq->q_blk.reserved -= ip->i_delayed_blks; + } xfs_dqunlock(prevdq); /* @@ -1951,6 +2193,7 @@ xfs_qm_vop_create_dqattach( return; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + ASSERT(!xfs_is_metadir_inode(ip)); if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); @@ -1981,6 +2224,8 @@ xfs_inode_near_dquot_enforcement( xfs_dqtype_t type) { struct xfs_dquot *dqp; + struct xfs_dquot_res *res; + struct xfs_dquot_pre *pre; int64_t freesp; /* We only care for quotas that are enabled and enforced. */ @@ -1989,21 +2234,30 @@ xfs_inode_near_dquot_enforcement( return false; if (xfs_dquot_res_over_limits(&dqp->q_ino) || + xfs_dquot_res_over_limits(&dqp->q_blk) || xfs_dquot_res_over_limits(&dqp->q_rtb)) return true; + if (XFS_IS_REALTIME_INODE(ip)) { + res = &dqp->q_rtb; + pre = &dqp->q_rtb_prealloc; + } else { + res = &dqp->q_blk; + pre = &dqp->q_blk_prealloc; + } + /* For space on the data device, check the various thresholds. 
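[Editorial note] The chown hunk above replaces the single bfield selection with unconditional updates to both the data and realtime block counters, and the delalloc-reservation giveback now picks the matching reservation pool. The if/else over q_rtb/q_blk reads as one pattern; a sketch, assuming only the xfs_dquot_res layout visible in this patch:

	/* choose the reservation pool matching the inode's allocation device */
	struct xfs_dquot_res	*res = isrt ? &prevdq->q_rtb : &prevdq->q_blk;

	xfs_dqlock(prevdq);
	ASSERT(res->reserved >= ip->i_delayed_blks);
	res->reserved -= ip->i_delayed_blks;	/* hand delalloc back */
	xfs_dqunlock(prevdq);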
*/ - if (!dqp->q_prealloc_hi_wmark) + if (!pre->q_prealloc_hi_wmark) return false; - if (dqp->q_blk.reserved < dqp->q_prealloc_lo_wmark) + if (res->reserved < pre->q_prealloc_lo_wmark) return false; - if (dqp->q_blk.reserved >= dqp->q_prealloc_hi_wmark) + if (res->reserved >= pre->q_prealloc_hi_wmark) return true; - freesp = dqp->q_prealloc_hi_wmark - dqp->q_blk.reserved; - if (freesp < dqp->q_low_space[XFS_QLOWSP_5_PCNT]) + freesp = pre->q_prealloc_hi_wmark - res->reserved; + if (freesp < pre->q_low_space[XFS_QLOWSP_5_PCNT]) return true; return false; diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index e919c7f62f57..35b64bc3a7a8 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -55,6 +55,7 @@ struct xfs_quotainfo { struct xfs_inode *qi_uquotaip; /* user quota inode */ struct xfs_inode *qi_gquotaip; /* group quota inode */ struct xfs_inode *qi_pquotaip; /* project quota inode */ + struct xfs_inode *qi_dirip; /* quota metadir */ struct list_lru qi_lru; int qi_dquots; struct mutex qi_quotaofflock;/* to serialize quotaoff */ diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index a11436579877..245d754f382a 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -19,28 +19,40 @@ STATIC void xfs_fill_statvfs_from_dquot( struct kstatfs *statp, + struct xfs_inode *ip, struct xfs_dquot *dqp) { + struct xfs_dquot_res *blkres = &dqp->q_blk; uint64_t limit; - limit = dqp->q_blk.softlimit ? - dqp->q_blk.softlimit : - dqp->q_blk.hardlimit; - if (limit && statp->f_blocks > limit) { - statp->f_blocks = limit; - statp->f_bfree = statp->f_bavail = - (statp->f_blocks > dqp->q_blk.reserved) ? - (statp->f_blocks - dqp->q_blk.reserved) : 0; + if (XFS_IS_REALTIME_MOUNT(ip->i_mount) && + (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) + blkres = &dqp->q_rtb; + + limit = blkres->softlimit ? + blkres->softlimit : + blkres->hardlimit; + if (limit) { + uint64_t remaining = 0; + + if (limit > blkres->reserved) + remaining = limit - blkres->reserved; + + statp->f_blocks = min(statp->f_blocks, limit); + statp->f_bfree = min(statp->f_bfree, remaining); } limit = dqp->q_ino.softlimit ? dqp->q_ino.softlimit : dqp->q_ino.hardlimit; - if (limit && statp->f_files > limit) { - statp->f_files = limit; - statp->f_ffree = - (statp->f_files > dqp->q_ino.reserved) ? - (statp->f_files - dqp->q_ino.reserved) : 0; + if (limit) { + uint64_t remaining = 0; + + if (limit > dqp->q_ino.reserved) + remaining = limit - dqp->q_ino.reserved; + + statp->f_files = min(statp->f_files, limit); + statp->f_ffree = min(statp->f_ffree, remaining); } } @@ -61,11 +73,33 @@ xfs_qm_statvfs( struct xfs_dquot *dqp; if (!xfs_qm_dqget(mp, ip->i_projid, XFS_DQTYPE_PROJ, false, &dqp)) { - xfs_fill_statvfs_from_dquot(statp, dqp); + xfs_fill_statvfs_from_dquot(statp, ip, dqp); xfs_qm_dqput(dqp); } } +STATIC int +xfs_qm_validate_state_change( + struct xfs_mount *mp, + uint uqd, + uint gqd, + uint pqd) +{ + int state; + + /* Is quota state changing? 
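[Editorial note] With the res/pre pair chosen, the enforcement probe above runs one watermark ladder for either device class. Written out as a standalone predicate (an editorial consolidation, not a helper this patch adds):

	static bool example_near_enforcement(const struct xfs_dquot_res *res,
			const struct xfs_dquot_pre *pre)
	{
		int64_t	freesp;

		if (!pre->q_prealloc_hi_wmark)
			return false;		/* no limits configured */
		if (res->reserved < pre->q_prealloc_lo_wmark)
			return false;		/* well below the low mark */
		if (res->reserved >= pre->q_prealloc_hi_wmark)
			return true;		/* at or past the limit */
		freesp = pre->q_prealloc_hi_wmark - res->reserved;
		return freesp < pre->q_low_space[XFS_QLOWSP_5_PCNT];
	}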
*/ + state = ((uqd && !XFS_IS_UQUOTA_ON(mp)) || + (!uqd && XFS_IS_UQUOTA_ON(mp)) || + (gqd && !XFS_IS_GQUOTA_ON(mp)) || + (!gqd && XFS_IS_GQUOTA_ON(mp)) || + (pqd && !XFS_IS_PQUOTA_ON(mp)) || + (!pqd && XFS_IS_PQUOTA_ON(mp))); + + return state && + (xfs_dev_is_read_only(mp, "changing quota state") || + xfs_has_norecovery(mp)); +} + int xfs_qm_newmount( xfs_mount_t *mp, @@ -85,24 +119,25 @@ xfs_qm_newmount( } /* - * If the device itself is read-only, we can't allow - * the user to change the state of quota on the mount - - * this would generate a transaction on the ro device, - * which would lead to an I/O error and shutdown + * If the device itself is read-only and/or in norecovery + * mode, we can't allow the user to change the state of + * quota on the mount - this would generate a transaction + * on the ro device, which would lead to an I/O error and + * shutdown. */ - if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || - (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || - (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || - (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) || - (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || - (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) && - xfs_dev_is_read_only(mp, "changing quota state")) { - xfs_warn(mp, "please mount with%s%s%s%s.", - (!quotaondisk ? "out quota" : ""), - (uquotaondisk ? " usrquota" : ""), - (gquotaondisk ? " grpquota" : ""), - (pquotaondisk ? " prjquota" : "")); + if (xfs_qm_validate_state_change(mp, uquotaondisk, + gquotaondisk, pquotaondisk)) { + + if (xfs_has_metadir(mp)) + xfs_warn(mp, + "metadir enabled, please mount without any quota mount options"); + else + xfs_warn(mp, "please mount with%s%s%s%s.", + (!quotaondisk ? "out quota" : ""), + (uquotaondisk ? " usrquota" : ""), + (gquotaondisk ? " grpquota" : ""), + (pquotaondisk ? " prjquota" : "")); return -EPERM; } @@ -135,3 +170,21 @@ xfs_qm_newmount( return 0; } + +/* + * If the sysadmin didn't provide any quota mount options, restore the quota + * accounting and enforcement state from the ondisk superblock. Only do this + * for metadir filesystems because this is a behavior change. + */ +void +xfs_qm_resume_quotaon( + struct xfs_mount *mp) +{ + if (!xfs_has_metadir(mp)) + return; + if (xfs_has_norecovery(mp)) + return; + + mp->m_qflags = mp->m_sb.sb_qflags & (XFS_ALL_QUOTA_ACCT | + XFS_ALL_QUOTA_ENFD); +} diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 4eda50ae2d1c..0c78f30fa4a3 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -427,19 +427,6 @@ xfs_qm_scall_getquota_fill_qc( dst->d_ino_timer = 0; dst->d_rt_spc_timer = 0; } - -#ifdef DEBUG - if (xfs_dquot_is_enforced(dqp) && dqp->q_id != 0) { - if ((dst->d_space > dst->d_spc_softlimit) && - (dst->d_spc_softlimit > 0)) { - ASSERT(dst->d_spc_timer != 0); - } - if ((dst->d_ino_count > dqp->q_ino.softlimit) && - (dqp->q_ino.softlimit > 0)) { - ASSERT(dst->d_ino_timer != 0); - } - } -#endif } /* Return the quota information for the dquot matching id. 
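[Editorial note] The six-clause comparison in xfs_qm_validate_state_change above is simply "does any ondisk quota flag disagree with the incore one". Assuming the *ondisk arguments are normalised to 0 or 1 (which this hunk does not show), an equivalent form is:

	bool changing = (uqd != !!XFS_IS_UQUOTA_ON(mp)) ||
			(gqd != !!XFS_IS_GQUOTA_ON(mp)) ||
			(pqd != !!XFS_IS_PQUOTA_ON(mp));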
*/ diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 23d71a55bbc0..105e6eb57620 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -96,7 +96,8 @@ extern void xfs_trans_free_dqinfo(struct xfs_trans *); extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, uint, int64_t); extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); -extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); +void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *tp, + bool already_locked); int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip, int64_t dblocks, int64_t rblocks, bool force); extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, @@ -120,10 +121,12 @@ extern void xfs_qm_dqdetach(struct xfs_inode *); extern void xfs_qm_dqrele(struct xfs_dquot *); extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); +void xfs_qm_resume_quotaon(struct xfs_mount *mp); extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *); bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type); +int xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks); # ifdef CONFIG_XFS_LIVE_HOOKS void xfs_trans_mod_ino_dquot(struct xfs_trans *tp, struct xfs_inode *ip, @@ -166,7 +169,7 @@ static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp, { } #define xfs_trans_apply_dquot_deltas(tp) -#define xfs_trans_unreserve_and_mod_dquots(tp) +#define xfs_trans_unreserve_and_mod_dquots(tp, a) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip, int64_t dblocks, int64_t rblocks, bool force) @@ -197,11 +200,17 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #define xfs_qm_dqrele(d) do { (d) = (d); } while(0) #define xfs_qm_statvfs(ip, s) do { } while(0) #define xfs_qm_newmount(mp, a, b) (0) +#define xfs_qm_resume_quotaon(mp) ((void)0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) #define xfs_inode_near_dquot_enforcement(ip, type) (false) +static inline int xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) +{ + return 0; +} + # ifdef CONFIG_XFS_LIVE_HOOKS # define xfs_dqtrx_hook_enable() ((void)0) # define xfs_dqtrx_hook_disable() ((void)0) @@ -209,12 +218,6 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #endif /* CONFIG_XFS_QUOTA */ -static inline int -xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) -{ - return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); -} - static inline void xfs_quota_unreserve_blkres(struct xfs_inode *ip, uint64_t blocks) { diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 27398512b179..076501123d89 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -23,6 +23,7 @@ #include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_trace.h" +#include "xfs_rtgroup.h" struct kmem_cache *xfs_cui_cache; struct kmem_cache *xfs_cud_cache; @@ -77,6 +78,11 @@ xfs_cui_item_size( *nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents); } +unsigned int xfs_cui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_cui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given cui log item. 
We use only 1 iovec, and we point that @@ -94,8 +100,9 @@ xfs_cui_item_format( ASSERT(atomic_read(&cuip->cui_next_extent) == cuip->cui_format.cui_nextents); + ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT); - cuip->cui_format.cui_type = XFS_LI_CUI; + cuip->cui_format.cui_type = lip->li_type; cuip->cui_format.cui_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format, @@ -138,12 +145,14 @@ xfs_cui_item_release( STATIC struct xfs_cui_log_item * xfs_cui_init( struct xfs_mount *mp, + unsigned short item_type, uint nextents) - { struct xfs_cui_log_item *cuip; ASSERT(nextents > 0); + ASSERT(item_type == XFS_LI_CUI || item_type == XFS_LI_CUI_RT); + if (nextents > XFS_CUI_MAX_FAST_EXTENTS) cuip = kzalloc(xfs_cui_log_item_sizeof(nextents), GFP_KERNEL | __GFP_NOFAIL); @@ -151,7 +160,7 @@ xfs_cui_init( cuip = kmem_cache_zalloc(xfs_cui_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); + xfs_log_item_init(mp, &cuip->cui_item, item_type, &xfs_cui_item_ops); cuip->cui_format.cui_nextents = nextents; cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; atomic_set(&cuip->cui_next_extent, 0); @@ -175,6 +184,11 @@ xfs_cud_item_size( *nbytes += sizeof(struct xfs_cud_log_format); } +unsigned int xfs_cud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_cud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given cud log item. We use only 1 iovec, and we point that @@ -190,7 +204,9 @@ xfs_cud_item_format( struct xfs_cud_log_item *cudp = CUD_ITEM(lip); struct xfs_log_iovec *vecp = NULL; - cudp->cud_format.cud_type = XFS_LI_CUD; + ASSERT(lip->li_type == XFS_LI_CUD || lip->li_type == XFS_LI_CUD_RT); + + cudp->cud_format.cud_type = lip->li_type; cudp->cud_format.cud_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format, @@ -234,6 +250,14 @@ static inline struct xfs_refcount_intent *ci_entry(const struct list_head *e) return list_entry(e, struct xfs_refcount_intent, ri_list); } +static inline bool +xfs_cui_item_isrt(const struct xfs_log_item *lip) +{ + ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT); + + return lip->li_type == XFS_LI_CUI_RT; +} + /* Sort refcount intents by AG. */ static int xfs_refcount_update_diff_items( @@ -244,7 +268,7 @@ xfs_refcount_update_diff_items( struct xfs_refcount_intent *ra = ci_entry(a); struct xfs_refcount_intent *rb = ci_entry(b); - return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno; + return ra->ri_group->xg_gno - rb->ri_group->xg_gno; } /* Log refcount updates in the intent item. 
*/ @@ -282,18 +306,20 @@ xfs_refcount_update_log_item( } static struct xfs_log_item * -xfs_refcount_update_create_intent( +__xfs_refcount_update_create_intent( struct xfs_trans *tp, struct list_head *items, unsigned int count, - bool sort) + bool sort, + unsigned short item_type) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); + struct xfs_cui_log_item *cuip; struct xfs_refcount_intent *ri; ASSERT(count > 0); + cuip = xfs_cui_init(mp, item_type, count); if (sort) list_sort(mp, items, xfs_refcount_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -301,6 +327,23 @@ xfs_refcount_update_create_intent( return &cuip->cui_item; } +static struct xfs_log_item * +xfs_refcount_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_refcount_update_create_intent(tp, items, count, sort, + XFS_LI_CUI); +} + +static inline unsigned short +xfs_cud_type_from_cui(const struct xfs_cui_log_item *cuip) +{ + return xfs_cui_item_isrt(&cuip->cui_item) ? XFS_LI_CUD_RT : XFS_LI_CUD; +} + /* Get an CUD so we can process all the deferred refcount updates. */ static struct xfs_log_item * xfs_refcount_update_create_done( @@ -312,8 +355,8 @@ xfs_refcount_update_create_done( struct xfs_cud_log_item *cudp; cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, - &xfs_cud_item_ops); + xfs_log_item_init(tp->t_mountp, &cudp->cud_item, + xfs_cud_type_from_cui(cuip), &xfs_cud_item_ops); cudp->cud_cuip = cuip; cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; @@ -328,10 +371,20 @@ xfs_refcount_defer_add( { struct xfs_mount *mp = tp->t_mountp; - trace_xfs_refcount_defer(mp, ri); + /* + * Deferred refcount updates for the realtime and data sections must + * use separate transactions to finish deferred work because updates to + * realtime metadata files can lock AGFs to allocate btree blocks and + * we don't want that mixing with the AGF locks taken to finish data + * section updates. + */ + ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock, + ri->ri_realtime ? XG_TYPE_RTG : XG_TYPE_AG); - ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock); - xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); + trace_xfs_refcount_defer(mp, ri); + xfs_defer_add(tp, &ri->ri_list, ri->ri_realtime ? + &xfs_rtrefcount_update_defer_type : + &xfs_refcount_update_defer_type); } /* Cancel a deferred refcount update. 
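[Editorial note] Given the routing rule in xfs_refcount_defer_add above, a caller queueing a realtime refcount update only fills in the intent and sets ri_realtime; the helper picks the group reference and defer type. A hypothetical sketch (allocation flags and the rtbno/len values are assumptions, not taken from this patch):

	struct xfs_refcount_intent	*ri;

	ri = kmem_cache_alloc(xfs_refcount_intent_cache,
			GFP_KERNEL | __GFP_NOFAIL);
	ri->ri_type = XFS_REFCOUNT_DECREASE;
	ri->ri_startblock = rtbno;	/* hypothetical rt block number */
	ri->ri_blockcount = len;	/* hypothetical length */
	ri->ri_realtime = true;		/* routes to the _RT defer type */
	xfs_refcount_defer_add(tp, ri);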
*/ @@ -341,7 +394,7 @@ xfs_refcount_update_cancel_item( { struct xfs_refcount_intent *ri = ci_entry(item); - xfs_perag_intent_put(ri->ri_pag); + xfs_group_intent_put(ri->ri_group); kmem_cache_free(xfs_refcount_intent_cache, ri); } @@ -381,7 +434,7 @@ xfs_refcount_finish_one_cleanup( return; agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); - if (error) + if (error && agbp) xfs_trans_brelse(tp, agbp); } @@ -397,6 +450,7 @@ xfs_refcount_update_abort_intent( static inline bool xfs_cui_validate_phys( struct xfs_mount *mp, + bool isrt, struct xfs_phys_extent *pmap) { if (!xfs_has_reflink(mp)) @@ -415,6 +469,9 @@ xfs_cui_validate_phys( return false; } + if (isrt) + return xfs_verify_rtbext(mp, pmap->pe_startblock, pmap->pe_len); + return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); } @@ -422,6 +479,7 @@ static inline void xfs_cui_recover_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp, + bool isrt, struct xfs_phys_extent *pmap) { struct xfs_refcount_intent *ri; @@ -431,7 +489,9 @@ xfs_cui_recover_work( ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; ri->ri_startblock = pmap->pe_startblock; ri->ri_blockcount = pmap->pe_len; - ri->ri_pag = xfs_perag_intent_get(mp, pmap->pe_startblock); + ri->ri_group = xfs_group_intent_get(mp, pmap->pe_startblock, + isrt ? XG_TYPE_RTG : XG_TYPE_AG); + ri->ri_realtime = isrt; xfs_defer_add_item(dfp, &ri->ri_list); } @@ -450,6 +510,7 @@ xfs_refcount_recover_work( struct xfs_cui_log_item *cuip = CUI_ITEM(lip); struct xfs_trans *tp; struct xfs_mount *mp = lip->li_log->l_mp; + bool isrt = xfs_cui_item_isrt(lip); int i; int error = 0; @@ -459,7 +520,7 @@ xfs_refcount_recover_work( * just toss the CUI. */ for (i = 0; i < cuip->cui_format.cui_nextents; i++) { - if (!xfs_cui_validate_phys(mp, + if (!xfs_cui_validate_phys(mp, isrt, &cuip->cui_format.cui_extents[i])) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &cuip->cui_format, @@ -467,7 +528,8 @@ xfs_refcount_recover_work( return -EFSCORRUPTED; } - xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]); + xfs_cui_recover_work(mp, dfp, isrt, + &cuip->cui_format.cui_extents[i]); } /* @@ -514,10 +576,13 @@ xfs_refcount_relog_intent( struct xfs_phys_extent *pmap; unsigned int count; + ASSERT(intent->li_type == XFS_LI_CUI || + intent->li_type == XFS_LI_CUI_RT); + count = CUI_ITEM(intent)->cui_format.cui_nextents; pmap = CUI_ITEM(intent)->cui_format.cui_extents; - cuip = xfs_cui_init(tp->t_mountp, count); + cuip = xfs_cui_init(tp->t_mountp, intent->li_type, count); memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); @@ -537,6 +602,71 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { .relog_intent = xfs_refcount_relog_intent, }; +#ifdef CONFIG_XFS_RT +static struct xfs_log_item * +xfs_rtrefcount_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_refcount_update_create_intent(tp, items, count, sort, + XFS_LI_CUI_RT); +} + +/* Process a deferred realtime refcount update. */ +STATIC int +xfs_rtrefcount_update_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + struct xfs_refcount_intent *ri = ci_entry(item); + int error; + + error = xfs_rtrefcount_finish_one(tp, ri, state); + + /* Did we run out of reservation? Requeue what we didn't finish. 
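[Editorial note] Recovery validation gains the same realtime/data split: an intent recovered from an XFS_LI_CUI_RT item must describe a well-formed realtime extent, and a data-device intent a well-formed AG extent; anything else is rejected as log corruption rather than replayed. In sketch form, using the verifiers named in the hunk above:

	/* Editorial sketch of the geometry check used during CUI recovery. */
	static bool example_cui_phys_ok(struct xfs_mount *mp, bool isrt,
			const struct xfs_phys_extent *pmap)
	{
		if (isrt)
			return xfs_verify_rtbext(mp, pmap->pe_startblock,
					pmap->pe_len);
		return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len);
	}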
*/ + if (!error && ri->ri_blockcount > 0) { + ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || + ri->ri_type == XFS_REFCOUNT_DECREASE); + return -EAGAIN; + } + + xfs_refcount_update_cancel_item(item); + return error; +} + +/* Clean up after calling xfs_rtrefcount_finish_one. */ +STATIC void +xfs_rtrefcount_finish_one_cleanup( + struct xfs_trans *tp, + struct xfs_btree_cur *rcur, + int error) +{ + if (rcur) + xfs_btree_del_cursor(rcur, error); +} + +const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = { + .name = "rtrefcount", + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .create_intent = xfs_rtrefcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_rtrefcount_update_finish_item, + .finish_cleanup = xfs_rtrefcount_finish_one_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, + .recover_work = xfs_refcount_recover_work, + .relog_intent = xfs_refcount_relog_intent, +}; +#else +const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = { + .name = "rtrefcount", +}; +#endif /* CONFIG_XFS_RT */ + STATIC bool xfs_cui_item_match( struct xfs_log_item *lip, @@ -602,7 +732,7 @@ xlog_recover_cui_commit_pass2( return -EFSCORRUPTED; } - cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents); xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); @@ -616,6 +746,61 @@ const struct xlog_recover_item_ops xlog_cui_item_ops = { .commit_pass2 = xlog_recover_cui_commit_pass2, }; +#ifdef CONFIG_XFS_RT +STATIC int +xlog_recover_rtcui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_cui_log_item *cuip; + struct xfs_cui_log_format *cui_formatp; + size_t len; + + cui_formatp = item->ri_buf[0].i_addr; + + if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents); + xfs_cui_copy_format(&cuip->cui_format, cui_formatp); + atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); + + xlog_recover_intent_item(log, &cuip->cui_item, lsn, + &xfs_rtrefcount_update_defer_type); + return 0; +} +#else +STATIC int +xlog_recover_rtcui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; +} +#endif + +const struct xlog_recover_item_ops xlog_rtcui_item_ops = { + .item_type = XFS_LI_CUI_RT, + .commit_pass2 = xlog_recover_rtcui_commit_pass2, +}; + /* * This routine is called when an CUD format structure is found in a committed * transaction in the log. 
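[Editorial note] The requeue convention used by the realtime finish_item above is shared with the data-device path: if the per-transaction reservation ran out mid-intent, finish_one leaves ri_blockcount nonzero and the item returns -EAGAIN so the defer machinery can roll the transaction and call it again. Annotated sketch of that contract:

	error = xfs_rtrefcount_finish_one(tp, ri, state);
	if (!error && ri->ri_blockcount > 0) {
		/* only continuation-capable types may be left unfinished */
		ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE ||
		       ri->ri_type == XFS_REFCOUNT_DECREASE);
		return -EAGAIN;		/* roll, then retry this intent */
	}
	xfs_refcount_update_cancel_item(item);	/* done: drop group ref, free */
	return error;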
Its purpose is to cancel the corresponding CUI if it @@ -647,3 +832,33 @@ const struct xlog_recover_item_ops xlog_cud_item_ops = { .item_type = XFS_LI_CUD, .commit_pass2 = xlog_recover_cud_commit_pass2, }; + +#ifdef CONFIG_XFS_RT +STATIC int +xlog_recover_rtcud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_cud_log_format *cud_formatp; + + cud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_CUI_RT, + cud_formatp->cud_cui_id); + return 0; +} +#else +# define xlog_recover_rtcud_commit_pass2 xlog_recover_rtcui_commit_pass2 +#endif + +const struct xlog_recover_item_ops xlog_rtcud_item_ops = { + .item_type = XFS_LI_CUD_RT, + .commit_pass2 = xlog_recover_rtcud_commit_pass2, +}; diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index bfee8f30c63c..0fc3f493342b 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -76,4 +76,7 @@ struct xfs_refcount_intent; void xfs_refcount_defer_add(struct xfs_trans *tp, struct xfs_refcount_intent *ri); +unsigned int xfs_cui_log_space(unsigned int nr); +unsigned int xfs_cud_log_space(void); + #endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 5bf6682e701b..ad3bcb76d805 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -30,6 +30,10 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_health.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_rtgroup.h" +#include "xfs_metafile.h" /* * Copy on Write of Shared Blocks @@ -120,38 +124,93 @@ */ /* - * Given an AG extent, find the lowest-numbered run of shared blocks - * within that range and return the range in fbno/flen. If - * find_end_of_shared is true, return the longest contiguous extent of - * shared blocks. If there are no shared extents, fbno and flen will - * be set to NULLAGBLOCK and 0, respectively. + * Given a file mapping for the data device, find the lowest-numbered run of + * shared blocks within that mapping and return it in shared_offset/shared_len. + * The offset is relative to the start of irec. + * + * If find_end_of_shared is true, return the longest contiguous extent of shared + * blocks. 
If there are no shared extents, shared_offset and shared_len will be + * set to 0; */ static int xfs_reflink_find_shared( - struct xfs_perag *pag, + struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agblock_t agbno, - xfs_extlen_t aglen, - xfs_agblock_t *fbno, - xfs_extlen_t *flen, + const struct xfs_bmbt_irec *irec, + xfs_extlen_t *shared_offset, + xfs_extlen_t *shared_len, bool find_end_of_shared) { struct xfs_buf *agbp; + struct xfs_perag *pag; struct xfs_btree_cur *cur; int error; + xfs_agblock_t orig_bno, found_bno; + + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); + orig_bno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) - return error; + goto out; + + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); + error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount, + &found_bno, shared_len, find_end_of_shared); + xfs_btree_del_cursor(cur, error); + xfs_trans_brelse(tp, agbp); + + if (!error && *shared_len) + *shared_offset = found_bno - orig_bno; +out: + xfs_perag_put(pag); + return error; +} - cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag); +/* + * Given a file mapping for the rt device, find the lowest-numbered run of + * shared blocks within that mapping and return it in shared_offset/shared_len. + * The offset is relative to the start of irec. + * + * If find_end_of_shared is true, return the longest contiguous extent of shared + * blocks. If there are no shared extents, shared_offset and shared_len will be + * set to 0; + */ +static int +xfs_reflink_find_rtshared( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_bmbt_irec *irec, + xfs_extlen_t *shared_offset, + xfs_extlen_t *shared_len, + bool find_end_of_shared) +{ + struct xfs_rtgroup *rtg; + struct xfs_btree_cur *cur; + xfs_rgblock_t orig_bno; + xfs_agblock_t found_bno; + int error; - error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, - find_end_of_shared); + BUILD_BUG_ON(NULLRGBLOCK != NULLAGBLOCK); + /* + * Note: this uses the not quite correct xfs_agblock_t type because + * xfs_refcount_find_shared is shared between the RT and data device + * refcount code. 
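[Editorial note] Callers of the rewritten lookup above interpret the two out-parameters as follows; note the sentinel change from NULLAGBLOCK to a zero length. A usage sketch (irec and the surrounding function are assumed context):

	xfs_extlen_t	shared_offset, shared_len;
	int		error;

	error = xfs_reflink_find_shared(mp, NULL, &irec, &shared_offset,
			&shared_len, true);
	if (error)
		return error;

	if (!shared_len) {
		/* no shared blocks anywhere in this mapping */
	} else if (!shared_offset) {
		/* mapping starts shared: first shared_len blocks */
	} else {
		/* first shared_offset blocks unshared; sharing starts after */
	}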
+ */ + orig_bno = xfs_rtb_to_rgbno(mp, irec->br_startblock); + rtg = xfs_rtgroup_get(mp, xfs_rtb_to_rgno(mp, irec->br_startblock)); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT); + cur = xfs_rtrefcountbt_init_cursor(tp, rtg); + error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount, + &found_bno, shared_len, find_end_of_shared); xfs_btree_del_cursor(cur, error); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_REFCOUNT); + xfs_rtgroup_put(rtg); - xfs_trans_brelse(tp, agbp); + if (!error && *shared_len) + *shared_offset = found_bno - orig_bno; return error; } @@ -172,56 +231,48 @@ xfs_reflink_trim_around_shared( bool *shared) { struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - xfs_agblock_t agbno; - xfs_extlen_t aglen; - xfs_agblock_t fbno; - xfs_extlen_t flen; + xfs_extlen_t shared_offset, shared_len; int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) { + if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_written_extent(irec)) { *shared = false; return 0; } trace_xfs_reflink_trim_around_shared(ip, irec); - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); - agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); - aglen = irec->br_blockcount; - - error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen, - true); - xfs_perag_put(pag); + if (XFS_IS_REALTIME_INODE(ip)) + error = xfs_reflink_find_rtshared(mp, NULL, irec, + &shared_offset, &shared_len, true); + else + error = xfs_reflink_find_shared(mp, NULL, irec, + &shared_offset, &shared_len, true); if (error) return error; - *shared = false; - if (fbno == NULLAGBLOCK) { + if (!shared_len) { /* No shared blocks at all. */ - return 0; - } - - if (fbno == agbno) { + *shared = false; + } else if (!shared_offset) { /* - * The start of this extent is shared. Truncate the - * mapping at the end of the shared region so that a - * subsequent iteration starts at the start of the - * unshared region. + * The start of this mapping points to shared space. Truncate + * the mapping at the end of the shared region so that a + * subsequent iteration starts at the start of the unshared + * region. */ - irec->br_blockcount = flen; + irec->br_blockcount = shared_len; *shared = true; - return 0; + } else { + /* + * There's a shared region that doesn't start at the beginning + * of the mapping. Truncate the mapping at the start of the + * shared extent so that a subsequent iteration starts at the + * start of the shared region. + */ + irec->br_blockcount = shared_offset; + *shared = false; } - - /* - * There's a shared extent midway through this extent. - * Truncate the mapping at the start of the shared - * extent so that a subsequent iteration starts at the - * start of the shared region. 
- */ - irec->br_blockcount = fbno - agbno; return 0; } @@ -242,7 +293,7 @@ xfs_bmap_trim_cow( return xfs_reflink_trim_around_shared(ip, imap, shared); } -static int +int xfs_reflink_convert_cow_locked( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, @@ -389,20 +440,26 @@ xfs_reflink_fill_cow_hole( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; xfs_filblks_t resaligned; - xfs_extlen_t resblks; + unsigned int dblocks = 0, rblocks = 0; int nimaps; int error; bool found; resaligned = xfs_aligned_fsb_count(imap->br_startoff, imap->br_blockcount, xfs_get_cowextsz_hint(ip)); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + if (XFS_IS_REALTIME_INODE(ip)) { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resaligned; + } else { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + rblocks = 0; + } xfs_iunlock(ip, *lockmode); *lockmode = 0; - error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, - false, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, + rblocks, false, &tp); if (error) return error; @@ -571,6 +628,7 @@ xfs_reflink_cancel_cow_blocks( struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; + bool isrt = XFS_IS_REALTIME_INODE(ip); int error = 0; if (!xfs_inode_has_cow_data(ip)) @@ -593,17 +651,18 @@ xfs_reflink_cancel_cow_blocks( if (isnullstartblock(del.br_startblock)) { xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got, - &del); + &del, 0); } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); /* Free the CoW orphan record. */ - xfs_refcount_free_cow_extent(*tpp, del.br_startblock, - del.br_blockcount); + xfs_refcount_free_cow_extent(*tpp, isrt, + del.br_startblock, del.br_blockcount); error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL, - XFS_AG_RESV_NONE, 0); + XFS_AG_RESV_NONE, + isrt ? XFS_FREE_EXTENT_REALTIME : 0); if (error) break; @@ -687,6 +746,35 @@ out: return error; } +#ifdef CONFIG_XFS_QUOTA +/* + * Update quota accounting for a remapping operation. When we're remapping + * something from the CoW fork to the data fork, we must update the quota + * accounting for delayed allocations. For remapping from the data fork to the + * data fork, use regular block accounting. + */ +static inline void +xfs_reflink_update_quota( + struct xfs_trans *tp, + struct xfs_inode *ip, + bool is_cow, + int64_t blocks) +{ + unsigned int qflag; + + if (XFS_IS_REALTIME_INODE(ip)) { + qflag = is_cow ? XFS_TRANS_DQ_DELRTBCOUNT : + XFS_TRANS_DQ_RTBCOUNT; + } else { + qflag = is_cow ? XFS_TRANS_DQ_DELBCOUNT : + XFS_TRANS_DQ_BCOUNT; + } + xfs_trans_mod_dquot_byino(tp, ip, qflag, blocks); +} +#else +# define xfs_reflink_update_quota(tp, ip, is_cow, blocks) ((void)0) +#endif + /* * Remap part of the CoW fork into the data fork. * @@ -698,34 +786,19 @@ out: * requirements as low as possible. 
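[Editorial note] The CONFIG_XFS_QUOTA helper above (xfs_reflink_update_quota) folds four counters into two booleans. As a quick reference, drawn directly from the flag selection in that hunk:

	/*
	 * xfs_reflink_update_quota(tp, ip, is_cow, blocks) selects:
	 *
	 *                     data device             realtime device
	 *   is_cow == false   XFS_TRANS_DQ_BCOUNT     XFS_TRANS_DQ_RTBCOUNT
	 *   is_cow == true    XFS_TRANS_DQ_DELBCOUNT  XFS_TRANS_DQ_DELRTBCOUNT
	 *
	 * e.g. charging a remapped CoW extent to the delalloc counter:
	 */
	xfs_reflink_update_quota(tp, ip, true, del.br_blockcount);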
*/ STATIC int -xfs_reflink_end_cow_extent( +xfs_reflink_end_cow_extent_locked( + struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *offset_fsb, xfs_fileoff_t end_fsb) { struct xfs_iext_cursor icur; struct xfs_bmbt_irec got, del, data; - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); - unsigned int resblks; int nmaps; + bool isrt = XFS_IS_REALTIME_INODE(ip); int error; - resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, - XFS_TRANS_RESERVE, &tp); - if (error) - return error; - - /* - * Lock the inode. We have to ijoin without automatic unlock because - * the lead transaction is the refcountbt record deletion; the data - * fork update follows as a deferred log item. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - /* * In case of racing, overlapping AIO writes no COW extents might be * left by the time I/O completes for the loser of the race. In that @@ -734,7 +807,7 @@ xfs_reflink_end_cow_extent( if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) || got.br_startoff >= end_fsb) { *offset_fsb = end_fsb; - goto out_cancel; + return 0; } /* @@ -748,7 +821,7 @@ xfs_reflink_end_cow_extent( if (!xfs_iext_next_extent(ifp, &icur, &got) || got.br_startoff >= end_fsb) { *offset_fsb = end_fsb; - goto out_cancel; + return 0; } } del = got; @@ -757,14 +830,14 @@ xfs_reflink_end_cow_extent( error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_REFLINK_END_COW_CNT); if (error) - goto out_cancel; + return error; /* Grab the corresponding mapping in the data fork. */ nmaps = 1; error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data, &nmaps, 0); if (error) - goto out_cancel; + return error; /* We can only remap the smaller of the two extent sizes. */ data.br_blockcount = min(data.br_blockcount, del.br_blockcount); @@ -779,9 +852,8 @@ xfs_reflink_end_cow_extent( * or not), unmap the extent and drop its refcount. */ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); - xfs_refcount_decrease_extent(tp, &data); - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, - -data.br_blockcount); + xfs_refcount_decrease_extent(tp, isrt, &data); + xfs_reflink_update_quota(tp, ip, false, -data.br_blockcount); } else if (data.br_startblock == DELAYSTARTBLOCK) { int done; @@ -794,34 +866,62 @@ xfs_reflink_end_cow_extent( error = xfs_bunmapi(NULL, ip, data.br_startoff, data.br_blockcount, 0, 1, &done); if (error) - goto out_cancel; + return error; ASSERT(done); } /* Free the CoW orphan record. */ - xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); + xfs_refcount_free_cow_extent(tp, isrt, del.br_startblock, + del.br_blockcount); /* Map the new blocks into the data fork. */ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del); /* Charge this new data fork mapping to the on-disk quota. */ - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, - (long)del.br_blockcount); + xfs_reflink_update_quota(tp, ip, true, del.br_blockcount); /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - /* Update the caller about how much progress we made. */ *offset_fsb = del.br_startoff + del.br_blockcount; return 0; +} -out_cancel: - xfs_trans_cancel(tp); +/* + * Remap part of the CoW fork into the data fork. 
+ * + * We aim to remap the range starting at @offset_fsb and ending at @end_fsb + * into the data fork; this function will remap what it can (at the end of the + * range) and update @end_fsb appropriately. Each remap gets its own + * transaction because we can end up merging and splitting bmbt blocks for + * every remap operation and we'd like to keep the block reservation + * requirements as low as possible. + */ +STATIC int +xfs_reflink_end_cow_extent( + struct xfs_inode *ip, + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int resblks; + int error; + + resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_reflink_end_cow_extent_locked(tp, ip, offset_fsb, end_fsb); + if (error) + xfs_trans_cancel(tp); + else + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -885,6 +985,78 @@ xfs_reflink_end_cow( } /* + * Fully remap all of the file's data fork at once, which is the critical part + * in achieving atomic behaviour. + * The regular CoW end path does not use function as to keep the block + * reservation per transaction as low as possible. + */ +int +xfs_reflink_end_atomic_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int resblks; + + trace_xfs_reflink_end_cow(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + count); + + /* + * Each remapping operation could cause a btree split, so in the worst + * case that's one for each block. + */ + resblks = (end_fsb - offset_fsb) * + XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_atomic_ioend, resblks, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + while (end_fsb > offset_fsb && !error) { + error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb, + end_fsb); + } + if (error) { + trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); + goto out_cancel; + } + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* Compute the largest atomic write that we can complete through software. */ +xfs_extlen_t +xfs_reflink_max_atomic_cow( + struct xfs_mount *mp) +{ + /* We cannot do any atomic writes without out of place writes. */ + if (!xfs_can_sw_atomic_write(mp)) + return 0; + + /* + * Atomic write limits must always be a power-of-2, according to + * generic_atomic_write_valid. + */ + return rounddown_pow_of_two(xfs_calc_max_atomic_write_fsblocks(mp)); +} + +/* * Free all CoW staging blocks that are still referenced by the ondisk refcount * metadata. 
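[Editorial note] Two sizing rules in the atomic-CoW hunks above are worth restating: the tr_atomic_ioend transaction reserves for the worst case of one bmbt extent-add per remapped block, and the advertised software atomic write limit is the largest power of two that fits that worst case. A sketch using only the helpers named in these hunks (resblks and max_fsbs are local names for illustration):

	/* worst case: every block in [offset_fsb, end_fsb) splits the bmbt */
	resblks = (end_fsb - offset_fsb) *
			XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);

	/* largest power-of-2 write completable through software CoW */
	if (xfs_can_sw_atomic_write(mp))
		max_fsbs = rounddown_pow_of_two(
				xfs_calc_max_atomic_write_fsblocks(mp));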
The ondisk metadata does not track which inode created the * staging extent, so callers must ensure that there are no cached inodes with @@ -894,22 +1066,30 @@ int xfs_reflink_recover_cow( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; int error = 0; if (!xfs_has_reflink(mp)) return 0; - for_each_perag(mp, agno, pag) { - error = xfs_refcount_recover_cow_leftovers(mp, pag); + while ((pag = xfs_perag_next(mp, pag))) { + error = xfs_refcount_recover_cow_leftovers(pag_group(pag)); if (error) { xfs_perag_rele(pag); - break; + return error; } } - return error; + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + error = xfs_refcount_recover_cow_leftovers(rtg_group(rtg)); + if (error) { + xfs_rtgroup_rele(rtg); + return error; + } + } + + return 0; } /* @@ -1101,14 +1281,22 @@ out_error: static int xfs_reflink_ag_has_free_space( struct xfs_mount *mp, - xfs_agnumber_t agno) + struct xfs_inode *ip, + xfs_fsblock_t fsb) { struct xfs_perag *pag; + xfs_agnumber_t agno; int error = 0; if (!xfs_has_rmapbt(mp)) return 0; + if (XFS_IS_REALTIME_INODE(ip)) { + if (xfs_metafile_resv_critical(mp)) + return -ENOSPC; + return 0; + } + agno = XFS_FSB_TO_AGNO(mp, fsb); pag = xfs_perag_get(mp, agno); if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) || xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) @@ -1132,10 +1320,11 @@ xfs_reflink_remap_extent( struct xfs_trans *tp; xfs_off_t newlen; int64_t qdelta = 0; - unsigned int resblks; + unsigned int dblocks, rblocks, resblks; bool quota_reserved = true; bool smap_real; bool dmap_written = xfs_bmap_is_written_extent(dmap); + bool isrt = XFS_IS_REALTIME_INODE(ip); int iext_delta = 0; int nimaps; int error; @@ -1162,8 +1351,15 @@ xfs_reflink_remap_extent( * we're remapping. */ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + if (XFS_IS_REALTIME_INODE(ip)) { + dblocks = resblks; + rblocks = dmap->br_blockcount; + } else { + dblocks = resblks + dmap->br_blockcount; + rblocks = 0; + } error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, - resblks + dmap->br_blockcount, 0, false, &tp); + dblocks, rblocks, false, &tp); if (error == -EDQUOT || error == -ENOSPC) { quota_reserved = false; error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, @@ -1214,8 +1410,8 @@ xfs_reflink_remap_extent( /* No reflinking if the AG of the dest mapping is low on space. */ if (dmap_written) { - error = xfs_reflink_ag_has_free_space(mp, - XFS_FSB_TO_AGNO(mp, dmap->br_startblock)); + error = xfs_reflink_ag_has_free_space(mp, ip, + dmap->br_startblock); if (error) goto out_cancel; } @@ -1243,8 +1439,15 @@ xfs_reflink_remap_extent( * done. */ if (!quota_reserved && !smap_real && dmap_written) { - error = xfs_trans_reserve_quota_nblks(tp, ip, - dmap->br_blockcount, 0, false); + if (XFS_IS_REALTIME_INODE(ip)) { + dblocks = 0; + rblocks = dmap->br_blockcount; + } else { + dblocks = dmap->br_blockcount; + rblocks = 0; + } + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, + false); if (error) goto out_cancel; } @@ -1265,7 +1468,7 @@ xfs_reflink_remap_extent( * or not), unmap the extent and drop its refcount. */ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap); - xfs_refcount_decrease_extent(tp, &smap); + xfs_refcount_decrease_extent(tp, isrt, &smap); qdelta -= smap.br_blockcount; } else if (smap.br_startblock == DELAYSTARTBLOCK) { int done; @@ -1288,12 +1491,12 @@ xfs_reflink_remap_extent( * its refcount and map it into the file. 
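[Editorial note] The recovery loop above uses the new group-iterator convention: the cursor starts NULL, each call releases the previous reference and grabs the next, and early exits must drop the reference themselves. The same shape works for rtgroups via xfs_rtgroup_next. A sketch with a hypothetical per-AG callback:

	struct xfs_perag	*pag = NULL;
	int			error;

	while ((pag = xfs_perag_next(mp, pag))) {
		error = do_per_ag_work(pag);	/* hypothetical callback */
		if (error) {
			xfs_perag_rele(pag);	/* loop exit won't do it */
			return error;
		}
	}
	return 0;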
*/ if (dmap_written) { - xfs_refcount_increase_extent(tp, dmap); + xfs_refcount_increase_extent(tp, isrt, dmap); xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap); qdelta += dmap->br_blockcount; } - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta); + xfs_reflink_update_quota(tp, ip, false, qdelta); /* Update dest isize if needed. */ newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount); @@ -1413,7 +1616,7 @@ xfs_reflink_zero_posteof( return 0; trace_xfs_zero_eof(ip, isize, pos - isize); - return xfs_zero_range(ip, isize, pos - isize, NULL); + return xfs_zero_range(ip, isize, pos - isize, NULL, NULL); } /* @@ -1467,8 +1670,8 @@ xfs_reflink_remap_prep( /* Check file eligibility and prepare for block sharing. */ ret = -EINVAL; - /* Don't reflink realtime inodes */ - if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) + /* Can't reflink between data and rt volumes */ + if (XFS_IS_REALTIME_INODE(src) != XFS_IS_REALTIME_INODE(dest)) goto out_unlock; /* Don't share DAX file data with non-DAX file. */ @@ -1548,27 +1751,23 @@ xfs_reflink_inode_has_shared_extents( *has_shared = false; found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got); while (found) { - struct xfs_perag *pag; - xfs_agblock_t agbno; - xfs_extlen_t aglen; - xfs_agblock_t rbno; - xfs_extlen_t rlen; + xfs_extlen_t shared_offset, shared_len; if (isnullstartblock(got.br_startblock) || got.br_state != XFS_EXT_NORM) goto next; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock)); - agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock); - aglen = got.br_blockcount; - error = xfs_reflink_find_shared(pag, tp, agbno, aglen, - &rbno, &rlen, false); - xfs_perag_put(pag); + if (XFS_IS_REALTIME_INODE(ip)) + error = xfs_reflink_find_rtshared(mp, tp, &got, + &shared_offset, &shared_len, false); + else + error = xfs_reflink_find_shared(mp, tp, &got, + &shared_offset, &shared_len, false); if (error) return error; /* Is there still a shared block here? */ - if (rbno != NULLAGBLOCK) { + if (shared_len) { *has_shared = true; return 0; } @@ -1701,3 +1900,28 @@ out: trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; } + +/* + * Can we use reflink with this realtime extent size? Note that we don't check + * for rblocks > 0 here because this can be called as part of attaching a new + * rt section. + */ +bool +xfs_reflink_supports_rextsize( + struct xfs_mount *mp, + unsigned int rextsize) +{ + /* reflink on the realtime device requires rtgroups */ + if (!xfs_has_rtgroups(mp)) + return false; + + /* + * Reflink doesn't support rt extent size larger than a single fsblock + * because we would have to perform CoW-around for unaligned write + * requests to guarantee that we always remap entire rt extents. 
+ */ + if (rextsize != 1) + return false; + + return true; +} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 4a58e4533671..36cda724da89 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -25,7 +25,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip) return true; } -extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, +int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared); @@ -35,6 +35,8 @@ int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool convert_now); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); +int xfs_reflink_convert_cow_locked(struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb); extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, @@ -43,6 +45,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, bool cancel_real); extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); +int xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, @@ -62,4 +66,8 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, unsigned int remap_flags); +bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize); + +xfs_extlen_t xfs_reflink_max_atomic_cow(struct xfs_mount *mp); + #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 88b5580e1e19..c99700318ec2 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -23,6 +23,7 @@ #include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_trace.h" +#include "xfs_rtgroup.h" struct kmem_cache *xfs_rui_cache; struct kmem_cache *xfs_rud_cache; @@ -76,6 +77,11 @@ xfs_rui_item_size( *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents); } +unsigned int xfs_rui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_rui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given rui log item. 
We use only 1 iovec, and we point that @@ -94,7 +100,9 @@ xfs_rui_item_format( ASSERT(atomic_read(&ruip->rui_next_extent) == ruip->rui_format.rui_nextents); - ruip->rui_format.rui_type = XFS_LI_RUI; + ASSERT(lip->li_type == XFS_LI_RUI || lip->li_type == XFS_LI_RUI_RT); + + ruip->rui_format.rui_type = lip->li_type; ruip->rui_format.rui_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format, @@ -137,12 +145,15 @@ xfs_rui_item_release( STATIC struct xfs_rui_log_item * xfs_rui_init( struct xfs_mount *mp, + unsigned short item_type, uint nextents) { struct xfs_rui_log_item *ruip; ASSERT(nextents > 0); + ASSERT(item_type == XFS_LI_RUI || item_type == XFS_LI_RUI_RT); + if (nextents > XFS_RUI_MAX_FAST_EXTENTS) ruip = kzalloc(xfs_rui_log_item_sizeof(nextents), GFP_KERNEL | __GFP_NOFAIL); @@ -150,7 +161,7 @@ xfs_rui_init( ruip = kmem_cache_zalloc(xfs_rui_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); + xfs_log_item_init(mp, &ruip->rui_item, item_type, &xfs_rui_item_ops); ruip->rui_format.rui_nextents = nextents; ruip->rui_format.rui_id = (uintptr_t)(void *)ruip; atomic_set(&ruip->rui_next_extent, 0); @@ -174,6 +185,11 @@ xfs_rud_item_size( *nbytes += sizeof(struct xfs_rud_log_format); } +unsigned int xfs_rud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_rud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given rud log item. We use only 1 iovec, and we point that @@ -189,7 +205,9 @@ xfs_rud_item_format( struct xfs_rud_log_item *rudp = RUD_ITEM(lip); struct xfs_log_iovec *vecp = NULL; - rudp->rud_format.rud_type = XFS_LI_RUD; + ASSERT(lip->li_type == XFS_LI_RUD || lip->li_type == XFS_LI_RUD_RT); + + rudp->rud_format.rud_type = lip->li_type; rudp->rud_format.rud_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUD_FORMAT, &rudp->rud_format, @@ -233,6 +251,14 @@ static inline struct xfs_rmap_intent *ri_entry(const struct list_head *e) return list_entry(e, struct xfs_rmap_intent, ri_list); } +static inline bool +xfs_rui_item_isrt(const struct xfs_log_item *lip) +{ + ASSERT(lip->li_type == XFS_LI_RUI || lip->li_type == XFS_LI_RUI_RT); + + return lip->li_type == XFS_LI_RUI_RT; +} + /* Sort rmap intents by AG. */ static int xfs_rmap_update_diff_items( @@ -243,7 +269,7 @@ xfs_rmap_update_diff_items( struct xfs_rmap_intent *ra = ri_entry(a); struct xfs_rmap_intent *rb = ri_entry(b); - return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno; + return ra->ri_group->xg_gno - rb->ri_group->xg_gno; } /* Log rmap updates in the intent item. 
*/ @@ -305,18 +331,20 @@ xfs_rmap_update_log_item( } static struct xfs_log_item * -xfs_rmap_update_create_intent( +__xfs_rmap_update_create_intent( struct xfs_trans *tp, struct list_head *items, unsigned int count, - bool sort) + bool sort, + unsigned short item_type) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); + struct xfs_rui_log_item *ruip; struct xfs_rmap_intent *ri; ASSERT(count > 0); + ruip = xfs_rui_init(mp, item_type, count); if (sort) list_sort(mp, items, xfs_rmap_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -324,6 +352,23 @@ xfs_rmap_update_create_intent( return &ruip->rui_item; } +static struct xfs_log_item * +xfs_rmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_rmap_update_create_intent(tp, items, count, sort, + XFS_LI_RUI); +} + +static inline unsigned short +xfs_rud_type_from_rui(const struct xfs_rui_log_item *ruip) +{ + return xfs_rui_item_isrt(&ruip->rui_item) ? XFS_LI_RUD_RT : XFS_LI_RUD; +} + /* Get an RUD so we can process all the deferred rmap updates. */ static struct xfs_log_item * xfs_rmap_update_create_done( @@ -335,8 +380,8 @@ xfs_rmap_update_create_done( struct xfs_rud_log_item *rudp; rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, - &xfs_rud_item_ops); + xfs_log_item_init(tp->t_mountp, &rudp->rud_item, + xfs_rud_type_from_rui(ruip), &xfs_rud_item_ops); rudp->rud_ruip = ruip; rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; @@ -351,10 +396,20 @@ xfs_rmap_defer_add( { struct xfs_mount *mp = tp->t_mountp; - trace_xfs_rmap_defer(mp, ri); + /* + * Deferred rmap updates for the realtime and data sections must use + * separate transactions to finish deferred work because updates to + * realtime metadata files can lock AGFs to allocate btree blocks and + * we don't want that mixing with the AGF locks taken to finish data + * section updates. + */ + ri->ri_group = xfs_group_intent_get(mp, ri->ri_bmap.br_startblock, + ri->ri_realtime ? XG_TYPE_RTG : XG_TYPE_AG); - ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock); - xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); + trace_xfs_rmap_defer(mp, ri); + xfs_defer_add(tp, &ri->ri_list, ri->ri_realtime ? + &xfs_rtrmap_update_defer_type : + &xfs_rmap_update_defer_type); } /* Cancel a deferred rmap update. */ @@ -364,7 +419,7 @@ xfs_rmap_update_cancel_item( { struct xfs_rmap_intent *ri = ri_entry(item); - xfs_perag_intent_put(ri->ri_pag); + xfs_group_intent_put(ri->ri_group); kmem_cache_free(xfs_rmap_intent_cache, ri); } @@ -414,6 +469,7 @@ xfs_rmap_update_abort_intent( static inline bool xfs_rui_validate_map( struct xfs_mount *mp, + bool isrt, struct xfs_map_extent *map) { if (!xfs_has_rmapbt(mp)) @@ -443,6 +499,9 @@ xfs_rui_validate_map( if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) return false; + if (isrt) + return xfs_verify_rtbext(mp, map->me_startblock, map->me_len); + return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } @@ -450,6 +509,7 @@ static inline void xfs_rui_recover_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp, + bool isrt, const struct xfs_map_extent *map) { struct xfs_rmap_intent *ri; @@ -494,7 +554,9 @@ xfs_rui_recover_work( ri->ri_bmap.br_blockcount = map->me_len; ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? 
XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - ri->ri_pag = xfs_perag_intent_get(mp, map->me_startblock); + ri->ri_group = xfs_group_intent_get(mp, map->me_startblock, + isrt ? XG_TYPE_RTG : XG_TYPE_AG); + ri->ri_realtime = isrt; xfs_defer_add_item(dfp, &ri->ri_list); } @@ -513,6 +575,7 @@ xfs_rmap_recover_work( struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_trans *tp; struct xfs_mount *mp = lip->li_log->l_mp; + bool isrt = xfs_rui_item_isrt(lip); int i; int error = 0; @@ -522,7 +585,7 @@ xfs_rmap_recover_work( * just toss the RUI. */ for (i = 0; i < ruip->rui_format.rui_nextents; i++) { - if (!xfs_rui_validate_map(mp, + if (!xfs_rui_validate_map(mp, isrt, &ruip->rui_format.rui_extents[i])) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &ruip->rui_format, @@ -530,7 +593,8 @@ xfs_rmap_recover_work( return -EFSCORRUPTED; } - xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]); + xfs_rui_recover_work(mp, dfp, isrt, + &ruip->rui_format.rui_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -565,10 +629,13 @@ xfs_rmap_relog_intent( struct xfs_map_extent *map; unsigned int count; + ASSERT(intent->li_type == XFS_LI_RUI || + intent->li_type == XFS_LI_RUI_RT); + count = RUI_ITEM(intent)->rui_format.rui_nextents; map = RUI_ITEM(intent)->rui_format.rui_extents; - ruip = xfs_rui_init(tp->t_mountp, count); + ruip = xfs_rui_init(tp->t_mountp, intent->li_type, count); memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); @@ -588,6 +655,47 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { .relog_intent = xfs_rmap_relog_intent, }; +#ifdef CONFIG_XFS_RT +static struct xfs_log_item * +xfs_rtrmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_rmap_update_create_intent(tp, items, count, sort, + XFS_LI_RUI_RT); +} + +/* Clean up after calling xfs_rmap_finish_one. 
*/ +STATIC void +xfs_rtrmap_finish_one_cleanup( + struct xfs_trans *tp, + struct xfs_btree_cur *rcur, + int error) +{ + if (rcur) + xfs_btree_del_cursor(rcur, error); +} + +const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = { + .name = "rtrmap", + .max_items = XFS_RUI_MAX_FAST_EXTENTS, + .create_intent = xfs_rtrmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rtrmap_finish_one_cleanup, + .cancel_item = xfs_rmap_update_cancel_item, + .recover_work = xfs_rmap_recover_work, + .relog_intent = xfs_rmap_relog_intent, +}; +#else +const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = { + .name = "rtrmap", +}; +#endif + STATIC bool xfs_rui_item_match( struct xfs_log_item *lip, @@ -653,7 +761,7 @@ xlog_recover_rui_commit_pass2( return -EFSCORRUPTED; } - ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); + ruip = xfs_rui_init(mp, ITEM_TYPE(item), rui_formatp->rui_nextents); xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); @@ -667,6 +775,61 @@ const struct xlog_recover_item_ops xlog_rui_item_ops = { .commit_pass2 = xlog_recover_rui_commit_pass2, }; +#ifdef CONFIG_XFS_RT +STATIC int +xlog_recover_rtrui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_rui_log_item *ruip; + struct xfs_rui_log_format *rui_formatp; + size_t len; + + rui_formatp = item->ri_buf[0].i_addr; + + if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + ruip = xfs_rui_init(mp, ITEM_TYPE(item), rui_formatp->rui_nextents); + xfs_rui_copy_format(&ruip->rui_format, rui_formatp); + atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); + + xlog_recover_intent_item(log, &ruip->rui_item, lsn, + &xfs_rtrmap_update_defer_type); + return 0; +} +#else +STATIC int +xlog_recover_rtrui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; +} +#endif + +const struct xlog_recover_item_ops xlog_rtrui_item_ops = { + .item_type = XFS_LI_RUI_RT, + .commit_pass2 = xlog_recover_rtrui_commit_pass2, +}; + /* * This routine is called when an RUD format structure is found in a committed * transaction in the log. 
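[Editor's aside: the xfs_rtrmap_update_defer_type hunk above uses a common kernel idiom: when the feature is configured out, a name-only stub table keeps the symbol defined so shared code still links, while the matching recovery hook rejects such log items as corruption. A compressed sketch of the pattern, with HAVE_RT standing in for CONFIG_XFS_RT and all names illustrative:

#include <stdio.h>

struct defer_op_type {
	const char *name;
	int (*finish_item)(void *item);
};

#ifdef HAVE_RT
static int rt_finish(void *item) { return 0; }

const struct defer_op_type rt_defer_type = {
	.name = "rtrmap",
	.finish_item = rt_finish,
};
#else
const struct defer_op_type rt_defer_type = {
	.name = "rtrmap",	/* no callbacks: feature compiled out */
};
#endif

int main(void)
{
	printf("%s supported: %s\n", rt_defer_type.name,
			rt_defer_type.finish_item ? "yes" : "no");
	return 0;
}
]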
Its purpose is to cancel the corresponding RUI if it @@ -698,3 +861,33 @@ const struct xlog_recover_item_ops xlog_rud_item_ops = { .item_type = XFS_LI_RUD, .commit_pass2 = xlog_recover_rud_commit_pass2, }; + +#ifdef CONFIG_XFS_RT +STATIC int +xlog_recover_rtrud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_rud_log_format *rud_formatp; + + rud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + rud_formatp, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_RUI_RT, + rud_formatp->rud_rui_id); + return 0; +} +#else +# define xlog_recover_rtrud_commit_pass2 xlog_recover_rtrui_commit_pass2 +#endif + +const struct xlog_recover_item_ops xlog_rtrud_item_ops = { + .item_type = XFS_LI_RUD_RT, + .commit_pass2 = xlog_recover_rtrud_commit_pass2, +}; diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 40d331555675..3a99f0117f2d 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -75,4 +75,7 @@ struct xfs_rmap_intent; void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri); +unsigned int xfs_rui_log_space(unsigned int nr); +unsigned int xfs_rud_log_space(void); + #endif /* __XFS_RMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 3a2005a1e673..6484c596ecea 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -22,9 +22,18 @@ #include "xfs_rtalloc.h" #include "xfs_sb.h" #include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" #include "xfs_quota.h" #include "xfs_log_priv.h" #include "xfs_health.h" +#include "xfs_da_format.h" +#include "xfs_metafile.h" +#include "xfs_rtgroup.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_reflink.h" +#include "xfs_zone_alloc.h" /* * Return whether there are any free extents in the size range given @@ -38,14 +47,14 @@ xfs_rtany_summary( xfs_fileoff_t bbno, /* bitmap block number */ int *maxlog) /* out: max log2 extent size free */ { - struct xfs_mount *mp = args->mp; + uint8_t *rsum_cache = args->rtg->rtg_rsum_cache; int error; int log; /* loop counter, log2 of ext. size */ xfs_suminfo_t sum; /* summary data */ - /* There are no extents at levels >= m_rsum_cache[bbno]. */ - if (mp->m_rsum_cache) { - high = min(high, mp->m_rsum_cache[bbno] - 1); + /* There are no extents at levels >= rsum_cache[bbno]. */ + if (rsum_cache) { + high = min(high, rsum_cache[bbno] - 1); if (low > high) { *maxlog = -1; return 0; @@ -77,12 +86,11 @@ xfs_rtany_summary( *maxlog = -1; out: /* There were no extents at levels > log. */ - if (mp->m_rsum_cache && log + 1 < mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log + 1; + if (rsum_cache && log + 1 < rsum_cache[bbno]) + rsum_cache[bbno] = log + 1; return 0; } - /* * Copy and transform the summary file, given the old and new * parameters in the mount structures. @@ -149,7 +157,7 @@ xfs_rtallocate_range( /* * Find the next allocated block (end of free extent). 
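[Editor's aside: the rsum_cache logic above is easy to misread. For each bitmap block the cache stores one more than the highest summary level known to hold free extents, so lookups clamp their search range with it and observations can only tighten the bound downward. A toy model of that invariant, standalone C rather than the kernel code:

#include <stdio.h>
#include <string.h>

#define NBBLOCKS 4

/*
 * rsum_cache[bbno] is an exclusive upper bound: no free extents exist
 * at any level >= rsum_cache[bbno]. Initializing to the maximum value
 * (all 0xff) is trivially correct; searches then shrink it lazily.
 */
static unsigned char rsum_cache[NBBLOCKS];

/* Clamp a requested search level 'high' using the cache. */
static int clamp_high(int bbno, int high)
{
	if (high > rsum_cache[bbno] - 1)
		high = rsum_cache[bbno] - 1;
	return high;
}

/* After a search found nothing above 'log', tighten the bound. */
static void observe(int bbno, int log)
{
	if (log + 1 < rsum_cache[bbno])
		rsum_cache[bbno] = log + 1;
}

int main(void)
{
	memset(rsum_cache, 0xff, sizeof(rsum_cache));
	printf("high clamped to %d\n", clamp_high(0, 60));
	observe(0, 5);		/* nothing free above level 5 */
	printf("high clamped to %d\n", clamp_high(0, 60));
	return 0;
}
]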
*/ - error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, + error = xfs_rtfind_forw(args, end, args->rtg->rtg_extents - 1, &postblock); if (error) return error; @@ -211,14 +219,14 @@ xfs_rtalloc_align_len( */ static inline xfs_rtxlen_t xfs_rtallocate_clamp_len( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, xfs_rtxnum_t startrtx, xfs_rtxlen_t rtxlen, xfs_rtxlen_t prod) { xfs_rtxlen_t ret; - ret = min(mp->m_sb.sb_rextents, startrtx + rtxlen) - startrtx; + ret = min(rtg->rtg_extents, startrtx + rtxlen) - startrtx; return xfs_rtalloc_align_len(ret, prod); } @@ -253,10 +261,11 @@ xfs_rtallocate_extent_block( * Loop over all the extents starting in this bitmap block up to the * end of the rt volume, looking for one that's long enough. */ - end = min(mp->m_sb.sb_rextents, xfs_rbmblock_to_rtx(mp, bbno + 1)) - 1; + end = min(args->rtg->rtg_extents, xfs_rbmblock_to_rtx(mp, bbno + 1)) - + 1; for (i = xfs_rbmblock_to_rtx(mp, bbno); i <= end; i++) { /* Make sure we don't scan off the end of the rt volume. */ - scanlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod); + scanlen = xfs_rtallocate_clamp_len(args->rtg, i, maxlen, prod); if (scanlen < minlen) break; @@ -341,7 +350,6 @@ xfs_rtallocate_extent_exact( xfs_rtxlen_t prod, /* extent product factor */ xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - struct xfs_mount *mp = args->mp; xfs_rtxnum_t next; /* next rtext to try (dummy) */ xfs_rtxlen_t alloclen; /* candidate length */ xfs_rtxlen_t scanlen; /* number of free rtx to look for */ @@ -352,7 +360,7 @@ xfs_rtallocate_extent_exact( ASSERT(maxlen % prod == 0); /* Make sure we don't run off the end of the rt volume. */ - scanlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod); + scanlen = xfs_rtallocate_clamp_len(args->rtg, start, maxlen, prod); if (scanlen < minlen) return -ENOSPC; @@ -413,11 +421,10 @@ xfs_rtallocate_extent_near( ASSERT(maxlen % prod == 0); /* - * If the block number given is off the end, silently set it to - * the last block. + * If the block number given is off the end, silently set it to the last + * block. */ - if (start >= mp->m_sb.sb_rextents) - start = mp->m_sb.sb_rextents - 1; + start = min(start, args->rtg->rtg_extents - 1); /* * Try the exact allocation first. @@ -589,7 +596,7 @@ xfs_rtalloc_sumlevel( * specified. If we don't get maxlen then use prod to trim * the length, if given. The lengths are all in rtextents. */ -STATIC int +static int xfs_rtallocate_extent_size( struct xfs_rtalloc_args *args, xfs_rtxlen_t minlen, /* minimum length to allocate */ @@ -649,19 +656,31 @@ xfs_rtallocate_extent_size( return -ENOSPC; } +static void +xfs_rtunmount_rtg( + struct xfs_rtgroup *rtg) +{ + int i; + + for (i = 0; i < XFS_RTGI_MAX; i++) + xfs_rtginode_irele(&rtg->rtg_inodes[i]); + if (!xfs_has_zoned(rtg_mount(rtg))) + kvfree(rtg->rtg_rsum_cache); +} + static int xfs_alloc_rsum_cache( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, xfs_extlen_t rbmblocks) { /* * The rsum cache is initialized to the maximum value, which is * trivially an upper bound on the maximum level with any free extents. */ - mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL); - if (!mp->m_rsum_cache) + rtg->rtg_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL); + if (!rtg->rtg_rsum_cache) return -ENOMEM; - memset(mp->m_rsum_cache, -1, rbmblocks); + memset(rtg->rtg_rsum_cache, -1, rbmblocks); return 0; } @@ -698,44 +717,260 @@ out_iolock: return error; } +/* Ensure that the rtgroup metadata inode is loaded, creating it if needed.
*/ +static int +xfs_rtginode_ensure( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type) +{ + struct xfs_trans *tp; + int error; + + if (rtg->rtg_inodes[type]) + return 0; + + error = xfs_trans_alloc_empty(rtg_mount(rtg), &tp); + if (error) + return error; + error = xfs_rtginode_load(rtg, type, tp); + xfs_trans_cancel(tp); + + if (error != -ENOENT) + return 0; + return xfs_rtginode_create(rtg, type, true); +} + +static struct xfs_mount * +xfs_growfs_rt_alloc_fake_mount( + const struct xfs_mount *mp, + xfs_rfsblock_t rblocks, + xfs_agblock_t rextsize) +{ + struct xfs_mount *nmp; + + nmp = kmemdup(mp, sizeof(*mp), GFP_KERNEL); + if (!nmp) + return NULL; + xfs_mount_sb_set_rextsize(nmp, &nmp->m_sb, rextsize); + nmp->m_sb.sb_rblocks = rblocks; + nmp->m_sb.sb_rextents = xfs_blen_to_rtbxlen(nmp, nmp->m_sb.sb_rblocks); + nmp->m_sb.sb_rbmblocks = xfs_rtbitmap_blockcount(nmp); + nmp->m_sb.sb_rextslog = xfs_compute_rextslog(nmp->m_sb.sb_rextents); + if (xfs_has_rtgroups(nmp)) + nmp->m_sb.sb_rgcount = howmany_64(nmp->m_sb.sb_rextents, + nmp->m_sb.sb_rgextents); + else + nmp->m_sb.sb_rgcount = 1; + nmp->m_rsumblocks = xfs_rtsummary_blockcount(nmp, &nmp->m_rsumlevels); + + if (rblocks > 0) + nmp->m_features |= XFS_FEAT_REALTIME; + + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(nmp, &nmp->m_resv); + return nmp; +} + +/* Free all the new space and return the number of extents actually freed. */ +static int +xfs_growfs_rt_free_new( + struct xfs_rtgroup *rtg, + struct xfs_rtalloc_args *nargs, + xfs_rtbxlen_t *freed_rtx) +{ + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rgnumber_t rgno = rtg_rgno(rtg); + xfs_rtxnum_t start_rtx = 0, end_rtx; + + if (rgno < mp->m_sb.sb_rgcount) + start_rtx = xfs_rtgroup_extents(mp, rgno); + end_rtx = xfs_rtgroup_extents(nargs->mp, rgno); + + /* + * Compute the first new extent that we want to free, being careful to + * skip past a realtime superblock at the start of the realtime volume. + */ + if (xfs_has_rtsb(nargs->mp) && rgno == 0 && start_rtx == 0) + start_rtx++; + *freed_rtx = end_rtx - start_rtx; + return xfs_rtfree_range(nargs, start_rtx, *freed_rtx); +} + +static xfs_rfsblock_t +xfs_growfs_rt_nrblocks( + struct xfs_rtgroup *rtg, + xfs_rfsblock_t nrblocks, + xfs_agblock_t rextsize, + xfs_fileoff_t bmbno) +{ + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rfsblock_t step; + + step = (bmbno + 1) * mp->m_rtx_per_rbmblock * rextsize; + if (xfs_has_rtgroups(mp)) { + xfs_rfsblock_t rgblocks = mp->m_sb.sb_rgextents * rextsize; + + step = min(rgblocks, step) + rgblocks * rtg_rgno(rtg); + } + + return min(nrblocks, step); +} + +/* + * If the post-grow filesystem will have an rtsb; we're initializing the first + * rtgroup; and the filesystem didn't have a realtime section, write the rtsb + * now, and attach the rtsb buffer to the real mount. 
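[Editor's aside: xfs_growfs_rt_alloc_fake_mount above is a pattern worth noting. Rather than mutating the live mount, the grow path duplicates it and recomputes the rt geometry on the copy, so later code can diff the scratch superblock against the real one and log only the deltas. A compressed userspace sketch of the idea, with simplified fields that are not the kernel structures:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for the rt geometry kept in the superblock. */
struct sb_geom {
	uint64_t rblocks;	/* rt device size in fs blocks */
	uint64_t rextents;	/* number of rt extents */
	uint32_t rextsize;	/* fs blocks per rt extent */
};

/*
 * Mirror of the fake-mount trick: copy the current geometry, apply the
 * new size, and recompute derived fields on the copy only. The caller
 * can then log (new - old) deltas without touching the live state
 * until the transaction commits.
 */
static struct sb_geom *
fake_geom(const struct sb_geom *old, uint64_t new_rblocks,
		uint32_t new_rextsize)
{
	struct sb_geom *sb = malloc(sizeof(*sb));

	if (!sb)
		return NULL;
	memcpy(sb, old, sizeof(*sb));
	sb->rextsize = new_rextsize;
	sb->rblocks = new_rblocks;
	sb->rextents = sb->rblocks / sb->rextsize;
	return sb;
}

int main(void)
{
	struct sb_geom cur = { .rblocks = 1000, .rextents = 250,
			       .rextsize = 4 };
	struct sb_geom *nsb = fake_geom(&cur, 2000, 4);

	if (!nsb)
		return 1;
	printf("delta rextents = %lld\n",
			(long long)(nsb->rextents - cur.rextents));
	free(nsb);
	return 0;
}
]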
+ */ +static int +xfs_growfs_rt_init_rtsb( + const struct xfs_rtalloc_args *nargs, + const struct xfs_rtgroup *rtg, + const struct xfs_rtalloc_args *args) +{ + struct xfs_mount *mp = args->mp; + struct xfs_buf *rtsb_bp; + int error; + + if (!xfs_has_rtsb(nargs->mp)) + return 0; + if (rtg_rgno(rtg) > 0) + return 0; + if (mp->m_sb.sb_rblocks) + return 0; + + error = xfs_buf_get_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, 1), + &rtsb_bp); + if (error) + return error; + + rtsb_bp->b_maps[0].bm_bn = XFS_RTSB_DADDR; + rtsb_bp->b_ops = &xfs_rtsb_buf_ops; + + xfs_update_rtsb(rtsb_bp, mp->m_sb_bp); + mp->m_rtsb_bp = rtsb_bp; + error = xfs_bwrite(rtsb_bp); + xfs_buf_unlock(rtsb_bp); + if (error) + return error; + + /* Initialize the rtrmap to reflect the rtsb. */ + if (rtg_rmap(args->rtg) != NULL) + error = xfs_rtrmapbt_init_rtsb(nargs->mp, args->rtg, args->tp); + + return error; +} + +static void +xfs_growfs_rt_sb_fields( + struct xfs_trans *tp, + const struct xfs_mount *nmp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE, + nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); + if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); + if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS, + nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); + if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS, + nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); + if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); + if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT, + nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); +} + +static int +xfs_growfs_rt_zoned( + struct xfs_rtgroup *rtg, + xfs_rfsblock_t nrblocks) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_mount *nmp; + struct xfs_trans *tp; + xfs_rtbxlen_t freed_rtx; + int error; + + /* + * Calculate new sb and mount fields for this round. Also ensure the + * rtg_extents value is uptodate as the rtbitmap code relies on it. + */ + nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks, + mp->m_sb.sb_rextsize); + if (!nmp) + return -ENOMEM; + freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents; + + xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg), + nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents); + + error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp); + if (error) + goto out_free; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + + xfs_growfs_rt_sb_fields(tp, nmp); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx); + + error = xfs_trans_commit(tp); + if (error) + goto out_free; + + /* + * Ensure the mount RT feature flag is now set, and compute new + * maxlevels for rt btrees. 
+ */ + mp->m_features |= XFS_FEAT_REALTIME; + xfs_rtrmapbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); + xfs_zoned_add_available(mp, freed_rtx); +out_free: + kfree(nmp); + return error; +} + static int xfs_growfs_rt_bmblock( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, xfs_rfsblock_t nrblocks, xfs_agblock_t rextsize, xfs_fileoff_t bmbno) { - struct xfs_inode *rbmip = mp->m_rbmip; - struct xfs_inode *rsumip = mp->m_rsumip; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_inode *rbmip = rtg_bitmap(rtg); + struct xfs_inode *rsumip = rtg_summary(rtg); struct xfs_rtalloc_args args = { .mp = mp, + .rtg = rtg, }; struct xfs_rtalloc_args nargs = { + .rtg = rtg, }; struct xfs_mount *nmp; - xfs_rfsblock_t nrblocks_step; xfs_rtbxlen_t freed_rtx; int error; - - nrblocks_step = (bmbno + 1) * NBBY * mp->m_sb.sb_blocksize * rextsize; - - nmp = nargs.mp = kmemdup(mp, sizeof(*mp), GFP_KERNEL); + /* + * Calculate new sb and mount fields for this round. Also ensure the + * rtg_extents value is uptodate as the rtbitmap code relies on it. + */ + nmp = nargs.mp = xfs_growfs_rt_alloc_fake_mount(mp, + xfs_growfs_rt_nrblocks(rtg, nrblocks, rextsize, bmbno), + rextsize); if (!nmp) return -ENOMEM; - /* - * Calculate new sb and mount fields for this round. - */ - nmp->m_sb.sb_rextsize = rextsize; - xfs_mount_sb_set_rextsize(nmp, &nmp->m_sb); - nmp->m_sb.sb_rbmblocks = bmbno + 1; - nmp->m_sb.sb_rblocks = min(nrblocks, nrblocks_step); - nmp->m_sb.sb_rextents = xfs_rtb_to_rtx(nmp, nmp->m_sb.sb_rblocks); - nmp->m_sb.sb_rextslog = xfs_compute_rextslog(nmp->m_sb.sb_rextents); - nmp->m_rsumlevels = nmp->m_sb.sb_rextslog + 1; - nmp->m_rsumblocks = xfs_rtsummary_blockcount(mp, nmp->m_rsumlevels, - nmp->m_sb.sb_rbmblocks); + xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg), + nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents); /* * Recompute the growfsrt reservation from the new rsumsize, so that the @@ -748,8 +983,9 @@ xfs_growfs_rt_bmblock( goto out_free; nargs.tp = args.tp; - xfs_rtbitmap_lock(mp); - xfs_rtbitmap_trans_join(args.tp); + xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(args.tp, args.rtg, + XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_RMAP); /* * Update the bitmap inode's size ondisk and incore. We need to update @@ -780,30 +1016,19 @@ xfs_growfs_rt_bmblock( goto out_cancel; } + error = xfs_growfs_rt_init_rtsb(&nargs, rtg, &args); + if (error) + goto out_cancel; + /* * Update superblock fields. */ - if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE, - nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); - if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS, - nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); - if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS, - nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); - if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS, - nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); - if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG, - nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); + xfs_growfs_rt_sb_fields(args.tp, nmp); /* * Free the new extent. 
*/ - freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents; - error = xfs_rtfree_range(&nargs, mp->m_sb.sb_rextents, freed_rtx); + error = xfs_growfs_rt_free_new(rtg, &nargs, &freed_rtx); xfs_rtbuf_cache_relse(&nargs); if (error) goto out_cancel; @@ -818,7 +1043,6 @@ xfs_growfs_rt_bmblock( */ mp->m_rsumlevels = nmp->m_rsumlevels; mp->m_rsumblocks = nmp->m_rsumblocks; - xfs_mount_sb_set_rextsize(mp, &mp->m_sb); /* * Recompute the growfsrt reservation from the new rsumsize. @@ -830,9 +1054,12 @@ xfs_growfs_rt_bmblock( goto out_free; /* - * Ensure the mount RT feature flag is now set. + * Ensure the mount RT feature flag is now set, and compute new + * maxlevels for rt btrees. */ mp->m_features |= XFS_FEAT_REALTIME; + xfs_rtrmapbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); kfree(nmp); return 0; @@ -844,6 +1071,15 @@ out_free: return error; } +static xfs_rtxnum_t +xfs_last_rtgroup_extents( + struct xfs_mount *mp) +{ + return mp->m_sb.sb_rextents - + ((xfs_rtxnum_t)(mp->m_sb.sb_rgcount - 1) * + mp->m_sb.sb_rgextents); +} + /* * Calculate the last rbmblock currently used. * @@ -851,34 +1087,291 @@ out_free: */ static xfs_fileoff_t xfs_last_rt_bmblock( - struct xfs_mount *mp) + struct xfs_rtgroup *rtg) { - xfs_fileoff_t bmbno = mp->m_sb.sb_rbmblocks; + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rgnumber_t rgno = rtg_rgno(rtg); + xfs_fileoff_t bmbno = 0; + + ASSERT(!mp->m_sb.sb_rgcount || rgno >= mp->m_sb.sb_rgcount - 1); + + if (mp->m_sb.sb_rgcount && rgno == mp->m_sb.sb_rgcount - 1) { + xfs_rtxnum_t nrext = xfs_last_rtgroup_extents(mp); + + /* Also fill up the previous block if not entirely full. */ + bmbno = xfs_rtbitmap_blockcount_len(mp, nrext); + if (xfs_rtx_to_rbmword(mp, nrext) != 0) + bmbno--; + } - /* Skip the current block if it is exactly full. */ - if (xfs_rtx_to_rbmword(mp, mp->m_sb.sb_rextents) != 0) - bmbno--; return bmbno; } /* + * Allocate space to the bitmap and summary files, as necessary. + */ +static int +xfs_growfs_rt_alloc_blocks( + struct xfs_rtgroup *rtg, + xfs_rfsblock_t nrblocks, + xfs_agblock_t rextsize, + xfs_extlen_t *nrbmblocks) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_inode *rbmip = rtg_bitmap(rtg); + struct xfs_inode *rsumip = rtg_summary(rtg); + xfs_extlen_t orbmblocks = 0; + xfs_extlen_t orsumblocks = 0; + struct xfs_mount *nmp; + int error = 0; + + nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks, rextsize); + if (!nmp) + return -ENOMEM; + *nrbmblocks = nmp->m_sb.sb_rbmblocks; + + if (xfs_has_rtgroups(mp)) { + /* + * For file systems with the rtgroups feature, the RT bitmap and + * summary are always fully allocated, which means that we never + * need to grow the existing files. + * + * But we have to be careful to only fill the bitmap until the + * end of the actually used range. + */ + if (rtg_rgno(rtg) == nmp->m_sb.sb_rgcount - 1) + *nrbmblocks = xfs_rtbitmap_blockcount_len(nmp, + xfs_last_rtgroup_extents(nmp)); + + if (mp->m_sb.sb_rgcount && + rtg_rgno(rtg) == mp->m_sb.sb_rgcount - 1) + goto out_free; + } else { + /* + * Get the old block counts for bitmap and summary inodes. + * These can't change since other growfs callers are locked out. 
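[Editor's aside: the xfs_last_rt_bmblock computation above is the usual "resume in a partially filled block" trick: take the number of bitmap blocks covering the in-use extents, then step back one block when the extent count does not land exactly on a block boundary so the partial block gets refilled. Modeled in plain C, simplified to extent-within-block granularity (the kernel checks at bitmap-word granularity) with an assumed geometry constant:

#include <stdint.h>
#include <stdio.h>

#define RTX_PER_RBMBLOCK 32768	/* illustrative: rt extents per bitmap block */

/* Number of bitmap blocks needed to cover nrext rt extents. */
static uint64_t rbmblock_count(uint64_t nrext)
{
	return (nrext + RTX_PER_RBMBLOCK - 1) / RTX_PER_RBMBLOCK;
}

/*
 * First bitmap block growfs must (re)initialize: if the current extent
 * count does not fill its last bitmap block completely, back up one so
 * the partially used block is processed again.
 */
static uint64_t last_rt_bmblock(uint64_t nrext)
{
	uint64_t bmbno = rbmblock_count(nrext);

	if (nrext % RTX_PER_RBMBLOCK != 0)
		bmbno--;
	return bmbno;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)last_rt_bmblock(65536));
	printf("%llu\n", (unsigned long long)last_rt_bmblock(65537));
	return 0;
}
]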
+ */ + orbmblocks = XFS_B_TO_FSB(mp, rbmip->i_disk_size); + orsumblocks = XFS_B_TO_FSB(mp, rsumip->i_disk_size); + } + + error = xfs_rtfile_initialize_blocks(rtg, XFS_RTGI_BITMAP, orbmblocks, + nmp->m_sb.sb_rbmblocks, NULL); + if (error) + goto out_free; + error = xfs_rtfile_initialize_blocks(rtg, XFS_RTGI_SUMMARY, orsumblocks, + nmp->m_rsumblocks, NULL); +out_free: + kfree(nmp); + return error; +} + +static int +xfs_growfs_rtg( + struct xfs_mount *mp, + xfs_rgnumber_t rgno, + xfs_rfsblock_t nrblocks, + xfs_agblock_t rextsize) +{ + uint8_t *old_rsum_cache = NULL; + xfs_extlen_t bmblocks; + xfs_fileoff_t bmbno; + struct xfs_rtgroup *rtg; + unsigned int i; + int error; + + rtg = xfs_rtgroup_grab(mp, rgno); + if (!rtg) + return -EINVAL; + + for (i = 0; i < XFS_RTGI_MAX; i++) { + error = xfs_rtginode_ensure(rtg, i); + if (error) + goto out_rele; + } + + if (xfs_has_zoned(mp)) { + error = xfs_growfs_rt_zoned(rtg, nrblocks); + goto out_rele; + } + + error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks); + if (error) + goto out_rele; + + if (bmblocks != rtg_mount(rtg)->m_sb.sb_rbmblocks) { + old_rsum_cache = rtg->rtg_rsum_cache; + error = xfs_alloc_rsum_cache(rtg, bmblocks); + if (error) + goto out_rele; + } + + for (bmbno = xfs_last_rt_bmblock(rtg); bmbno < bmblocks; bmbno++) { + error = xfs_growfs_rt_bmblock(rtg, nrblocks, rextsize, bmbno); + if (error) + goto out_error; + } + + kvfree(old_rsum_cache); + goto out_rele; + +out_error: + /* + * Reset rtg_extents to the old value if adding more blocks failed. + */ + xfs_rtgroup_calc_geometry(mp, rtg, rtg_rgno(rtg), mp->m_sb.sb_rgcount, + mp->m_sb.sb_rextents); + if (old_rsum_cache) { + kvfree(rtg->rtg_rsum_cache); + rtg->rtg_rsum_cache = old_rsum_cache; + } +out_rele: + xfs_rtgroup_rele(rtg); + return error; +} + +int +xfs_growfs_check_rtgeom( + const struct xfs_mount *mp, + xfs_rfsblock_t dblocks, + xfs_rfsblock_t rblocks, + xfs_extlen_t rextsize) +{ + xfs_extlen_t min_logfsbs; + struct xfs_mount *nmp; + + nmp = xfs_growfs_rt_alloc_fake_mount(mp, rblocks, rextsize); + if (!nmp) + return -ENOMEM; + nmp->m_sb.sb_dblocks = dblocks; + + xfs_rtrmapbt_compute_maxlevels(nmp); + xfs_rtrefcountbt_compute_maxlevels(nmp); + xfs_trans_resv_calc(nmp, M_RES(nmp)); + + /* + * New summary size can't be more than half the size of the log. This + * prevents us from getting a log overflow, since we'll log basically + * the whole summary file at once. + */ + min_logfsbs = min_t(xfs_extlen_t, xfs_log_calc_minimum_size(nmp), + nmp->m_rsumblocks * 2); + + kfree(nmp); + + if (min_logfsbs > mp->m_sb.sb_logblocks) + return -EINVAL; + + if (xfs_has_zoned(mp)) { + uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks; + uint32_t rem; + + if (rextsize != 1) + return -EINVAL; + div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem); + if (rem) { + xfs_warn(mp, +"new RT volume size (%lld) not aligned to RT group size (%d)", + mp->m_sb.sb_rblocks, gblocks); + return -EINVAL; + } + } + + return 0; +} + +/* + * Compute the new number of rt groups and ensure that /rtgroups exists. + * + * Changing the rtgroup size is not allowed (even if the rt volume hasn't yet + * been initialized) because the userspace ABI doesn't support it. 
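[Editor's aside: the xfs_growfs_check_rtgeom hunk above encodes one hard limit worth spelling out. Because growfs can end up logging essentially the whole rt summary file in one transaction, the summary must fit in half the log; the required log size is the smaller of the generic minimum and twice the summary size, exactly the min_t() above. Redone standalone with illustrative numbers:

#include <stdint.h>
#include <stdio.h>

/*
 * Model of the log check: required = min(generic minimum log size,
 * 2 * rsumblocks), so in particular the summary may occupy at most
 * half the log. Returns 0 if the existing log is big enough.
 */
static int check_log(uint64_t logblocks, uint64_t min_log_size,
		uint64_t rsumblocks)
{
	uint64_t min_logfsbs = min_log_size;

	if (2 * rsumblocks < min_logfsbs)
		min_logfsbs = 2 * rsumblocks;
	return logblocks >= min_logfsbs ? 0 : -1;
}

int main(void)
{
	/* a 512-block summary needs at least a 1024-block log */
	printf("%d\n", check_log(2048, 4096, 512));
	printf("%d\n", check_log(1000, 4096, 512));
	return 0;
}
]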
+ */ +static int +xfs_growfs_rt_prep_groups( + struct xfs_mount *mp, + xfs_rfsblock_t rblocks, + xfs_extlen_t rextsize, + xfs_rgnumber_t *new_rgcount) +{ + int error; + + *new_rgcount = howmany_64(rblocks, mp->m_sb.sb_rgextents * rextsize); + if (*new_rgcount > XFS_MAX_RGNUMBER) + return -EINVAL; + + /* Make sure the /rtgroups dir has been created */ + if (!mp->m_rtdirip) { + struct xfs_trans *tp; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + error = xfs_rtginode_load_parent(tp); + xfs_trans_cancel(tp); + + if (error == -ENOENT) + error = xfs_rtginode_mkdir_parent(mp); + if (error) + return error; + } + + return 0; +} + +static bool +xfs_grow_last_rtg( + struct xfs_mount *mp) +{ + if (!xfs_has_rtgroups(mp)) + return true; + if (mp->m_sb.sb_rgcount == 0) + return false; + return xfs_rtgroup_extents(mp, mp->m_sb.sb_rgcount - 1) <= + mp->m_sb.sb_rgextents; +} + +/* + * Read in the last block of the RT device to make sure it is accessible. + */ +static int +xfs_rt_check_size( + struct xfs_mount *mp, + xfs_rfsblock_t last_block) +{ + xfs_daddr_t daddr = XFS_FSB_TO_BB(mp, last_block); + struct xfs_buf *bp; + int error; + + if (XFS_BB_TO_FSB(mp, daddr) != last_block) { + xfs_warn(mp, "RT device size overflow: %llu != %llu", + XFS_BB_TO_FSB(mp, daddr), last_block); + return -EFBIG; + } + + error = xfs_buf_read_uncached(mp->m_rtdev_targp, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr, + XFS_FSB_TO_BB(mp, 1), &bp, NULL); + if (error) + xfs_warn(mp, "cannot read last RT device sector (%lld)", + last_block); + else + xfs_buf_relse(bp); + return error; +} + +/* * Grow the realtime area of the filesystem. */ int xfs_growfs_rt( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_growfs_rt_t *in) /* growfs rt input struct */ + struct xfs_mount *mp, + struct xfs_growfs_rt *in) { - xfs_fileoff_t bmbno; /* bitmap block number */ - struct xfs_buf *bp; /* temporary buffer */ - int error; /* error return value */ - xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ - xfs_rtxnum_t nrextents; /* new number of realtime extents */ - xfs_extlen_t nrsumblocks; /* new number of summary blocks */ - xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */ - xfs_extlen_t rsumblocks; /* current number of rt summary blks */ - uint8_t *rsum_cache; /* old summary cache */ - xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize; + xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount; + xfs_rgnumber_t new_rgcount = 1; + xfs_rgnumber_t rgno; + xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize; + int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -889,15 +1382,9 @@ xfs_growfs_rt( if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; - /* - * Mount should fail if the rt bitmap/summary files don't load, but - * we'll check anyway. - */ - error = -EINVAL; - if (!mp->m_rbmip || !mp->m_rsumip) - goto out_unlock; /* Shrink not supported. */ + error = -EINVAL; if (in->newblocks <= mp->m_sb.sb_rblocks) goto out_unlock; /* Can only change rt extent size when adding rt volume. */ @@ -909,101 +1396,94 @@ xfs_growfs_rt( XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE) goto out_unlock; - /* Unsupported realtime features. */ + /* Check for features supported only on rtgroups filesystems. 
*/ error = -EOPNOTSUPP; - if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp)) + if (!xfs_has_rtgroups(mp)) { + if (xfs_has_rmapbt(mp)) + goto out_unlock; + if (xfs_has_quota(mp)) + goto out_unlock; + if (xfs_has_reflink(mp)) + goto out_unlock; + } else if (xfs_has_reflink(mp) && + !xfs_reflink_supports_rextsize(mp, in->extsize)) goto out_unlock; error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); if (error) goto out_unlock; - /* - * Read in the last block of the device, make sure it exists. - */ - error = xfs_buf_read_uncached(mp->m_rtdev_targp, - XFS_FSB_TO_BB(mp, in->newblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + + error = xfs_rt_check_size(mp, in->newblocks - 1); if (error) goto out_unlock; - xfs_buf_relse(bp); /* * Calculate new parameters. These are the final values to be reached. */ - nrextents = div_u64(in->newblocks, in->extsize); - if (nrextents == 0) { - error = -EINVAL; - goto out_unlock; - } - nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); - nrsumblocks = xfs_rtsummary_blockcount(mp, - xfs_compute_rextslog(nrextents) + 1, nrbmblocks); - - /* - * New summary size can't be more than half the size of - * the log. This prevents us from getting a log overflow, - * since we'll log basically the whole summary file at once. - */ - if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) { - error = -EINVAL; + error = -EINVAL; + if (in->newblocks < in->extsize) goto out_unlock; - } - /* - * Get the old block counts for bitmap and summary inodes. - * These can't change since other growfs callers are locked out. - */ - rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_disk_size); - rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_disk_size); - /* - * Allocate space to the bitmap and summary files, as necessary. - */ - error = xfs_rtfile_initialize_blocks(mp->m_rbmip, rbmblocks, - nrbmblocks, NULL); - if (error) - goto out_unlock; - error = xfs_rtfile_initialize_blocks(mp->m_rsumip, rsumblocks, - nrsumblocks, NULL); + /* Make sure the new fs size won't cause problems with the log. */ + error = xfs_growfs_check_rtgeom(mp, mp->m_sb.sb_dblocks, in->newblocks, + in->extsize); if (error) goto out_unlock; - rsum_cache = mp->m_rsum_cache; - if (nrbmblocks != mp->m_sb.sb_rbmblocks) { - error = xfs_alloc_rsum_cache(mp, nrbmblocks); + if (xfs_has_rtgroups(mp)) { + error = xfs_growfs_rt_prep_groups(mp, in->newblocks, + in->extsize, &new_rgcount); if (error) goto out_unlock; } - /* Initialize the free space bitmap one bitmap block at a time. 
*/ - for (bmbno = xfs_last_rt_bmblock(mp); bmbno < nrbmblocks; bmbno++) { - error = xfs_growfs_rt_bmblock(mp, in->newblocks, in->extsize, - bmbno); + if (xfs_grow_last_rtg(mp)) { + error = xfs_growfs_rtg(mp, old_rgcount - 1, in->newblocks, + in->extsize); if (error) - goto out_free; + goto out_unlock; } - if (old_rextsize != in->extsize) { - error = xfs_growfs_rt_fixup_extsize(mp); + for (rgno = old_rgcount; rgno < new_rgcount; rgno++) { + xfs_rtbxlen_t rextents = div_u64(in->newblocks, in->extsize); + + error = xfs_rtgroup_alloc(mp, rgno, new_rgcount, rextents); if (error) - goto out_free; + goto out_unlock; + + error = xfs_growfs_rtg(mp, rgno, in->newblocks, in->extsize); + if (error) { + struct xfs_rtgroup *rtg; + + rtg = xfs_rtgroup_grab(mp, rgno); + if (!WARN_ON_ONCE(!rtg)) { + xfs_rtunmount_rtg(rtg); + xfs_rtgroup_rele(rtg); + xfs_rtgroup_free(mp, rgno); + } + break; + } } - /* Update secondary superblocks now the physical grow has completed */ - error = xfs_update_secondary_sbs(mp); + if (!error && old_rextsize != in->extsize) + error = xfs_growfs_rt_fixup_extsize(mp); -out_free: /* - * If we had to allocate a new rsum_cache, we either need to free the - * old one (if we succeeded) or free the new one and restore the old one - * (if there was an error). + * Update secondary superblocks now the physical grow has completed. + * + * Also do this in case of an error as we might have already + * successfully updated one or more RTGs and incremented sb_rgcount. */ - if (rsum_cache != mp->m_rsum_cache) { - if (error) { - kvfree(mp->m_rsum_cache); - mp->m_rsum_cache = rsum_cache; - } else { - kvfree(rsum_cache); - } + if (!xfs_is_shutdown(mp)) { + int error2 = xfs_update_secondary_sbs(mp); + + if (!error) + error = error2; + + /* Reset the rt metadata btree space reservations. */ + error2 = xfs_metafile_resv_init(mp); + if (error2 && error2 != -ENOSPC) + error = error2; } out_unlock: @@ -1011,54 +1491,79 @@ out_unlock: return error; } -/* - * Initialize realtime fields in the mount structure. - */ -int /* error */ -xfs_rtmount_init( - struct xfs_mount *mp) /* file system mount structure */ +/* Read the realtime superblock and attach it to the mount. */ +int +xfs_rtmount_readsb( + struct xfs_mount *mp) { - struct xfs_buf *bp; /* buffer for last block of subvolume */ - struct xfs_sb *sbp; /* filesystem superblock copy in mount */ - xfs_daddr_t d; /* address of last block of subvolume */ + struct xfs_buf *bp; int error; - sbp = &mp->m_sb; - if (sbp->sb_rblocks == 0) + if (!xfs_has_rtsb(mp)) + return 0; + if (mp->m_sb.sb_rblocks == 0) return 0; if (mp->m_rtdev_targp == NULL) { xfs_warn(mp, "Filesystem has a realtime volume, use rtdev=device option"); return -ENODEV; } - mp->m_rsumlevels = sbp->sb_rextslog + 1; - mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels, - mp->m_sb.sb_rbmblocks); - mp->m_rbmip = mp->m_rsumip = NULL; - /* - * Check that the realtime section is an ok size. 
- */ - d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); - if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { - xfs_warn(mp, "realtime mount -- %llu != %llu", - (unsigned long long) XFS_BB_TO_FSB(mp, d), - (unsigned long long) mp->m_sb.sb_rblocks); - return -EFBIG; - } - error = xfs_buf_read_uncached(mp->m_rtdev_targp, - d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + + /* m_blkbb_log is not set up yet */ + error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR, + mp->m_sb.sb_blocksize >> BBSHIFT, &bp, + &xfs_rtsb_buf_ops); if (error) { - xfs_warn(mp, "realtime device size check failed"); + xfs_warn(mp, "rt sb validate failed with error %d.", error); + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; return error; } - xfs_buf_relse(bp); + + mp->m_rtsb_bp = bp; + xfs_buf_unlock(bp); return 0; } +/* Detach the realtime superblock from the mount and free it. */ +void +xfs_rtmount_freesb( + struct xfs_mount *mp) +{ + struct xfs_buf *bp = mp->m_rtsb_bp; + + if (!bp) + return; + + xfs_buf_lock(bp); + mp->m_rtsb_bp = NULL; + xfs_buf_relse(bp); +} + +/* + * Initialize realtime fields in the mount structure. + */ +int /* error */ +xfs_rtmount_init( + struct xfs_mount *mp) /* file system mount structure */ +{ + if (mp->m_sb.sb_rblocks == 0) + return 0; + if (mp->m_rtdev_targp == NULL) { + xfs_warn(mp, + "Filesystem has a realtime volume, use rtdev=device option"); + return -ENODEV; + } + + mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels); + + return xfs_rt_check_size(mp, mp->m_sb.sb_rblocks - 1); +} + static int xfs_rtalloc_count_frextent( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) @@ -1080,17 +1585,23 @@ xfs_rtalloc_reinit_frextents( uint64_t val = 0; int error; - xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); - error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, - &val); - xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); - if (error) - return error; + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + error = xfs_rtalloc_query_all(rtg, NULL, + xfs_rtalloc_count_frextent, &val); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + if (error) { + xfs_rtgroup_rele(rtg); + return error; + } + } spin_lock(&mp->m_sb_lock); mp->m_sb.sb_frextents = val; spin_unlock(&mp->m_sb_lock); - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents); return 0; } @@ -1101,17 +1612,12 @@ xfs_rtalloc_reinit_frextents( */ static inline int xfs_rtmount_iread_extents( - struct xfs_inode *ip, - unsigned int lock_class) + struct xfs_trans *tp, + struct xfs_inode *ip) { - struct xfs_trans *tp; int error; - error = xfs_trans_alloc_empty(ip->i_mount, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class); + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); if (error) @@ -1124,54 +1630,69 @@ xfs_rtmount_iread_extents( } out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class); - xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } +static int +xfs_rtmount_rtg( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_rtgroup *rtg) +{ + int error, i; + + for (i = 0; i < XFS_RTGI_MAX; i++) { + error = xfs_rtginode_load(rtg, i, tp); + if (error) + return error; + + if (rtg->rtg_inodes[i]) { + error = 
xfs_rtmount_iread_extents(tp, + rtg->rtg_inodes[i]); + if (error) + return error; + } + } + + if (xfs_has_zoned(mp)) + return 0; + return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks); +} + /* * Get the bitmap and summary inodes and the summary cache into the mount * structure at mount time. */ -int /* error */ +int xfs_rtmount_inodes( - xfs_mount_t *mp) /* file system mount structure */ + struct xfs_mount *mp) { - int error; /* error return value */ - xfs_sb_t *sbp; + struct xfs_trans *tp; + struct xfs_rtgroup *rtg = NULL; + int error; - sbp = &mp->m_sb; - error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); - if (xfs_metadata_is_sick(error)) - xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP); + error = xfs_trans_alloc_empty(mp, &tp); if (error) return error; - ASSERT(mp->m_rbmip != NULL); - - error = xfs_rtmount_iread_extents(mp->m_rbmip, XFS_ILOCK_RTBITMAP); - if (error) - goto out_rele_bitmap; - error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); - if (xfs_metadata_is_sick(error)) - xfs_rt_mark_sick(mp, XFS_SICK_RT_SUMMARY); - if (error) - goto out_rele_bitmap; - ASSERT(mp->m_rsumip != NULL); - - error = xfs_rtmount_iread_extents(mp->m_rsumip, XFS_ILOCK_RTSUM); - if (error) - goto out_rele_summary; + if (xfs_has_rtgroups(mp) && mp->m_sb.sb_rgcount > 0) { + error = xfs_rtginode_load_parent(tp); + if (error) + goto out_cancel; + } - error = xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks); - if (error) - goto out_rele_summary; - return 0; + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + error = xfs_rtmount_rtg(mp, tp, rtg); + if (error) { + xfs_rtgroup_rele(rtg); + xfs_rtunmount_inodes(mp); + break; + } + } -out_rele_summary: - xfs_irele(mp->m_rsumip); -out_rele_bitmap: - xfs_irele(mp->m_rbmip); +out_cancel: + xfs_trans_cancel(tp); return error; } @@ -1179,11 +1700,11 @@ void xfs_rtunmount_inodes( struct xfs_mount *mp) { - kvfree(mp->m_rsum_cache); - if (mp->m_rbmip) - xfs_irele(mp->m_rbmip); - if (mp->m_rsumip) - xfs_irele(mp->m_rsumip); + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) + xfs_rtunmount_rtg(rtg); + xfs_rtginode_irele(&mp->m_rtdirip); } /* @@ -1195,28 +1716,29 @@ xfs_rtunmount_inodes( */ static xfs_rtxnum_t xfs_rtpick_extent( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ + struct xfs_rtgroup *rtg, + struct xfs_trans *tp, xfs_rtxlen_t len) /* allocation length (rtextents) */ { - xfs_rtxnum_t b; /* result rtext */ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_inode *rbmip = rtg_bitmap(rtg); + xfs_rtxnum_t b = 0; /* result rtext */ int log2; /* log of sequence number */ uint64_t resid; /* residual after log removed */ uint64_t seq; /* sequence number of file creation */ struct timespec64 ts; /* timespec in inode */ - xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL); - ts = inode_get_atime(VFS_I(mp->m_rbmip)); - if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) { - mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; + ts = inode_get_atime(VFS_I(rbmip)); + if (!(rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) { + rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; seq = 0; } else { seq = ts.tv_sec; } - if ((log2 = xfs_highbit64(seq)) == -1) - b = 0; - else { + log2 = xfs_highbit64(seq); + if (log2 != -1) { resid = seq - (1ULL << log2); b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >> (log2 + 1); @@ -1226,8 +1748,8 @@ xfs_rtpick_extent( b = mp->m_sb.sb_rextents - len; } ts.tv_sec = seq + 1; - inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts); - 
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + inode_set_atime_to_ts(VFS_I(rbmip), ts); + xfs_trans_log_inode(tp, rbmip, XFS_ILOG_CORE); return b; } @@ -1260,9 +1782,118 @@ xfs_rtalloc_align_minmax( *raminlen = newminlen; } +/* Given a free extent, find any part of it that isn't busy, if possible. */ +STATIC bool +xfs_rtalloc_check_busy( + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, + xfs_rtxlen_t minlen_rtx, + xfs_rtxlen_t maxlen_rtx, + xfs_rtxlen_t len_rtx, + xfs_rtxlen_t prod, + xfs_rtxnum_t rtx, + xfs_rtxlen_t *reslen, + xfs_rtxnum_t *resrtx, + unsigned *busy_gen) +{ + struct xfs_rtgroup *rtg = args->rtg; + struct xfs_mount *mp = rtg_mount(rtg); + xfs_agblock_t rgbno = xfs_rtx_to_rgbno(rtg, rtx); + xfs_rgblock_t min_rgbno = xfs_rtx_to_rgbno(rtg, start); + xfs_extlen_t minlen = xfs_rtxlen_to_extlen(mp, minlen_rtx); + xfs_extlen_t len = xfs_rtxlen_to_extlen(mp, len_rtx); + xfs_extlen_t diff; + bool busy; + + busy = xfs_extent_busy_trim(rtg_group(rtg), minlen, + xfs_rtxlen_to_extlen(mp, maxlen_rtx), &rgbno, &len, + busy_gen); + + /* + * If we have a largish extent that happens to start before min_rgbno, + * see if we can shift it into range... + */ + if (rgbno < min_rgbno && rgbno + len > min_rgbno) { + diff = min_rgbno - rgbno; + if (len > diff) { + rgbno += diff; + len -= diff; + } + } + + if (prod > 1 && len >= minlen) { + xfs_rgblock_t aligned_rgbno = roundup(rgbno, prod); + + diff = aligned_rgbno - rgbno; + + *resrtx = xfs_rgbno_to_rtx(mp, aligned_rgbno); + *reslen = xfs_extlen_to_rtxlen(mp, + diff >= len ? 0 : len - diff); + } else { + *resrtx = xfs_rgbno_to_rtx(mp, rgbno); + *reslen = xfs_extlen_to_rtxlen(mp, len); + } + + return busy; +} + +/* + * Adjust the given free extent so that it isn't busy, or flush the log and + * wait for the space to become unbusy. Only needed for rtgroups. + */ +STATIC int +xfs_rtallocate_adjust_for_busy( + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, + xfs_rtxlen_t minlen, + xfs_rtxlen_t maxlen, + xfs_rtxlen_t *len, + xfs_rtxlen_t prod, + xfs_rtxnum_t *rtx) +{ + xfs_rtxnum_t resrtx; + xfs_rtxlen_t reslen; + unsigned busy_gen; + bool busy; + int error; + +again: + busy = xfs_rtalloc_check_busy(args, start, minlen, maxlen, *len, prod, + *rtx, &reslen, &resrtx, &busy_gen); + if (!busy) + return 0; + + if (reslen < minlen || (start != 0 && resrtx != *rtx)) { + /* + * Enough of the extent was busy that we cannot satisfy the + * allocation, or this is a near allocation and the start of + * the extent is busy. Flush the log and wait for the busy + * situation to resolve. + */ + trace_xfs_rtalloc_extent_busy(args->rtg, start, minlen, maxlen, + *len, prod, *rtx, busy_gen); + + error = xfs_extent_busy_flush(args->tp, rtg_group(args->rtg), + busy_gen, 0); + if (error) + return error; + + goto again; + } + + /* Some of the free space wasn't busy, hand that back to the caller. */ + trace_xfs_rtalloc_extent_busy_trim(args->rtg, *rtx, *len, resrtx, + reslen); + *len = reslen; + *rtx = resrtx; + + return 0; +} + static int -xfs_rtallocate( +xfs_rtallocate_rtg( struct xfs_trans *tp, + xfs_rgnumber_t rgno, xfs_rtblock_t bno_hint, xfs_rtxlen_t minlen, xfs_rtxlen_t maxlen, @@ -1282,12 +1913,33 @@ xfs_rtallocate( xfs_rtxlen_t len = 0; int error = 0; + args.rtg = xfs_rtgroup_grab(args.mp, rgno); + if (!args.rtg) + return -ENOSPC; + /* - * Lock out modifications to both the RT bitmap and summary inodes. 
+ * We need to lock out modifications to both the RT bitmap and summary + * inodes for finding free space in xfs_rtallocate_extent_{near,size} + * and join the bitmap and summary inodes for the actual allocation + * down in xfs_rtallocate_range. + * + * For RTG-enabled file systems we don't want to join the inodes to the + * transaction until we are committed to allocating from this + * RTG, so that only one inode of each type is locked at a time. + * + * But for pre-RTG file systems we already need to join the bitmap + * inode to the transaction for xfs_rtpick_extent, which bumps the + * sequence number in it, so we'll have to join the inode to the + * transaction early here. + * + * This is all a bit messy, but at least the mess is contained in + * this function. */ if (!*rtlocked) { - xfs_rtbitmap_lock(args.mp); - xfs_rtbitmap_trans_join(tp); + xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP); + if (!xfs_has_rtgroups(args.mp)) + xfs_rtgroup_trans_join(tp, args.rtg, + XFS_RTGLOCK_BITMAP); *rtlocked = true; } @@ -1295,10 +1947,10 @@ * For an allocation to an empty file at offset 0, pick an extent that * will space things out in the rt area. */ - if (bno_hint) + if (bno_hint != NULLFSBLOCK) start = xfs_rtb_to_rtx(args.mp, bno_hint); - else if (initial_user_data) - start = xfs_rtpick_extent(args.mp, tp, maxlen); + else if (!xfs_has_rtgroups(args.mp) && initial_user_data) + start = xfs_rtpick_extent(args.rtg, tp, maxlen); if (start) { error = xfs_rtallocate_extent_near(&args, start, minlen, maxlen, @@ -1318,8 +1970,20 @@ prod, &rtx); } - if (error) + if (error) { + if (xfs_has_rtgroups(args.mp)) + goto out_unlock; goto out_release; + } + + if (xfs_has_rtgroups(args.mp)) { + error = xfs_rtallocate_adjust_for_busy(&args, start, minlen, + maxlen, &len, prod, &rtx); + if (error) + goto out_unlock; + + xfs_rtgroup_trans_join(tp, args.rtg, XFS_RTGLOCK_BITMAP); + } error = xfs_rtallocate_range(&args, rtx, len); if (error) @@ -1328,12 +1992,64 @@ xfs_trans_mod_sb(tp, wasdel ? XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS, -(long)len); - *bno = xfs_rtx_to_rtb(args.mp, rtx); + *bno = xfs_rtx_to_rtb(args.rtg, rtx); *blen = xfs_rtxlen_to_extlen(args.mp, len); out_release: + xfs_rtgroup_rele(args.rtg); xfs_rtbuf_cache_relse(&args); return error; +out_unlock: + xfs_rtgroup_unlock(args.rtg, XFS_RTGLOCK_BITMAP); + *rtlocked = false; + goto out_release; +} + +int +xfs_rtallocate_rtgs( + struct xfs_trans *tp, + xfs_fsblock_t bno_hint, + xfs_rtxlen_t minlen, + xfs_rtxlen_t maxlen, + xfs_rtxlen_t prod, + bool wasdel, + bool initial_user_data, + xfs_rtblock_t *bno, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_rgnumber_t start_rgno, rgno; + int error; + + /* + * For now this just blindly iterates over the RTGs for an initial + * allocation. We could try to keep an in-memory rtg_longest member + * to avoid the locking when just looking for big enough free space, + * but for now this keeps things simple.
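[Editor's aside: a compact illustration of the rotor scan that xfs_rtallocate_rtgs implements just below: an atomic counter spreads initial allocations across groups, and each attempt walks all groups once with wraparound before giving up with ENOSPC. Standalone sketch, not kernel code:

#include <stdatomic.h>
#include <stdio.h>

#define RGCOUNT 4

static atomic_uint rotor;

/* Pretend only group 2 has space, to show the wraparound walk. */
static int try_alloc(unsigned int rgno)
{
	return rgno == 2 ? 0 : -28;	/* -ENOSPC */
}

/*
 * Rotor iteration: pick a starting group from the shared counter,
 * then probe each group in turn, wrapping at the end, until one
 * succeeds or we arrive back where we started.
 */
static int alloc_rtgs(unsigned int *out_rgno)
{
	unsigned int start = atomic_fetch_add(&rotor, 1) % RGCOUNT;
	unsigned int rgno = start;

	do {
		if (try_alloc(rgno) == 0) {
			*out_rgno = rgno;
			return 0;
		}
		if (++rgno == RGCOUNT)
			rgno = 0;
	} while (rgno != start);
	return -28;			/* -ENOSPC */
}

int main(void)
{
	unsigned int rgno;

	if (alloc_rtgs(&rgno) == 0)
		printf("allocated from group %u\n", rgno);
	return 0;
}
]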
+ */ + if (bno_hint != NULLFSBLOCK) + start_rgno = xfs_rtb_to_rgno(mp, bno_hint); + else + start_rgno = (atomic_inc_return(&mp->m_rtgrotor) - 1) % + mp->m_sb.sb_rgcount; + + rgno = start_rgno; + do { + bool rtlocked = false; + + error = xfs_rtallocate_rtg(tp, rgno, bno_hint, minlen, maxlen, + prod, wasdel, initial_user_data, &rtlocked, + bno, blen); + if (error != -ENOSPC) + return error; + ASSERT(!rtlocked); + + if (++rgno == mp->m_sb.sb_rgcount) + rgno = 0; + bno_hint = NULLFSBLOCK; + } while (rgno != start_rgno); + + return -ENOSPC; } static int @@ -1354,7 +2070,10 @@ xfs_rtallocate_align( if (*noalign) { align = mp->m_sb.sb_rextsize; } else { - align = xfs_get_extsz_hint(ap->ip); + if (ap->flags & XFS_BMAPI_COWFORK) + align = xfs_get_cowextsz_hint(ap->ip); + else + align = xfs_get_extsz_hint(ap->ip); if (!align) align = 1; if (align == mp->m_sb.sb_rextsize) @@ -1422,6 +2141,8 @@ xfs_bmap_rtalloc( ap->datatype & XFS_ALLOC_INITIAL_USER_DATA; int error; + ASSERT(!xfs_has_zoned(ap->tp->t_mountp)); + retry: error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign); if (error) @@ -1430,9 +2151,16 @@ retry: if (xfs_bmap_adjacent(ap)) bno_hint = ap->blkno; - error = xfs_rtallocate(ap->tp, bno_hint, raminlen, ralen, prod, - ap->wasdel, initial_user_data, &rtlocked, - &ap->blkno, &ap->length); + if (xfs_has_rtgroups(ap->ip->i_mount)) { + error = xfs_rtallocate_rtgs(ap->tp, bno_hint, raminlen, ralen, + prod, ap->wasdel, initial_user_data, + &ap->blkno, &ap->length); + } else { + error = xfs_rtallocate_rtg(ap->tp, 0, bno_hint, raminlen, ralen, + prod, ap->wasdel, initial_user_data, + &rtlocked, &ap->blkno, &ap->length); + } + if (error == -ENOSPC) { if (!noalign) { /* diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index a6836da9bebe..78a690b489ed 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -12,6 +12,10 @@ struct xfs_mount; struct xfs_trans; #ifdef CONFIG_XFS_RT +/* rtgroup superblock initialization */ +int xfs_rtmount_readsb(struct xfs_mount *mp); +void xfs_rtmount_freesb(struct xfs_mount *mp); + /* * Initialize realtime fields in the mount structure. */ @@ -39,9 +43,13 @@ xfs_growfs_rt( xfs_growfs_rt_t *in); /* user supplied growfs struct */ int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); +int xfs_growfs_check_rtgeom(const struct xfs_mount *mp, xfs_rfsblock_t dblocks, + xfs_rfsblock_t rblocks, xfs_agblock_t rextsize); #else # define xfs_growfs_rt(mp,in) (-ENOSYS) # define xfs_rtalloc_reinit_frextents(m) (0) +# define xfs_rtmount_readsb(mp) (0) +# define xfs_rtmount_freesb(mp) ((void)0) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ @@ -54,6 +62,19 @@ xfs_rtmount_init( } # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 
0 : (-ENOSYS)) # define xfs_rtunmount_inodes(m) + +static inline int +xfs_growfs_check_rtgeom(const struct xfs_mount *mp, + xfs_rfsblock_t dblocks, xfs_rfsblock_t rblocks, + xfs_extlen_t rextsize) +{ + return 0; +} #endif /* CONFIG_XFS_RT */ +int xfs_rtallocate_rtgs(struct xfs_trans *tp, xfs_fsblock_t bno_hint, + xfs_rtxlen_t minlen, xfs_rtxlen_t maxlen, xfs_rtxlen_t prod, + bool wasdel, bool initial_user_data, xfs_rtblock_t *bno, + xfs_extlen_t *blen); + #endif /* __XFS_RTALLOC_H__ */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index ed97d72caa66..35c7fb3ba324 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -52,7 +52,10 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "rmapbt", xfsstats_offset(xs_refcbt_2) }, { "refcntbt", xfsstats_offset(xs_rmap_mem_2) }, { "rmapbt_mem", xfsstats_offset(xs_rcbag_2) }, - { "rcbagbt", xfsstats_offset(xs_qm_dqreclaims)}, + { "rcbagbt", xfsstats_offset(xs_rtrmap_2) }, + { "rtrmapbt", xfsstats_offset(xs_rtrmap_mem_2)}, + { "rtrmapbt_mem", xfsstats_offset(xs_rtrefcbt_2) }, + { "rtrefcntbt", xfsstats_offset(xs_qm_dqreclaims)}, /* we print both series of quota information together */ { "qm", xfsstats_offset(xs_xstrat_bytes)}, }; @@ -115,10 +118,11 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) static int xqm_proc_show(struct seq_file *m, void *v) { - /* maximum; incore; ratio free to inuse; freelist */ - seq_printf(m, "%d\t%d\t%d\t%u\n", + /* maximum; incore; ratio free to inuse; freelist; rtquota */ + seq_printf(m, "%d\t%d\t%d\t%u\t%s\n", 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT), - 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1)); + 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1), + IS_ENABLED(CONFIG_XFS_RT) ? "rtquota" : "quota"); return 0; } diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index a61fb56ed2e6..15ba1abcf253 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -127,6 +127,9 @@ struct __xfsstats { uint32_t xs_refcbt_2[__XBTS_MAX]; uint32_t xs_rmap_mem_2[__XBTS_MAX]; uint32_t xs_rcbag_2[__XBTS_MAX]; + uint32_t xs_rtrmap_2[__XBTS_MAX]; + uint32_t xs_rtrmap_mem_2[__XBTS_MAX]; + uint32_t xs_rtrefcbt_2[__XBTS_MAX]; uint32_t xs_qm_dqreclaims; uint32_t xs_qm_dqreclaim_misses; uint32_t xs_qm_dquot_dups; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index fbb3a1594c0d..0bc4b5489078 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -45,6 +45,8 @@ #include "xfs_rtbitmap.h" #include "xfs_exchmaps_item.h" #include "xfs_parent.h" +#include "xfs_rtalloc.h" +#include "xfs_zone_alloc.h" #include "scrub/stats.h" #include "scrub/rcbag_btree.h" @@ -66,6 +68,9 @@ enum xfs_dax_mode { XFS_DAX_NEVER = 2, }; +/* Were quota mount options provided? Must use the upper 16 bits of qflags. 
*/ +#define XFS_QFLAGS_MNTOPTS (1U << 31) + static void xfs_mount_set_dax_mode( struct xfs_mount *mp, @@ -105,7 +110,8 @@ enum { Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones, + Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -150,6 +156,10 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("nodiscard", Opt_nodiscard), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, dax_param_enums), + fsparam_u32("max_open_zones", Opt_max_open_zones), + fsparam_flag("lifetime", Opt_lifetime), + fsparam_flag("nolifetime", Opt_nolifetime), + fsparam_string("max_atomic_write", Opt_max_atomic_write), {} }; @@ -178,6 +188,7 @@ xfs_fs_show_options( { XFS_FEAT_LARGE_IOSIZE, ",largeio" }, { XFS_FEAT_DAX_ALWAYS, ",dax=always" }, { XFS_FEAT_DAX_NEVER, ",dax=never" }, + { XFS_FEAT_NOLIFETIME, ",nolifetime" }, { 0, NULL } }; struct xfs_mount *mp = XFS_M(root->d_sb); @@ -229,6 +240,12 @@ xfs_fs_show_options( if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); + if (mp->m_max_open_zones) + seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones); + if (mp->m_awu_max_bytes) + seq_printf(m, ",max_atomic_write=%lluk", + mp->m_awu_max_bytes >> 10); + return 0; } @@ -238,7 +255,7 @@ xfs_set_inode_alloc_perag( xfs_ino_t ino, xfs_agnumber_t max_metadata) { - if (!xfs_is_inode32(pag->pag_mount)) { + if (!xfs_is_inode32(pag_mount(pag))) { set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); return false; @@ -251,7 +268,7 @@ xfs_set_inode_alloc_perag( } set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); - if (pag->pag_agno < max_metadata) + if (pag_agno(pag) < max_metadata) set_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); else clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); @@ -367,10 +384,11 @@ xfs_blkdev_get( struct file **bdev_filep) { int error = 0; + blk_mode_t mode; - *bdev_filep = bdev_file_open_by_path(name, - BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, - mp->m_super, &fs_holder_ops); + mode = sb_open_mode(mp->m_super->s_flags); + *bdev_filep = bdev_file_open_by_path(name, mode, + mp->m_super, &fs_holder_ops); if (IS_ERR(*bdev_filep)) { error = PTR_ERR(*bdev_filep); *bdev_filep = NULL; @@ -468,21 +486,29 @@ xfs_open_devices( /* * Setup xfs_mount buffer target pointers */ - error = -ENOMEM; mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file); - if (!mp->m_ddev_targp) + if (IS_ERR(mp->m_ddev_targp)) { + error = PTR_ERR(mp->m_ddev_targp); + mp->m_ddev_targp = NULL; goto out_close_rtdev; + } if (rtdev_file) { mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file); - if (!mp->m_rtdev_targp) + if (IS_ERR(mp->m_rtdev_targp)) { + error = PTR_ERR(mp->m_rtdev_targp); + mp->m_rtdev_targp = NULL; goto out_free_ddev_targ; + } } if (logdev_file && file_bdev(logdev_file) != ddev) { mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file); - if (!mp->m_logdev_targp) + if (IS_ERR(mp->m_logdev_targp)) { + error = PTR_ERR(mp->m_logdev_targp); + mp->m_logdev_targp = NULL; goto out_free_rtdev_targ; + } } else { mp->m_logdev_targp = mp->m_ddev_targp; /* Handle won't be used, drop it */ @@ -515,7 +541,7 @@ xfs_setup_devices( { int error; - error = 
xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); + error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); if (error) return error; @@ -524,13 +550,21 @@ xfs_setup_devices( if (xfs_has_sector(mp)) log_sector_size = mp->m_sb.sb_logsectsize; - error = xfs_setsize_buftarg(mp->m_logdev_targp, + error = xfs_configure_buftarg(mp->m_logdev_targp, log_sector_size); if (error) return error; } - if (mp->m_rtdev_targp) { - error = xfs_setsize_buftarg(mp->m_rtdev_targp, + + if (mp->m_sb.sb_rtstart) { + if (mp->m_rtdev_targp) { + xfs_warn(mp, + "can't use internal and external rtdev at the same time"); + return -EINVAL; + } + mp->m_rtdev_targp = mp->m_ddev_targp; + } else if (mp->m_rtname) { + error = xfs_configure_buftarg(mp->m_rtdev_targp, mp->m_sb.sb_sectsize); if (error) return error; @@ -747,13 +781,24 @@ xfs_fs_drop_inode( return generic_drop_inode(inode); } +STATIC void +xfs_fs_evict_inode( + struct inode *inode) +{ + if (IS_DAX(inode)) + dax_break_layout_final(inode); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); +} + static void xfs_mount_free( struct xfs_mount *mp) { if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_logdev_targp); - if (mp->m_rtdev_targp) + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_rtdev_targp); if (mp->m_ddev_targp) xfs_free_buftarg(mp->m_ddev_targp); @@ -810,25 +855,85 @@ xfs_fs_sync_fs( if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) { xfs_inodegc_stop(mp); xfs_blockgc_stop(mp); + xfs_zone_gc_stop(mp); } return 0; } +static xfs_extlen_t +xfs_internal_log_size( + struct xfs_mount *mp) +{ + if (!mp->m_sb.sb_logstart) + return 0; + return mp->m_sb.sb_logblocks; +} + +static void +xfs_statfs_data( + struct xfs_mount *mp, + struct kstatfs *st) +{ + int64_t fdblocks = + xfs_sum_freecounter(mp, XC_FREE_BLOCKS); + + /* make sure st->f_bfree does not underflow */ + st->f_bfree = max(0LL, + fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS)); + + /* + * sb_dblocks can change during growfs, but nothing cares about reporting + * the old or new value during growfs. + */ + st->f_blocks = mp->m_sb.sb_dblocks - xfs_internal_log_size(mp); +} + +/* + * When stat(v)fs is called on a file with the realtime bit set or a directory + * with the rtinherit bit, report freespace information for the RT device + * instead of the main data device. 
+ */ +static void +xfs_statfs_rt( + struct xfs_mount *mp, + struct kstatfs *st) +{ + st->f_bfree = xfs_rtbxlen_to_blen(mp, + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp, + mp->m_free[XC_FREE_RTEXTENTS].res_total); +} + +static void +xfs_statfs_inodes( + struct xfs_mount *mp, + struct kstatfs *st) +{ + uint64_t icount = percpu_counter_sum(&mp->m_icount); + uint64_t ifree = percpu_counter_sum(&mp->m_ifree); + uint64_t fakeinos = XFS_FSB_TO_INO(mp, st->f_bfree); + + st->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); + if (M_IGEO(mp)->maxicount) + st->f_files = min_t(typeof(st->f_files), st->f_files, + M_IGEO(mp)->maxicount); + + /* If sb_icount overshot maxicount, report actual allocation */ + st->f_files = max_t(typeof(st->f_files), st->f_files, + mp->m_sb.sb_icount); + + /* Make sure st->f_ffree does not underflow */ + st->f_ffree = max_t(int64_t, 0, st->f_files - (icount - ifree)); +} + STATIC int xfs_fs_statfs( struct dentry *dentry, - struct kstatfs *statp) + struct kstatfs *st) { struct xfs_mount *mp = XFS_M(dentry->d_sb); - xfs_sb_t *sbp = &mp->m_sb; struct xfs_inode *ip = XFS_I(d_inode(dentry)); - uint64_t fakeinos, id; - uint64_t icount; - uint64_t ifree; - uint64_t fdblocks; - xfs_extlen_t lsize; - int64_t ffree; /* * Expedite background inodegc but don't wait. We do not want to block @@ -836,80 +941,58 @@ xfs_fs_statfs( */ xfs_inodegc_push(mp); - statp->f_type = XFS_SUPER_MAGIC; - statp->f_namelen = MAXNAMELEN - 1; - - id = huge_encode_dev(mp->m_ddev_targp->bt_dev); - statp->f_fsid = u64_to_fsid(id); - - icount = percpu_counter_sum(&mp->m_icount); - ifree = percpu_counter_sum(&mp->m_ifree); - fdblocks = percpu_counter_sum(&mp->m_fdblocks); - - spin_lock(&mp->m_sb_lock); - statp->f_bsize = sbp->sb_blocksize; - lsize = sbp->sb_logstart ? 
sbp->sb_logblocks : 0; - statp->f_blocks = sbp->sb_dblocks - lsize; - spin_unlock(&mp->m_sb_lock); - - /* make sure statp->f_bfree does not underflow */ - statp->f_bfree = max_t(int64_t, 0, - fdblocks - xfs_fdblocks_unavailable(mp)); - statp->f_bavail = statp->f_bfree; - - fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); - statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); - if (M_IGEO(mp)->maxicount) - statp->f_files = min_t(typeof(statp->f_files), - statp->f_files, - M_IGEO(mp)->maxicount); - - /* If sb_icount overshot maxicount, report actual allocation */ - statp->f_files = max_t(typeof(statp->f_files), - statp->f_files, - sbp->sb_icount); + st->f_type = XFS_SUPER_MAGIC; + st->f_namelen = MAXNAMELEN - 1; + st->f_bsize = mp->m_sb.sb_blocksize; + st->f_fsid = u64_to_fsid(huge_encode_dev(mp->m_ddev_targp->bt_dev)); - /* make sure statp->f_ffree does not underflow */ - ffree = statp->f_files - (icount - ifree); - statp->f_ffree = max_t(int64_t, ffree, 0); + xfs_statfs_data(mp, st); + xfs_statfs_inodes(mp, st); + if (XFS_IS_REALTIME_MOUNT(mp) && + (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) + xfs_statfs_rt(mp, st); if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) && ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) - xfs_qm_statvfs(ip, statp); - - if (XFS_IS_REALTIME_MOUNT(mp) && - (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) { - s64 freertx; - - statp->f_blocks = sbp->sb_rblocks; - freertx = percpu_counter_sum_positive(&mp->m_frextents); - statp->f_bavail = statp->f_bfree = xfs_rtx_to_rtb(mp, freertx); - } + xfs_qm_statvfs(ip, st); + /* + * XFS does not distinguish between blocks available to privileged and + * unprivileged users. + */ + st->f_bavail = st->f_bfree; return 0; } STATIC void -xfs_save_resvblks(struct xfs_mount *mp) +xfs_save_resvblks( + struct xfs_mount *mp) { - mp->m_resblks_save = mp->m_resblks; - xfs_reserve_blocks(mp, 0); + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) { + mp->m_free[i].res_saved = mp->m_free[i].res_total; + xfs_reserve_blocks(mp, i, 0); + } } STATIC void -xfs_restore_resvblks(struct xfs_mount *mp) +xfs_restore_resvblks( + struct xfs_mount *mp) { - uint64_t resblks; + uint64_t resblks; + enum xfs_free_counter i; - if (mp->m_resblks_save) { - resblks = mp->m_resblks_save; - mp->m_resblks_save = 0; - } else - resblks = xfs_default_resblks(mp); - - xfs_reserve_blocks(mp, resblks); + for (i = 0; i < XC_FREE_NR; i++) { + if (mp->m_free[i].res_saved) { + resblks = mp->m_free[i].res_saved; + mp->m_free[i].res_saved = 0; + } else + resblks = xfs_default_resblks(mp, i); + xfs_reserve_blocks(mp, i, resblks); + } } /* @@ -946,6 +1029,7 @@ xfs_fs_freeze( if (ret && !xfs_is_readonly(mp)) { xfs_blockgc_start(mp); xfs_inodegc_start(mp); + xfs_zone_gc_start(mp); } return ret; @@ -967,6 +1051,7 @@ xfs_fs_unfreeze( * filesystem. 
*/ if (!xfs_is_readonly(mp)) { + xfs_zone_gc_start(mp); xfs_blockgc_start(mp); xfs_inodegc_start(mp); } @@ -1028,6 +1113,19 @@ xfs_finish_flags( return -EINVAL; } + if (!xfs_has_zoned(mp)) { + if (mp->m_max_open_zones) { + xfs_warn(mp, +"max_open_zones mount option only supported on zoned file systems."); + return -EINVAL; + } + if (mp->m_features & XFS_FEAT_NOLIFETIME) { + xfs_warn(mp, +"nolifetime mount option only supported on zoned file systems."); + return -EINVAL; + } + } + return 0; } @@ -1035,7 +1133,8 @@ static int xfs_init_percpu_counters( struct xfs_mount *mp) { - int error; + int error; + int i; error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); if (error) @@ -1045,30 +1144,29 @@ xfs_init_percpu_counters( if (error) goto free_icount; - error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); - if (error) - goto free_ifree; - error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL); if (error) - goto free_fdblocks; + goto free_ifree; error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL); if (error) goto free_delalloc; - error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); - if (error) - goto free_delalloc_rt; + for (i = 0; i < XC_FREE_NR; i++) { + error = percpu_counter_init(&mp->m_free[i].count, 0, + GFP_KERNEL); + if (error) + goto free_freecounters; + } return 0; -free_delalloc_rt: +free_freecounters: + while (--i >= 0) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_delalloc_rtextents); free_delalloc: percpu_counter_destroy(&mp->m_delalloc_blks); -free_fdblocks: - percpu_counter_destroy(&mp->m_fdblocks); free_ifree: percpu_counter_destroy(&mp->m_ifree); free_icount: @@ -1082,24 +1180,28 @@ xfs_reinit_percpu_counters( { percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); - percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks); + if (!xfs_has_zoned(mp)) + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + mp->m_sb.sb_frextents); } static void xfs_destroy_percpu_counters( struct xfs_mount *mp) { + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_icount); percpu_counter_destroy(&mp->m_ifree); - percpu_counter_destroy(&mp->m_fdblocks); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_rtextents) == 0); percpu_counter_destroy(&mp->m_delalloc_rtextents); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); - percpu_counter_destroy(&mp->m_frextents); } static int @@ -1144,6 +1246,7 @@ xfs_fs_put_super( xfs_filestream_unmount(mp); xfs_unmountfs(mp); + xfs_rtmount_freesb(mp); xfs_freesb(mp); xchk_mount_stats_free(mp); free_percpu(mp->m_stats.xs_stats); @@ -1179,11 +1282,24 @@ xfs_fs_shutdown( xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED); } +static int +xfs_fs_show_stats( + struct seq_file *m, + struct dentry *root) +{ + struct xfs_mount *mp = XFS_M(root->d_sb); + + if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT)) + xfs_zoned_show_stats(m, mp); + return 0; +} + static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, .dirty_inode = xfs_fs_dirty_inode, .drop_inode = xfs_fs_drop_inode, + .evict_inode = xfs_fs_evict_inode, .put_super = xfs_fs_put_super, .sync_fs = 
xfs_fs_sync_fs, .freeze_fs = xfs_fs_freeze, @@ -1193,6 +1309,7 @@ static const struct super_operations xfs_super_operations = { .nr_cached_objects = xfs_fs_nr_cached_objects, .free_cached_objects = xfs_fs_free_cached_objects, .shutdown = xfs_fs_shutdown, + .show_stats = xfs_fs_show_stats, }; static int @@ -1230,6 +1347,42 @@ suffix_kstrtoint( return ret; } +static int +suffix_kstrtoull( + const char *s, + unsigned int base, + unsigned long long *res) +{ + int last, shift_left_factor = 0; + unsigned long long _res; + char *value; + int ret = 0; + + value = kstrdup(s, GFP_KERNEL); + if (!value) + return -ENOMEM; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + if (kstrtoull(value, base, &_res)) + ret = -EINVAL; + kfree(value); + *res = _res << shift_left_factor; + return ret; +} + static inline void xfs_fs_warn_deprecated( struct fs_context *fc, @@ -1261,6 +1414,8 @@ xfs_fs_parse_param( int size = 0; int opt; + BUILD_BUG_ON(XFS_QFLAGS_MNTOPTS & XFS_MOUNT_QUOTA_ALL); + opt = fs_parse(fc, xfs_fs_parameters, param, &result); if (opt < 0) return opt; @@ -1338,32 +1493,39 @@ xfs_fs_parse_param( case Opt_noquota: parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_quota: case Opt_uquota: case Opt_usrquota: parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ENFD); + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_qnoenforce: case Opt_uqnoenforce: parsing_mp->m_qflags |= XFS_UQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_UQUOTA_ENFD; + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_pquota: case Opt_prjquota: parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ENFD); + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_pqnoenforce: parsing_mp->m_qflags |= XFS_PQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_PQUOTA_ENFD; + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_gquota: case Opt_grpquota: parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ENFD); + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_gqnoenforce: parsing_mp->m_qflags |= XFS_GQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_GQUOTA_ENFD; + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_discard: parsing_mp->m_features |= XFS_FEAT_DISCARD; @@ -1396,6 +1558,23 @@ xfs_fs_parse_param( xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); parsing_mp->m_features |= XFS_FEAT_NOATTR2; return 0; + case Opt_max_open_zones: + parsing_mp->m_max_open_zones = result.uint_32; + return 0; + case Opt_lifetime: + parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME; + return 0; + case Opt_nolifetime: + parsing_mp->m_features |= XFS_FEAT_NOLIFETIME; + return 0; + case Opt_max_atomic_write: + if (suffix_kstrtoull(param->string, 10, + &parsing_mp->m_awu_max_bytes)) { + xfs_warn(parsing_mp, + "max atomic write size must be positive integer"); + return -EINVAL; + } + return 0; default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; @@ -1430,7 +1609,8 @@ xfs_fs_validate_params( return -EINVAL; } - if (!IS_ENABLED(CONFIG_XFS_QUOTA) && mp->m_qflags != 0) { + if (!IS_ENABLED(CONFIG_XFS_QUOTA) && + (mp->m_qflags & ~XFS_QFLAGS_MNTOPTS)) { xfs_warn(mp, "quota support not 
available in this kernel."); return -EINVAL; } @@ -1620,8 +1800,12 @@ xfs_fs_fill_super( #endif } - /* Filesystem claims it needs repair, so refuse the mount. */ - if (xfs_has_needsrepair(mp)) { + /* + * Filesystem claims it needs repair, so refuse the mount unless + * norecovery is also specified, in which case the filesystem can + * be mounted with no risk of further damage. + */ + if (xfs_has_needsrepair(mp) && !xfs_has_norecovery(mp)) { xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair."); error = -EFSCORRUPTED; goto out_free_sb; @@ -1657,9 +1841,7 @@ xfs_fs_fill_super( goto out_free_sb; } - xfs_warn(mp, -"EXPERIMENTAL: V5 Filesystem with Large Block Size (%d bytes) enabled.", - mp->m_sb.sb_blocksize); + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_LBS); } /* Ensure this filesystem fits in the page cache limits */ @@ -1691,10 +1873,14 @@ xfs_fs_fill_super( goto out_free_sb; } - error = xfs_filestream_mount(mp); + error = xfs_rtmount_readsb(mp); if (error) goto out_free_sb; + error = xfs_filestream_mount(mp); + if (error) + goto out_free_rtsb; + /* * we must configure the block size in the superblock before we run the * full mount process as the mount process can lookup and cache inodes. @@ -1713,7 +1899,7 @@ xfs_fs_fill_super( sb->s_time_max = XFS_LEGACY_TIME_MAX; } trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max); - sb->s_iflags |= SB_I_CGROUPWB; + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; set_posix_acl_flag(sb); @@ -1733,10 +1919,31 @@ xfs_fs_fill_super( mp->m_features &= ~XFS_FEAT_DISCARD; } + if (xfs_has_zoned(mp)) { + if (!xfs_has_metadir(mp)) { + xfs_alert(mp, + "metadir feature required for zoned realtime devices."); + error = -EINVAL; + goto out_filestream_unmount; + } + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED); + } else if (xfs_has_metadir(mp)) { + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); + } + if (xfs_has_reflink(mp)) { - if (mp->m_sb.sb_rblocks) { + if (xfs_has_realtime(mp) && + !xfs_reflink_supports_rextsize(mp, mp->m_sb.sb_rextsize)) { + xfs_alert(mp, + "reflink not compatible with realtime extent size %u!", + mp->m_sb.sb_rextsize); + error = -EINVAL; + goto out_filestream_unmount; + } + + if (xfs_has_zoned(mp)) { xfs_alert(mp, - "reflink not compatible with realtime device!"); + "reflink not compatible with zoned RT device!"); error = -EINVAL; goto out_filestream_unmount; } @@ -1747,20 +1954,13 @@ xfs_fs_fill_super( } } - if (xfs_has_rmapbt(mp) && mp->m_sb.sb_rblocks) { - xfs_alert(mp, - "reverse mapping btree not compatible with realtime device!"); - error = -EINVAL; - goto out_filestream_unmount; - } - - if (xfs_has_exchange_range(mp)) - xfs_warn(mp, - "EXPERIMENTAL exchange-range feature enabled. Use at your own risk!"); - - if (xfs_has_parent(mp)) - xfs_warn(mp, - "EXPERIMENTAL parent pointer feature enabled. Use at your own risk!"); + /* + * If no quota mount options were provided, maybe we'll try to pick + * up the quota accounting and enforcement flags from the ondisk sb. 
+	 */
+	if (!(mp->m_qflags & XFS_QFLAGS_MNTOPTS))
+		xfs_set_resuming_quotaon(mp);
+	mp->m_qflags &= ~XFS_QFLAGS_MNTOPTS;

	error = xfs_mountfs(mp);
	if (error)
@@ -1781,6 +1981,8 @@

 out_filestream_unmount:
	xfs_filestream_unmount(mp);
+ out_free_rtsb:
+	xfs_rtmount_freesb(mp);
 out_free_sb:
	xfs_freesb(mp);
 out_free_scrub_stats:
@@ -1800,7 +2002,7 @@
 out_unmount:
	xfs_filestream_unmount(mp);
	xfs_unmountfs(mp);
-	goto out_free_sb;
+	goto out_free_rtsb;
}

static int
@@ -1817,6 +2019,20 @@ xfs_remount_rw(
	struct xfs_sb		*sbp = &mp->m_sb;
	int error;

+	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp &&
+	    bdev_read_only(mp->m_logdev_targp->bt_bdev)) {
+		xfs_warn(mp,
+			"ro->rw transition prohibited by read-only logdev");
+		return -EACCES;
+	}
+
+	if (mp->m_rtdev_targp &&
+	    bdev_read_only(mp->m_rtdev_targp->bt_bdev)) {
+		xfs_warn(mp,
+			"ro->rw transition prohibited by read-only rtdev");
+		return -EACCES;
+	}
+
	if (xfs_has_norecovery(mp)) {
		xfs_warn(mp,
			"ro->rw transition prohibited on norecovery mount");
@@ -1863,6 +2079,9 @@ xfs_remount_rw(
	/* Re-enable the background inode inactivation worker. */
	xfs_inodegc_start(mp);

+	/* Restart zone reclaim */
+	xfs_zone_gc_start(mp);
+
	return 0;
}

@@ -1907,6 +2126,9 @@ xfs_remount_ro(
	 */
	xfs_inodegc_stop(mp);

+	/* Stop zone reclaim */
+	xfs_zone_gc_stop(mp);
+
	/* Free the per-AG metadata reservation pool. */
	xfs_fs_unreserve_ag_blocks(mp);

@@ -1946,6 +2168,8 @@ xfs_fs_reconfigure(
	int			flags = fc->sb_flags;
	int			error;

+	new_mp->m_qflags &= ~XFS_QFLAGS_MNTOPTS;
+
	/* version 5 superblocks always support version counters. */
	if (xfs_has_crc(mp))
		fc->sb_flags |= SB_I_VERSION;
@@ -1954,6 +2178,29 @@ xfs_fs_reconfigure(
	if (error)
		return error;

+	/* attr2 -> noattr2 */
+	if (xfs_has_noattr2(new_mp)) {
+		if (xfs_has_crc(mp)) {
+			xfs_warn(mp,
+				"attr2 is always enabled for a V5 filesystem - can't be changed.");
+			return -EINVAL;
+		}
+		mp->m_features &= ~XFS_FEAT_ATTR2;
+		mp->m_features |= XFS_FEAT_NOATTR2;
+	} else if (xfs_has_attr2(new_mp)) {
+		/* noattr2 -> attr2 */
+		mp->m_features &= ~XFS_FEAT_NOATTR2;
+		mp->m_features |= XFS_FEAT_ATTR2;
+	}
+
+	/* Validate new max_atomic_write option before making other changes */
+	if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
+		error = xfs_set_max_atomic_write_opt(mp,
+				new_mp->m_awu_max_bytes);
+		if (error)
+			return error;
+	}
+
	/* inode32 -> inode64 */
	if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
		mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
@@ -1966,6 +2213,17 @@ xfs_fs_reconfigure(
		mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount);
	}

+	/*
+	 * Now that mp has been modified according to the remount options, we
+	 * do a final option validation with xfs_finish_flags() just like it is
+	 * done during mount. We cannot use xfs_finish_flags() on new_mp as it
+	 * contains only the user given options.
+	 */
+	error = xfs_finish_flags(mp);
+	if (error)
+		return error;
+
	/* ro -> rw */
	if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) {
		error = xfs_remount_rw(mp);
@@ -2011,18 +2269,22 @@ static const struct fs_context_operations xfs_context_ops = {
 * mount option parsing having already been performed as this can be called from
 * fsopen() before any parameters have been set.
 */
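Editorial aside: the XFS_QFLAGS_MNTOPTS handling in the hunks above reserves a high bit of m_qflags purely as an in-memory "quota options were given on the command line" marker. The parser sets it alongside every real quota flag, fill_super falls back to the on-disk quota state only when it is clear, and both mount and reconfigure strip it before the flag word is used for anything else. Below is a minimal user-space sketch of that sentinel pattern; only XFS_QFLAGS_MNTOPTS itself comes from the patch, while the QF_* names and both helpers are invented for illustration and are not kernel API.

/* Illustrative sketch only -- not part of the patch above. */
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stand-ins for two of the real quota accounting flags. */
#define QF_USER_ACCT	(1U << 0)
#define QF_GROUP_ACCT	(1U << 1)
/* Parser-only sentinel in the upper bits, mirroring XFS_QFLAGS_MNTOPTS. */
#define QF_MNTOPTS	(1U << 31)

static uint32_t
parse_quota_opts(
	bool		usrquota,
	bool		grpquota)
{
	uint32_t	qflags = 0;

	/* Every explicit quota mount option also raises the sentinel. */
	if (usrquota)
		qflags |= QF_USER_ACCT | QF_MNTOPTS;
	if (grpquota)
		qflags |= QF_GROUP_ACCT | QF_MNTOPTS;
	return qflags;
}

static uint32_t
finish_mount_qflags(
	uint32_t	qflags,
	uint32_t	ondisk_qflags)
{
	/* No quota options given: resume whatever the superblock recorded. */
	if (!(qflags & QF_MNTOPTS))
		qflags = ondisk_qflags;
	/* The sentinel is in-memory only and must never be written back. */
	return qflags & ~QF_MNTOPTS;
}

The same shape explains why xfs_fs_reconfigure() clears the bit on new_mp first: a remount must never leak the parser-only marker into the working flag word.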
*/ -static int xfs_init_fs_context( +static int +xfs_init_fs_context( struct fs_context *fc) { struct xfs_mount *mp; + int i; mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL); if (!mp) return -ENOMEM; spin_lock_init(&mp->m_sb_lock); - xa_init(&mp->m_perags); + for (i = 0; i < XG_TYPE_MAX; i++) + xa_init(&mp->m_groups[i].xa); mutex_init(&mp->m_growlock); + mutex_init(&mp->m_metafile_resv_lock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_kobj.kobject.kset = xfs_kset; @@ -2063,7 +2325,8 @@ static struct file_system_type xfs_fs_type = { .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, .kill_sb = xfs_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME | + FS_LBS, }; MODULE_ALIAS_FS("xfs"); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 302e6e5d6c7e..c0e85c1e42f2 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -92,7 +92,6 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, extern const struct export_operations xfs_export_operations; extern const struct quotactl_ops xfs_quotactl_operations; -extern const struct dax_holder_operations xfs_dax_holder_operations; extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index c84df23b494d..751dc74a3067 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -66,7 +66,7 @@ xfs_deprecated_dointvec_minmax( return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); } -static struct ctl_table xfs_table[] = { +static const struct ctl_table xfs_table[] = { { .procname = "irix_sgid_inherit", .data = &xfs_params.sgid_inherit.val, diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 276696a07040..51646f066c4f 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -29,8 +29,6 @@ typedef struct xfs_param { xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */ xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */ xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */ - xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */ - xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */ xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. 
*/ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 60cb5318fdae..7a5c5ef2db92 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -13,6 +13,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_mount.h" +#include "xfs_zones.h" struct xfs_sysfs_attr { struct attribute attr; @@ -69,7 +70,7 @@ static struct attribute *xfs_mp_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_mp); -const struct kobj_type xfs_mp_ktype = { +static const struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_mp_groups, @@ -568,8 +569,8 @@ retry_timeout_seconds_store( if (val == -1) cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; else { - cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); - ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX); + cfg->retry_timeout = secs_to_jiffies(val); + ASSERT(secs_to_jiffies(val) < LONG_MAX); } return count; } @@ -686,8 +687,8 @@ xfs_error_sysfs_init_class( if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER) cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; else - cfg->retry_timeout = msecs_to_jiffies( - init[i].retry_timeout * MSEC_PER_SEC); + cfg->retry_timeout = + secs_to_jiffies(init[i].retry_timeout); } return 0; @@ -701,45 +702,135 @@ out_error: return error; } +static inline struct xfs_mount *zoned_to_mp(struct kobject *kobj) +{ + return container_of(to_kobj(kobj), struct xfs_mount, m_zoned_kobj); +} + +static ssize_t +max_open_zones_show( + struct kobject *kobj, + char *buf) +{ + /* only report the open zones available for user data */ + return sysfs_emit(buf, "%u\n", + zoned_to_mp(kobj)->m_max_open_zones - XFS_OPEN_GC_ZONES); +} +XFS_SYSFS_ATTR_RO(max_open_zones); + +static ssize_t +zonegc_low_space_store( + struct kobject *kobj, + const char *buf, + size_t count) +{ + int ret; + unsigned int val; + + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + + if (val > 100) + return -EINVAL; + + zoned_to_mp(kobj)->m_zonegc_low_space = val; + + return count; +} + +static ssize_t +zonegc_low_space_show( + struct kobject *kobj, + char *buf) +{ + return sysfs_emit(buf, "%u\n", + zoned_to_mp(kobj)->m_zonegc_low_space); +} +XFS_SYSFS_ATTR_RW(zonegc_low_space); + +static struct attribute *xfs_zoned_attrs[] = { + ATTR_LIST(max_open_zones), + ATTR_LIST(zonegc_low_space), + NULL, +}; +ATTRIBUTE_GROUPS(xfs_zoned); + +static const struct kobj_type xfs_zoned_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_zoned_groups, +}; + int -xfs_error_sysfs_init( +xfs_mount_sysfs_init( struct xfs_mount *mp) { int error; + super_set_sysfs_name_id(mp->m_super); + + /* .../xfs/<dev>/ */ + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, + NULL, mp->m_super->s_id); + if (error) + return error; + + /* .../xfs/<dev>/stats/ */ + error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, + &mp->m_kobj, "stats"); + if (error) + goto out_remove_fsdir; + /* .../xfs/<dev>/error/ */ error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype, &mp->m_kobj, "error"); if (error) - return error; + goto out_remove_stats_dir; + /* .../xfs/<dev>/error/fail_at_unmount */ error = sysfs_create_file(&mp->m_error_kobj.kobject, ATTR_LIST(fail_at_unmount)); if (error) - goto out_error; + goto out_remove_error_dir; /* .../xfs/<dev>/error/metadata/ */ error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA, "metadata", &mp->m_error_meta_kobj, xfs_error_meta_init); if (error) - goto out_error; + goto out_remove_error_dir; + + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) { + /* 
.../xfs/<dev>/zoned/ */ + error = xfs_sysfs_init(&mp->m_zoned_kobj, &xfs_zoned_ktype, + &mp->m_kobj, "zoned"); + if (error) + goto out_remove_error_dir; + } return 0; -out_error: +out_remove_error_dir: xfs_sysfs_del(&mp->m_error_kobj); +out_remove_stats_dir: + xfs_sysfs_del(&mp->m_stats.xs_kobj); +out_remove_fsdir: + xfs_sysfs_del(&mp->m_kobj); return error; } void -xfs_error_sysfs_del( +xfs_mount_sysfs_del( struct xfs_mount *mp) { struct xfs_error_cfg *cfg; int i, j; + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) + xfs_sysfs_del(&mp->m_zoned_kobj); + for (i = 0; i < XFS_ERR_CLASS_MAX; i++) { for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) { cfg = &mp->m_error_cfg[i][j]; @@ -749,6 +840,8 @@ xfs_error_sysfs_del( } xfs_sysfs_del(&mp->m_error_meta_kobj); xfs_sysfs_del(&mp->m_error_kobj); + xfs_sysfs_del(&mp->m_stats.xs_kobj); + xfs_sysfs_del(&mp->m_kobj); } struct xfs_error_cfg * diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 148893ebfdef..1622fe80ad3e 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -7,7 +7,6 @@ #ifndef __XFS_SYSFS_H__ #define __XFS_SYSFS_H__ -extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ extern const struct kobj_type xfs_dbg_ktype; /* debug */ extern const struct kobj_type xfs_log_ktype; /* xlog */ extern const struct kobj_type xfs_stats_ktype; /* stats */ @@ -53,7 +52,7 @@ xfs_sysfs_del( wait_for_completion(&kobj->complete); } -int xfs_error_sysfs_init(struct xfs_mount *mp); -void xfs_error_sysfs_del(struct xfs_mount *mp); +int xfs_mount_sysfs_init(struct xfs_mount *mp); +void xfs_mount_sysfs_del(struct xfs_mount *mp); #endif /* __XFS_SYSFS_H__ */ diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 2af9f274e872..a60556dbd172 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -11,6 +11,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_group.h" #include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_inode.h" @@ -32,6 +33,7 @@ #include "xfs_fsmap.h" #include "xfs_btree_staging.h" #include "xfs_icache.h" +#include "xfs_iunlink_item.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_error.h" @@ -44,6 +46,11 @@ #include "xfs_parent.h" #include "xfs_rmap.h" #include "xfs_refcount.h" +#include "xfs_metafile.h" +#include "xfs_metadir.h" +#include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ee9f0b1f548d..01d284a1c759 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -14,11 +14,15 @@ * ino: filesystem inode number * * agbno: per-AG block number in fs blocks + * rgbno: per-rtgroup block number in fs blocks * startblock: physical block number for file mappings. This is either a * segmented fsblock for data device mappings, or a rfsblock * for realtime device mappings * fsbcount: number of blocks in an extent, in fs blocks * + * gbno: generic allocation group block number. This is an agbno for + * space in a per-AG or a rgbno for space in a realtime group. 
+ * * daddr: physical block number in 512b blocks * bbcount: number of blocks in a physical extent, in 512b blocks * @@ -72,8 +76,11 @@ struct xfs_btree_cur; struct xfs_defer_op_type; struct xfs_refcount_irec; struct xfs_fsmap; +struct xfs_fsmap_irec; +struct xfs_group; struct xfs_rmap_irec; struct xfs_icreate_log; +struct xfs_iunlink_item; struct xfs_owner_info; struct xfs_trans_res; struct xfs_inobt_rec_incore; @@ -93,6 +100,9 @@ struct xfs_attrlist_cursor_kern; struct xfs_extent_free_item; struct xfs_rmap_intent; struct xfs_refcount_intent; +struct xfs_metadir_update; +struct xfs_rtgroup; +struct xfs_open_zone; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -160,6 +170,99 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); +TRACE_EVENT(xfs_calc_atomic_write_unit_max, + TP_PROTO(struct xfs_mount *mp, unsigned int max_write, + unsigned int max_ioend, unsigned int max_agsize, + unsigned int max_rgsize), + TP_ARGS(mp, max_write, max_ioend, max_agsize, max_rgsize), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, max_write) + __field(unsigned int, max_ioend) + __field(unsigned int, max_agsize) + __field(unsigned int, max_rgsize) + __field(unsigned int, data_awu_max) + __field(unsigned int, rt_awu_max) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->max_write = max_write; + __entry->max_ioend = max_ioend; + __entry->max_agsize = max_agsize; + __entry->max_rgsize = max_rgsize; + __entry->data_awu_max = mp->m_groups[XG_TYPE_AG].awu_max; + __entry->rt_awu_max = mp->m_groups[XG_TYPE_RTG].awu_max; + ), + TP_printk("dev %d:%d max_write %u max_ioend %u max_agsize %u max_rgsize %u data_awu_max %u rt_awu_max %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->max_write, + __entry->max_ioend, + __entry->max_agsize, + __entry->max_rgsize, + __entry->data_awu_max, + __entry->rt_awu_max) +); + +TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks, + TP_PROTO(struct xfs_mount *mp, unsigned int per_intent, + unsigned int step_size, unsigned int logres, + unsigned int blockcount), + TP_ARGS(mp, per_intent, step_size, logres, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, per_intent) + __field(unsigned int, step_size) + __field(unsigned int, logres) + __field(unsigned int, blockcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->per_intent = per_intent; + __entry->step_size = step_size; + __entry->logres = logres; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d per_intent %u step_size %u logres %u blockcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->per_intent, + __entry->step_size, + __entry->logres, + __entry->blockcount) +); + +TRACE_EVENT(xfs_calc_max_atomic_write_log_geometry, + TP_PROTO(struct xfs_mount *mp, unsigned int per_intent, + unsigned int step_size, unsigned int blockcount, + unsigned int min_logblocks, unsigned int logres), + TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, per_intent) + __field(unsigned int, step_size) + __field(unsigned int, blockcount) + __field(unsigned int, min_logblocks) + __field(unsigned int, cur_logblocks) + __field(unsigned int, logres) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->per_intent = per_intent; + __entry->step_size = step_size; + __entry->blockcount = blockcount; + __entry->min_logblocks = min_logblocks; + 
__entry->cur_logblocks = mp->m_sb.sb_logblocks; + __entry->logres = logres; + ), + TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->per_intent, + __entry->step_size, + __entry->blockcount, + __entry->min_logblocks, + __entry->cur_logblocks, + __entry->logres) +); + TRACE_EVENT(xlog_intent_recovery_failed, TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops, int error), @@ -181,7 +284,7 @@ TRACE_EVENT(xlog_intent_recovery_failed, ); DECLARE_EVENT_CLASS(xfs_perag_class, - TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip), + TP_PROTO(const struct xfs_perag *pag, unsigned long caller_ip), TP_ARGS(pag, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) @@ -191,10 +294,11 @@ DECLARE_EVENT_CLASS(xfs_perag_class, __field(unsigned long, caller_ip) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; - __entry->refcount = atomic_read(&pag->pag_ref); - __entry->active_refcount = atomic_read(&pag->pag_active_ref); + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->refcount = atomic_read(&pag->pag_group.xg_ref); + __entry->active_refcount = + atomic_read(&pag->pag_group.xg_active_ref); __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d agno 0x%x passive refs %d active refs %d caller %pS", @@ -207,18 +311,200 @@ DECLARE_EVENT_CLASS(xfs_perag_class, #define DEFINE_PERAG_REF_EVENT(name) \ DEFINE_EVENT(xfs_perag_class, name, \ - TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip), \ + TP_PROTO(const struct xfs_perag *pag, unsigned long caller_ip), \ TP_ARGS(pag, caller_ip)) -DEFINE_PERAG_REF_EVENT(xfs_perag_get); -DEFINE_PERAG_REF_EVENT(xfs_perag_hold); -DEFINE_PERAG_REF_EVENT(xfs_perag_put); -DEFINE_PERAG_REF_EVENT(xfs_perag_grab); -DEFINE_PERAG_REF_EVENT(xfs_perag_grab_next_tag); -DEFINE_PERAG_REF_EVENT(xfs_perag_rele); DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_reclaim_inodes_count); +TRACE_DEFINE_ENUM(XG_TYPE_AG); +TRACE_DEFINE_ENUM(XG_TYPE_RTG); + +DECLARE_EVENT_CLASS(xfs_group_class, + TP_PROTO(struct xfs_group *xg, unsigned long caller_ip), + TP_ARGS(xg, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(enum xfs_group_type, type) + __field(xfs_agnumber_t, agno) + __field(int, refcount) + __field(int, active_refcount) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; + __entry->refcount = atomic_read(&xg->xg_ref); + __entry->active_refcount = atomic_read(&xg->xg_active_ref); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d %sno 0x%x passive refs %d active refs %d caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), + __entry->agno, + __entry->refcount, + __entry->active_refcount, + (char *)__entry->caller_ip) +); + +#define DEFINE_GROUP_REF_EVENT(name) \ +DEFINE_EVENT(xfs_group_class, name, \ + TP_PROTO(struct xfs_group *xg, unsigned long caller_ip), \ + TP_ARGS(xg, caller_ip)) +DEFINE_GROUP_REF_EVENT(xfs_group_get); +DEFINE_GROUP_REF_EVENT(xfs_group_hold); +DEFINE_GROUP_REF_EVENT(xfs_group_put); +DEFINE_GROUP_REF_EVENT(xfs_group_grab); +DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag); +DEFINE_GROUP_REF_EVENT(xfs_group_rele); + +#ifdef CONFIG_XFS_RT +DECLARE_EVENT_CLASS(xfs_zone_class, + 
TP_PROTO(struct xfs_rtgroup *rtg), + TP_ARGS(rtg), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(unsigned int, nr_open) + ), + TP_fast_assign( + struct xfs_mount *mp = rtg_mount(rtg); + + __entry->dev = mp->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->nr_open = mp->m_zone_info->zi_nr_open_zones; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->nr_open) +); + +#define DEFINE_ZONE_EVENT(name) \ +DEFINE_EVENT(xfs_zone_class, name, \ + TP_PROTO(struct xfs_rtgroup *rtg), \ + TP_ARGS(rtg)) +DEFINE_ZONE_EVENT(xfs_zone_emptied); +DEFINE_ZONE_EVENT(xfs_zone_full); +DEFINE_ZONE_EVENT(xfs_zone_opened); +DEFINE_ZONE_EVENT(xfs_zone_reset); +DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened); + +TRACE_EVENT(xfs_zone_free_blocks, + TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, + xfs_extlen_t len), + TP_ARGS(rtg, rgbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->rgbno = rgbno; + __entry->len = len; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->rgbno, + __entry->len) +); + +DECLARE_EVENT_CLASS(xfs_zone_alloc_class, + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, + xfs_extlen_t len), + TP_ARGS(oz, rgbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, written) + __field(xfs_rgblock_t, write_pointer) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(oz->oz_rtg); + __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks; + __entry->written = oz->oz_written; + __entry->write_pointer = oz->oz_write_pointer; + __entry->rgbno = rgbno; + __entry->len = len; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->written, + __entry->write_pointer, + __entry->rgbno, + __entry->len) +); + +#define DEFINE_ZONE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_zone_alloc_class, name, \ + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \ + xfs_extlen_t len), \ + TP_ARGS(oz, rgbno, len)) +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); + +TRACE_EVENT(xfs_zone_gc_select_victim, + TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket), + TP_ARGS(rtg, bucket), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(unsigned int, bucket) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->bucket = bucket; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->bucket) +); + +TRACE_EVENT(xfs_zones_mount, + TP_PROTO(struct xfs_mount *mp), + TP_ARGS(mp), + 
TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgcount) + __field(uint32_t, blocks) + __field(unsigned int, max_open_zones) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rgcount = mp->m_sb.sb_rgcount; + __entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks; + __entry->max_open_zones = mp->m_max_open_zones; + ), + TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgcount, + __entry->blocks, + __entry->max_open_zones) +); +#endif /* CONFIG_XFS_RT */ + TRACE_EVENT(xfs_inodegc_worker, TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), TP_ARGS(mp, shrinker_hits), @@ -299,15 +585,15 @@ TRACE_EVENT(xfs_inodegc_shrinker_scan, ); DECLARE_EVENT_CLASS(xfs_ag_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), - TP_ARGS(mp, agno), + TP_PROTO(const struct xfs_perag *pag), + TP_ARGS(pag), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); ), TP_printk("dev %d:%d agno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -315,8 +601,8 @@ DECLARE_EVENT_CLASS(xfs_ag_class, ); #define DEFINE_AG_EVENT(name) \ DEFINE_EVENT(xfs_ag_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), \ - TP_ARGS(mp, agno)) + TP_PROTO(const struct xfs_perag *pag), \ + TP_ARGS(pag)) DEFINE_AG_EVENT(xfs_read_agf); DEFINE_AG_EVENT(xfs_alloc_read_agf); @@ -452,7 +738,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->dev = bp->b_target->bt_dev; __entry->bno = xfs_buf_daddr(bp); __entry->nblks = bp->b_length; - __entry->hold = atomic_read(&bp->b_hold); + __entry->hold = bp->b_hold; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; @@ -499,6 +785,10 @@ DEFINE_BUF_EVENT(xfs_buf_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_drain_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); +DEFINE_BUF_EVENT(xfs_buf_backing_folio); +DEFINE_BUF_EVENT(xfs_buf_backing_kmem); +DEFINE_BUF_EVENT(xfs_buf_backing_vmalloc); +DEFINE_BUF_EVENT(xfs_buf_backing_fallback); /* not really buffer traces, but the buf provides useful information */ DEFINE_BUF_EVENT(xfs_btree_corrupt); @@ -523,7 +813,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, __entry->bno = xfs_buf_daddr(bp); __entry->length = bp->b_length; __entry->flags = flags; - __entry->hold = atomic_read(&bp->b_hold); + __entry->hold = bp->b_hold; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->caller_ip = caller_ip; @@ -547,6 +837,7 @@ DEFINE_EVENT(xfs_buf_flags_class, name, \ DEFINE_BUF_FLAGS_EVENT(xfs_buf_find); DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_readahead); TRACE_EVENT(xfs_buf_ioerror, TP_PROTO(struct xfs_buf *bp, int error, xfs_failaddr_t caller_ip), @@ -566,7 +857,7 @@ TRACE_EVENT(xfs_buf_ioerror, __entry->dev = bp->b_target->bt_dev; __entry->bno = xfs_buf_daddr(bp); __entry->length = bp->b_length; - __entry->hold = atomic_read(&bp->b_hold); + __entry->hold = bp->b_hold; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->error = error; @@ -610,7 +901,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->buf_bno = xfs_buf_daddr(bip->bli_buf); __entry->buf_len = bip->bli_buf->b_length; __entry->buf_flags = 
bip->bli_buf->b_flags; - __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); + __entry->buf_hold = bip->bli_buf->b_hold; __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); __entry->buf_lockval = bip->bli_buf->b_sema.count; __entry->li_flags = bip->bli_item.li_flags; @@ -662,7 +953,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); DECLARE_EVENT_CLASS(xfs_filestream_class, - TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino), + TP_PROTO(const struct xfs_perag *pag, xfs_ino_t ino), TP_ARGS(pag, ino), TP_STRUCT__entry( __field(dev_t, dev) @@ -671,9 +962,9 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, __field(int, streams) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->ino = ino; - __entry->agno = pag->pag_agno; + __entry->agno = pag_agno(pag); __entry->streams = atomic_read(&pag->pagf_fstrms); ), TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d", @@ -684,15 +975,15 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, ) #define DEFINE_FILESTREAM_EVENT(name) \ DEFINE_EVENT(xfs_filestream_class, name, \ - TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino), \ + TP_PROTO(const struct xfs_perag *pag, xfs_ino_t ino), \ TP_ARGS(pag, ino)) DEFINE_FILESTREAM_EVENT(xfs_filestream_free); DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); TRACE_EVENT(xfs_filestream_pick, - TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino, xfs_extlen_t free), - TP_ARGS(pag, ino, free), + TP_PROTO(const struct xfs_perag *pag, xfs_ino_t ino), + TP_ARGS(pag, ino), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -701,16 +992,11 @@ TRACE_EVENT(xfs_filestream_pick, __field(xfs_extlen_t, free) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->ino = ino; - if (pag) { - __entry->agno = pag->pag_agno; - __entry->streams = atomic_read(&pag->pagf_fstrms); - } else { - __entry->agno = NULLAGNUMBER; - __entry->streams = 0; - } - __entry->free = free; + __entry->agno = pag_agno(pag); + __entry->streams = atomic_read(&pag->pagf_fstrms); + __entry->free = pag->pagf_freeblks; ), TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -827,28 +1113,32 @@ DEFINE_INODE_EVENT(xfs_inode_inactivating); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); -TRACE_EVENT(xfs_filemap_fault, - TP_PROTO(struct xfs_inode *ip, unsigned int order, bool write_fault), - TP_ARGS(ip, order, write_fault), +DECLARE_EVENT_CLASS(xfs_fault_class, + TP_PROTO(struct xfs_inode *ip, unsigned int order), + TP_ARGS(ip, order), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(unsigned int, order) - __field(bool, write_fault) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->order = order; - __entry->write_fault = write_fault; ), - TP_printk("dev %d:%d ino 0x%llx order %u write_fault %d", + TP_printk("dev %d:%d ino 0x%llx order %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->order, - __entry->write_fault) + __entry->order) ) +#define DEFINE_FAULT_EVENT(name) \ +DEFINE_EVENT(xfs_fault_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned int order), \ + TP_ARGS(ip, order)) +DEFINE_FAULT_EVENT(xfs_read_fault); +DEFINE_FAULT_EVENT(xfs_write_fault); + DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), 
TP_ARGS(ip, caller_ip), @@ -899,9 +1189,10 @@ TRACE_EVENT(xfs_iomap_prealloc_size, ) TRACE_EVENT(xfs_irec_merge_pre, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, - uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask), - TP_ARGS(mp, agno, agino, holemask, nagino, nholemask), + TP_PROTO(const struct xfs_perag *pag, + const struct xfs_inobt_rec_incore *rec, + const struct xfs_inobt_rec_incore *nrec), + TP_ARGS(pag, rec, nrec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -911,12 +1202,12 @@ TRACE_EVENT(xfs_irec_merge_pre, __field(uint16_t, nholemask) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->agino = agino; - __entry->holemask = holemask; - __entry->nagino = nagino; - __entry->nholemask = holemask; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->agino = rec->ir_startino; + __entry->holemask = rec->ir_holemask; + __entry->nagino = nrec->ir_startino; + __entry->nholemask = nrec->ir_holemask; ), TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x new_agino 0x%x new_holemask 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -928,9 +1219,9 @@ TRACE_EVENT(xfs_irec_merge_pre, ) TRACE_EVENT(xfs_irec_merge_post, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, - uint16_t holemask), - TP_ARGS(mp, agno, agino, holemask), + TP_PROTO(const struct xfs_perag *pag, + const struct xfs_inobt_rec_incore *nrec), + TP_ARGS(pag, nrec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -938,10 +1229,10 @@ TRACE_EVENT(xfs_irec_merge_post, __field(uint16_t, holemask) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->agino = agino; - __entry->holemask = holemask; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->agino = nrec->ir_startino; + __entry->holemask = nrec->ir_holemask; ), TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x", MAJOR(__entry->dev), @@ -1459,6 +1750,28 @@ DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write); +TRACE_EVENT(xfs_iomap_atomic_write_cow, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_ARGS(ip, offset, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_off_t, offset) + __field(ssize_t, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->offset = offset; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx bytecount 0x%zx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->offset, + __entry->count) +) + DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int whichfork, struct xfs_bmbt_irec *irec), @@ -1549,6 +1862,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append); DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read); +DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks); DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), @@ -1639,44 +1953,48 @@ TRACE_EVENT(xfs_bunmap, ); DECLARE_EVENT_CLASS(xfs_extent_busy_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len), - TP_ARGS(mp, agno, agbno, len), + TP_PROTO(const struct xfs_group *xg, 
xfs_agblock_t agbno, + xfs_extlen_t len), + TP_ARGS(xg, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len) ); #define DEFINE_BUSY_EVENT(name) \ DEFINE_EVENT(xfs_extent_busy_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_extlen_t len), \ - TP_ARGS(mp, agno, agbno, len)) + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ + xfs_extlen_t len), \ + TP_ARGS(xg, agbno, len)) DEFINE_BUSY_EVENT(xfs_extent_busy); DEFINE_BUSY_EVENT(xfs_extent_busy_force); DEFINE_BUSY_EVENT(xfs_extent_busy_reuse); DEFINE_BUSY_EVENT(xfs_extent_busy_clear); TRACE_EVENT(xfs_extent_busy_trim, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len, - xfs_agblock_t tbno, xfs_extlen_t tlen), - TP_ARGS(mp, agno, agbno, len, tbno, tlen), + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, + xfs_extlen_t len, xfs_agblock_t tbno, xfs_extlen_t tlen), + TP_ARGS(xg, agbno, len, tbno, tlen), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) @@ -1684,22 +2002,99 @@ TRACE_EVENT(xfs_extent_busy_trim, __field(xfs_extlen_t, tlen) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; __entry->tbno = tbno; __entry->tlen = tlen; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x found_agbno 0x%x found_fsbcount 0x%x", + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x found_agbno 0x%x found_fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len, __entry->tbno, __entry->tlen) ); +#ifdef CONFIG_XFS_RT +TRACE_EVENT(xfs_rtalloc_extent_busy, + TP_PROTO(struct xfs_rtgroup *rtg, xfs_rtxnum_t start, + xfs_rtxlen_t minlen, xfs_rtxlen_t maxlen, + xfs_rtxlen_t len, xfs_rtxlen_t prod, xfs_rtxnum_t rtx, + unsigned busy_gen), + TP_ARGS(rtg, start, minlen, maxlen, len, prod, rtx, busy_gen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rtxnum_t, start) + __field(xfs_rtxlen_t, minlen) + __field(xfs_rtxlen_t, maxlen) + __field(xfs_rtxlen_t, mod) + __field(xfs_rtxlen_t, prod) + __field(xfs_rtxlen_t, len) + __field(xfs_rtxnum_t, rtx) + __field(unsigned, busy_gen) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->start = start; + __entry->minlen = minlen; + __entry->maxlen = maxlen; + __entry->prod = prod; + __entry->len = len; + __entry->rtx = rtx; + __entry->busy_gen = busy_gen; + ), + TP_printk("dev %d:%d rgno 0x%x startrtx 0x%llx minlen 
%u maxlen %u " + "prod %u len %u rtx 0x%llx busy_gen 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->start, + __entry->minlen, + __entry->maxlen, + __entry->prod, + __entry->len, + __entry->rtx, + __entry->busy_gen) +) + +TRACE_EVENT(xfs_rtalloc_extent_busy_trim, + TP_PROTO(struct xfs_rtgroup *rtg, xfs_rtxnum_t old_rtx, + xfs_rtxlen_t old_len, xfs_rtxnum_t new_rtx, + xfs_rtxlen_t new_len), + TP_ARGS(rtg, old_rtx, old_len, new_rtx, new_len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rtxnum_t, old_rtx) + __field(xfs_rtxnum_t, new_rtx) + __field(xfs_rtxlen_t, old_len) + __field(xfs_rtxlen_t, new_len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->old_rtx = old_rtx; + __entry->old_len = old_len; + __entry->new_rtx = new_rtx; + __entry->new_len = new_len; + ), + TP_printk("dev %d:%d rgno 0x%x rtx 0x%llx rtxcount 0x%x -> rtx 0x%llx rtxcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->old_rtx, + __entry->old_len, + __entry->new_rtx, + __entry->new_len) +); +#endif /* CONFIG_XFS_RT */ + DECLARE_EVENT_CLASS(xfs_agf_class, TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, unsigned long caller_ip), @@ -1763,10 +2158,10 @@ DEFINE_AGF_EVENT(xfs_agf); DEFINE_AGF_EVENT(xfs_agfl_reset); TRACE_EVENT(xfs_free_extent, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft, int haveright), - TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright), + TP_ARGS(pag, agbno, len, resv, haveleft, haveright), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -1777,8 +2172,8 @@ TRACE_EVENT(xfs_free_extent, __field(int, haveright) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->len = len; __entry->resv = resv; @@ -2172,6 +2567,7 @@ TRACE_DEFINE_ENUM(XFS_DINODE_FMT_LOCAL); TRACE_DEFINE_ENUM(XFS_DINODE_FMT_EXTENTS); TRACE_DEFINE_ENUM(XFS_DINODE_FMT_BTREE); TRACE_DEFINE_ENUM(XFS_DINODE_FMT_UUID); +TRACE_DEFINE_ENUM(XFS_DINODE_FMT_META_BTREE); DECLARE_EVENT_CLASS(xfs_swap_extent_class, TP_PROTO(struct xfs_inode *ip, int which), @@ -2431,23 +2827,26 @@ DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_cancel); DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_recover); DECLARE_EVENT_CLASS(xfs_discard_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len), - TP_ARGS(mp, agno, agbno, len), + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, + xfs_extlen_t len), + TP_ARGS(xg, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->agbno, __entry->len) @@ -2455,9 +2854,9 @@
DECLARE_EVENT_CLASS(xfs_discard_class, #define DEFINE_DISCARD_EVENT(name) \ DEFINE_EVENT(xfs_discard_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_extlen_t len), \ - TP_ARGS(mp, agno, agbno, len)) + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ + xfs_extlen_t len), \ + TP_ARGS(xg, agbno, len)) DEFINE_DISCARD_EVENT(xfs_discard_extent); DEFINE_DISCARD_EVENT(xfs_discard_toosmall); DEFINE_DISCARD_EVENT(xfs_discard_exclude); @@ -2547,7 +2946,7 @@ TRACE_EVENT(xfs_btree_alloc_block, __entry->ino = cur->bc_ino.ip->i_ino; break; case XFS_BTREE_TYPE_AG: - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->ino = 0; break; case XFS_BTREE_TYPE_MEM: @@ -2717,6 +3116,7 @@ DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class, TP_ARGS(mp, free), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) @@ -2724,13 +3124,16 @@ DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class, ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __entry->agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); - __entry->agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); + __entry->type = free->xefi_group->xg_type; + __entry->agno = free->xefi_group->xg_gno; + __entry->agbno = xfs_fsb_to_gbno(mp, free->xefi_startblock, + free->xefi_group->xg_type); __entry->len = free->xefi_blockcount; __entry->flags = free->xefi_flags; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x flags 0x%x", + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->agbno, __entry->len, @@ -2740,7 +3143,6 @@ DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class, DEFINE_EVENT(xfs_free_extent_deferred_class, name, \ TP_PROTO(struct xfs_mount *mp, struct xfs_extent_free_item *free), \ TP_ARGS(mp, free)) -DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_defer); DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_deferred); DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_defer); DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_deferred); @@ -2789,13 +3191,14 @@ DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item); /* rmap tracepoints */ DECLARE_EVENT_CLASS(xfs_rmap_class, TP_PROTO(struct xfs_btree_cur *cur, - xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, + xfs_agblock_t gbno, xfs_extlen_t len, bool unwritten, const struct xfs_owner_info *oinfo), - TP_ARGS(cur, agbno, len, unwritten, oinfo), + TP_ARGS(cur, gbno, len, unwritten, oinfo), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) __field(uint64_t, owner) __field(uint64_t, offset) @@ -2803,8 +3206,9 @@ DECLARE_EVENT_CLASS(xfs_rmap_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; - __entry->agbno = agbno; + __entry->type = cur->bc_group->xg_type; + __entry->agno = cur->bc_group->xg_gno; + __entry->gbno = gbno; __entry->len = len; __entry->owner = oinfo->oi_owner; __entry->offset = oinfo->oi_offset; @@ -2812,10 +3216,11 @@ DECLARE_EVENT_CLASS(xfs_rmap_class, if (unwritten) __entry->flags |= XFS_RMAP_UNWRITTEN; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx", + TP_printk("dev %d:%d %sno 0x%x gbno 
0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __entry->len, __entry->owner, __entry->offset, @@ -2824,9 +3229,9 @@ DECLARE_EVENT_CLASS(xfs_rmap_class, #define DEFINE_RMAP_EVENT(name) \ DEFINE_EVENT(xfs_rmap_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, \ - xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \ + xfs_agblock_t gbno, xfs_extlen_t len, bool unwritten, \ const struct xfs_owner_info *oinfo), \ - TP_ARGS(cur, agbno, len, unwritten, oinfo)) + TP_ARGS(cur, gbno, len, unwritten, oinfo)) /* btree cursor error/%ip tracepoint class */ DECLARE_EVENT_CLASS(xfs_btree_error_class, @@ -2848,7 +3253,7 @@ DECLARE_EVENT_CLASS(xfs_btree_error_class, __entry->ino = cur->bc_ino.ip->i_ino; break; case XFS_BTREE_TYPE_AG: - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->ino = 0; break; case XFS_BTREE_TYPE_MEM: @@ -2889,47 +3294,36 @@ TRACE_EVENT(xfs_rmap_convert_state, TP_ARGS(cur, state, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_ino_t, ino) __field(int, state) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - switch (cur->bc_ops->type) { - case XFS_BTREE_TYPE_INODE: - __entry->agno = 0; - __entry->ino = cur->bc_ino.ip->i_ino; - break; - case XFS_BTREE_TYPE_AG: - __entry->agno = cur->bc_ag.pag->pag_agno; - __entry->ino = 0; - break; - case XFS_BTREE_TYPE_MEM: - __entry->agno = 0; - __entry->ino = 0; - break; - } + __entry->type = cur->bc_group->xg_type; + __entry->agno = cur->bc_group->xg_gno; __entry->state = state; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno 0x%x ino 0x%llx state %d caller %pS", + TP_printk("dev %d:%d %sno 0x%x state %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->ino, __entry->state, (char *)__entry->caller_ip) ); DECLARE_EVENT_CLASS(xfs_rmapbt_class, TP_PROTO(struct xfs_btree_cur *cur, - xfs_agblock_t agbno, xfs_extlen_t len, + xfs_agblock_t gbno, xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags), - TP_ARGS(cur, agbno, len, owner, offset, flags), + TP_ARGS(cur, gbno, len, owner, offset, flags), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) __field(uint64_t, owner) __field(uint64_t, offset) @@ -2937,17 +3331,19 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; - __entry->agbno = agbno; + __entry->type = cur->bc_group->xg_type; + __entry->agno = cur->bc_group->xg_gno; + __entry->gbno = gbno; __entry->len = len; __entry->owner = owner; __entry->offset = offset; __entry->flags = flags; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __entry->len, __entry->owner, __entry->offset, @@ -2956,9 +3352,9 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class, #define DEFINE_RMAPBT_EVENT(name) \ 
DEFINE_EVENT(xfs_rmapbt_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, \ - xfs_agblock_t agbno, xfs_extlen_t len, \ + xfs_agblock_t gbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ - TP_ARGS(cur, agbno, len, owner, offset, flags)) + TP_ARGS(cur, gbno, len, owner, offset, flags)) TRACE_DEFINE_ENUM(XFS_RMAP_MAP); TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED); @@ -2975,8 +3371,9 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long long, owner) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(int, whichfork) __field(xfs_fileoff_t, l_loff) __field(xfs_filblks_t, l_len) @@ -2985,9 +3382,11 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class, ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __entry->agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock); - __entry->agbno = XFS_FSB_TO_AGBNO(mp, - ri->ri_bmap.br_startblock); + __entry->type = ri->ri_group->xg_type; + __entry->agno = ri->ri_group->xg_gno; + __entry->gbno = xfs_fsb_to_gbno(mp, + ri->ri_bmap.br_startblock, + ri->ri_group->xg_type); __entry->owner = ri->ri_owner; __entry->whichfork = ri->ri_whichfork; __entry->l_loff = ri->ri_bmap.br_startoff; @@ -2995,11 +3394,12 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class, __entry->l_state = ri->ri_bmap.br_state; __entry->op = ri->ri_type; ), - TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", + TP_printk("dev %d:%d op %s %sno 0x%x gbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->op, XFS_RMAP_INTENT_STRINGS), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __entry->owner, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->l_loff, @@ -3037,11 +3437,10 @@ DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, TP_ARGS(bi), TP_STRUCT__entry( __field(dev_t, dev) - __field(dev_t, opdev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_ino_t, ino) - __field(xfs_agblock_t, agbno) - __field(xfs_fsblock_t, rtbno) + __field(unsigned long long, gbno) __field(int, whichfork) __field(xfs_fileoff_t, l_loff) __field(xfs_filblks_t, l_len) @@ -3050,20 +3449,25 @@ DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, ), TP_fast_assign( struct xfs_inode *ip = bi->bi_owner; + struct xfs_mount *mp = ip->i_mount; - __entry->dev = ip->i_mount->m_super->s_dev; - if (xfs_ifork_is_realtime(ip, bi->bi_whichfork)) { - __entry->agno = 0; - __entry->agbno = 0; - __entry->rtbno = bi->bi_bmap.br_startblock; - __entry->opdev = ip->i_mount->m_rtdev_targp->bt_dev; + __entry->dev = mp->m_super->s_dev; + __entry->type = bi->bi_group->xg_type; + __entry->agno = bi->bi_group->xg_gno; + if (bi->bi_group->xg_type == XG_TYPE_RTG && + !xfs_has_rtgroups(mp)) { + /* + * Legacy rt filesystems do not have allocation groups + * ondisk. We emulate this incore with one gigantic + * rtgroup whose size can exceed a 32-bit block number. + * For this tracepoint, we report group 0 and a 64-bit + * group block number. 
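+ * For example, a legacy rt device 2^33 blocks long is presented as the
+ * single rtgroup 0, so a mapping near the end of that device carries a
+ * group block number that no longer fits in 32 bits.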
+ */ + __entry->gbno = bi->bi_bmap.br_startblock; } else { - __entry->agno = XFS_FSB_TO_AGNO(ip->i_mount, - bi->bi_bmap.br_startblock); - __entry->agbno = XFS_FSB_TO_AGBNO(ip->i_mount, - bi->bi_bmap.br_startblock); - __entry->rtbno = 0; - __entry->opdev = __entry->dev; + __entry->gbno = xfs_fsb_to_gbno(mp, + bi->bi_bmap.br_startblock, + bi->bi_group->xg_type); } __entry->ino = ip->i_ino; __entry->whichfork = bi->bi_whichfork; @@ -3072,14 +3476,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, __entry->l_state = bi->bi_bmap.br_state; __entry->op = bi->bi_type; ), - TP_printk("dev %d:%d op %s opdev %d:%d ino 0x%llx agno 0x%x agbno 0x%x rtbno 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", + TP_printk("dev %d:%d op %s ino 0x%llx %sno 0x%x gbno 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->op, XFS_BMAP_INTENT_STRINGS), - MAJOR(__entry->opdev), MINOR(__entry->opdev), __entry->ino, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, - __entry->rtbno, + __entry->gbno, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->l_loff, __entry->l_len, @@ -3110,8 +3513,8 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class, TP_fast_assign( struct xfs_ag_resv *r = xfs_perag_resv(pag, resv); - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->resv = resv; __entry->freeblks = pag->pagf_freeblks; __entry->flcount = pag->pagf_flcount; @@ -3144,11 +3547,10 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent); DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical); DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); -/* simple AG-based error/%ip tracepoint class */ -DECLARE_EVENT_CLASS(xfs_ag_error_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, +TRACE_EVENT(xfs_ag_resv_init_error, + TP_PROTO(const struct xfs_perag *pag, int error, unsigned long caller_ip), - TP_ARGS(mp, agno, error, caller_ip), + TP_ARGS(pag, error, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -3156,8 +3558,8 @@ DECLARE_EVENT_CLASS(xfs_ag_error_class, __field(unsigned long, caller_ip) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->error = error; __entry->caller_ip = caller_ip; ), @@ -3168,66 +3570,65 @@ DECLARE_EVENT_CLASS(xfs_ag_error_class, (char *)__entry->caller_ip) ); -#define DEFINE_AG_ERROR_EVENT(name) \ -DEFINE_EVENT(xfs_ag_error_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \ - unsigned long caller_ip), \ - TP_ARGS(mp, agno, error, caller_ip)) -DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); - /* refcount tracepoint classes */ DECLARE_EVENT_CLASS(xfs_refcount_class, - TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, xfs_extlen_t len), - TP_ARGS(cur, agbno, len), + TP_ARGS(cur, gbno, len), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; - __entry->agbno = agbno; + __entry->type = cur->bc_group->xg_type; + __entry->agno = cur->bc_group->xg_gno; + __entry->gbno = gbno; __entry->len = len; ), - 
TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __entry->len) ); #define DEFINE_REFCOUNT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_class, name, \ - TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, \ + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, \ xfs_extlen_t len), \ - TP_ARGS(cur, agbno, len)) + TP_ARGS(cur, gbno, len)) TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi); TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi); TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi); TRACE_EVENT(xfs_refcount_lookup, - TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, xfs_lookup_t dir), - TP_ARGS(cur, agbno, dir), + TP_ARGS(cur, gbno, dir), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(xfs_lookup_t, dir) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; - __entry->agbno = agbno; + __entry->type = cur->bc_group->xg_type; + __entry->agno = cur->bc_group->xg_gno; + __entry->gbno = gbno; __entry->dir = dir; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x cmp %s(%d)", + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x cmp %s(%d)", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR), __entry->dir) ) @@ -3238,6 +3639,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, TP_ARGS(cur, irec), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) @@ -3246,14 +3648,16 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->type = cur->bc_group->xg_type; + __entry->agno = cur->bc_group->xg_gno; __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, @@ -3269,49 +3673,53 @@ DEFINE_EVENT(xfs_refcount_extent_class, name, \ /* single-rcext and an agbno tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, - xfs_agblock_t agbno), - TP_ARGS(cur, irec, agbno), + xfs_agblock_t gbno), + TP_ARGS(cur, irec, gbno), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->type = cur->bc_group->xg_type; + __entry->agno = cur->bc_group->xg_gno; 
__entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; - __entry->agbno = agbno; + __entry->gbno = gbno; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount, - __entry->agbno) + __entry->gbno) ) #define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_extent_at_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, \ - xfs_agblock_t agbno), \ - TP_ARGS(cur, irec, agbno)) + xfs_agblock_t gbno), \ + TP_ARGS(cur, irec, gbno)) /* double-rcext tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, - struct xfs_refcount_irec *i2), + struct xfs_refcount_irec *i2), TP_ARGS(cur, i1, i2), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) @@ -3324,7 +3732,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->type = cur->bc_group->xg_type; + __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; @@ -3334,9 +3743,10 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " - "dom %s agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s gbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, @@ -3357,10 +3767,11 @@ DEFINE_EVENT(xfs_refcount_double_extent_class, name, \ /* double-rcext and an agbno tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, - struct xfs_refcount_irec *i2, xfs_agblock_t agbno), - TP_ARGS(cur, i1, i2, agbno), + struct xfs_refcount_irec *i2, xfs_agblock_t gbno), + TP_ARGS(cur, i1, i2, gbno), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) @@ -3370,11 +3781,12 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->type = cur->bc_group->xg_type; + __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; @@ 
-3383,11 +3795,12 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; - __entry->agbno = agbno; + __entry->gbno = gbno; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " - "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, @@ -3397,14 +3810,14 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount, - __entry->agbno) + __entry->gbno) ) #define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \ - struct xfs_refcount_irec *i2, xfs_agblock_t agbno), \ - TP_ARGS(cur, i1, i2, agbno)) + struct xfs_refcount_irec *i2, xfs_agblock_t gbno), \ + TP_ARGS(cur, i1, i2, gbno)) /* triple-rcext tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, @@ -3413,6 +3826,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, TP_ARGS(cur, i1, i2, i3), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) @@ -3429,7 +3843,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->type = cur->bc_group->xg_type; + __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; @@ -3443,10 +3858,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, __entry->i3_blockcount = i3->rc_blockcount; __entry->i3_refcount = i3->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " - "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " - "dom %s agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s gbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, @@ -3514,23 +3930,27 @@ DECLARE_EVENT_CLASS(xfs_refcount_deferred_class, TP_ARGS(mp, refc), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(int, op) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __entry->agno = XFS_FSB_TO_AGNO(mp, refc->ri_startblock); + __entry->type = refc->ri_group->xg_type; + __entry->agno = refc->ri_group->xg_gno; __entry->op = refc->ri_type; - __entry->agbno = XFS_FSB_TO_AGBNO(mp, refc->ri_startblock); + __entry->gbno = xfs_fsb_to_gbno(mp, refc->ri_startblock, + refc->ri_group->xg_type); __entry->len = refc->ri_blockcount; ), - TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x fsbcount 
0x%x", + TP_printk("dev %d:%d op %s %sno 0x%x gbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->op, XFS_REFCOUNT_INTENT_STRINGS), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __entry->len) ); #define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \ @@ -3830,6 +4250,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); @@ -3843,7 +4264,45 @@ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece); DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); /* fsmap traces */ -DECLARE_EVENT_CLASS(xfs_fsmap_class, +TRACE_EVENT(xfs_fsmap_mapping, + TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, + const struct xfs_fsmap_irec *frec), + TP_ARGS(mp, keydev, agno, frec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_daddr_t, start_daddr) + __field(xfs_daddr_t, len_daddr) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->keydev = new_decode_dev(keydev); + __entry->agno = agno; + __entry->agbno = frec->rec_key; + __entry->start_daddr = frec->start_daddr; + __entry->len_daddr = frec->len_daddr; + __entry->owner = frec->owner; + __entry->offset = frec->offset; + __entry->flags = frec->rm_flags; + ), + TP_printk("dev %d:%d keydev %d:%d agno 0x%x gbno 0x%x start_daddr 0x%llx len_daddr 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->agno, + __entry->agbno, + __entry->start_daddr, + __entry->len_daddr, + __entry->owner, + __entry->offset, + __entry->flags) +); + +DECLARE_EVENT_CLASS(xfs_fsmap_group_key_class, TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, const struct xfs_rmap_irec *rmap), TP_ARGS(mp, keydev, agno, rmap), @@ -3851,8 +4310,7 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class, __field(dev_t, dev) __field(dev_t, keydev) __field(xfs_agnumber_t, agno) - __field(xfs_fsblock_t, bno) - __field(xfs_filblks_t, len) + __field(xfs_agblock_t, agbno) __field(uint64_t, owner) __field(uint64_t, offset) __field(unsigned int, flags) @@ -3861,33 +4319,30 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class, __entry->dev = mp->m_super->s_dev; __entry->keydev = new_decode_dev(keydev); __entry->agno = agno; - __entry->bno = rmap->rm_startblock; - __entry->len = rmap->rm_blockcount; + __entry->agbno = rmap->rm_startblock; __entry->owner = rmap->rm_owner; __entry->offset = rmap->rm_offset; __entry->flags = rmap->rm_flags; ), - TP_printk("dev %d:%d keydev %d:%d agno 0x%x startblock 0x%llx fsbcount 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x", + TP_printk("dev %d:%d keydev %d:%d agno 0x%x startblock 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->agno, - __entry->bno, - __entry->len, + __entry->agbno, __entry->owner, __entry->offset, __entry->flags) ) -#define DEFINE_FSMAP_EVENT(name) \ -DEFINE_EVENT(xfs_fsmap_class, name, \ +#define DEFINE_FSMAP_GROUP_KEY_EVENT(name) \ 
+DEFINE_EVENT(xfs_fsmap_group_key_class, name, \ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \ const struct xfs_rmap_irec *rmap), \ TP_ARGS(mp, keydev, agno, rmap)) -DEFINE_FSMAP_EVENT(xfs_fsmap_low_key); -DEFINE_FSMAP_EVENT(xfs_fsmap_high_key); -DEFINE_FSMAP_EVENT(xfs_fsmap_mapping); +DEFINE_FSMAP_GROUP_KEY_EVENT(xfs_fsmap_low_group_key); +DEFINE_FSMAP_GROUP_KEY_EVENT(xfs_fsmap_high_group_key); -DECLARE_EVENT_CLASS(xfs_fsmap_linear_class, - TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno), +DECLARE_EVENT_CLASS(xfs_fsmap_linear_key_class, + TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_fsblock_t bno), TP_ARGS(mp, keydev, bno), TP_STRUCT__entry( __field(dev_t, dev) @@ -3904,12 +4359,12 @@ DECLARE_EVENT_CLASS(xfs_fsmap_linear_class, MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->bno) ) -#define DEFINE_FSMAP_LINEAR_EVENT(name) \ -DEFINE_EVENT(xfs_fsmap_linear_class, name, \ +#define DEFINE_FSMAP_LINEAR_KEY_EVENT(name) \ +DEFINE_EVENT(xfs_fsmap_linear_key_class, name, \ TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno), \ TP_ARGS(mp, keydev, bno)) -DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_low_key_linear); -DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_high_key_linear); +DEFINE_FSMAP_LINEAR_KEY_EVENT(xfs_fsmap_low_linear_key); +DEFINE_FSMAP_LINEAR_KEY_EVENT(xfs_fsmap_high_linear_key); DECLARE_EVENT_CLASS(xfs_getfsmap_class, TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), @@ -4041,9 +4496,9 @@ DEFINE_TRANS_EVENT(xfs_trans_commit_items); DEFINE_TRANS_EVENT(xfs_trans_free_items); TRACE_EVENT(xfs_iunlink_update_bucket, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, xfs_agino_t old_ptr, xfs_agino_t new_ptr), - TP_ARGS(mp, agno, bucket, old_ptr, new_ptr), + TP_ARGS(pag, bucket, old_ptr, new_ptr), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -4052,8 +4507,8 @@ TRACE_EVENT(xfs_iunlink_update_bucket, __field(xfs_agino_t, new_ptr) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->bucket = bucket; __entry->old_ptr = old_ptr; __entry->new_ptr = new_ptr; @@ -4067,9 +4522,8 @@ TRACE_EVENT(xfs_iunlink_update_bucket, ); TRACE_EVENT(xfs_iunlink_update_dinode, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, - xfs_agino_t old_ptr, xfs_agino_t new_ptr), - TP_ARGS(mp, agno, agino, old_ptr, new_ptr), + TP_PROTO(const struct xfs_iunlink_item *iup, xfs_agino_t old_ptr), + TP_ARGS(iup, old_ptr), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -4078,11 +4532,12 @@ TRACE_EVENT(xfs_iunlink_update_dinode, __field(xfs_agino_t, new_ptr) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->agino = agino; + __entry->dev = pag_mount(iup->pag)->m_super->s_dev; + __entry->agno = pag_agno(iup->pag); + __entry->agino = + XFS_INO_TO_AGINO(iup->ip->i_mount, iup->ip->i_ino); __entry->old_ptr = old_ptr; - __entry->new_ptr = new_ptr; + __entry->new_ptr = iup->next_agino; ), TP_printk("dev %d:%d agno 0x%x agino 0x%x old 0x%x new 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -4185,37 +4640,35 @@ DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick); DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_corrupt); DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy); DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption); -DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick); -DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_corrupt); 
-DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy); -DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption); -DECLARE_EVENT_CLASS(xfs_ag_corrupt_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags), - TP_ARGS(mp, agno, flags), +DECLARE_EVENT_CLASS(xfs_group_corrupt_class, + TP_PROTO(const struct xfs_group *xg, unsigned int flags), + TP_ARGS(xg, flags), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_agnumber_t, agno) + __field(enum xfs_group_type, type) + __field(uint32_t, index) __field(unsigned int, flags) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->index = xg->xg_gno; __entry->flags = flags; ), - TP_printk("dev %d:%d agno 0x%x flags 0x%x", + TP_printk("dev %d:%d %sno 0x%x flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->agno, __entry->flags) + __print_symbolic(__entry->type, XG_TYPE_STRINGS), + __entry->index, __entry->flags) ); -#define DEFINE_AG_CORRUPT_EVENT(name) \ -DEFINE_EVENT(xfs_ag_corrupt_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - unsigned int flags), \ - TP_ARGS(mp, agno, flags)) -DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick); -DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_corrupt); -DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy); -DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption); +#define DEFINE_GROUP_CORRUPT_EVENT(name) \ +DEFINE_EVENT(xfs_group_corrupt_class, name, \ + TP_PROTO(const struct xfs_group *xg, unsigned int flags), \ + TP_ARGS(xg, flags)) +DEFINE_GROUP_CORRUPT_EVENT(xfs_group_mark_sick); +DEFINE_GROUP_CORRUPT_EVENT(xfs_group_mark_corrupt); +DEFINE_GROUP_CORRUPT_EVENT(xfs_group_mark_healthy); +DEFINE_GROUP_CORRUPT_EVENT(xfs_group_unfixed_corruption); DECLARE_EVENT_CLASS(xfs_inode_corrupt_class, TP_PROTO(struct xfs_inode *ip, unsigned int flags), @@ -4243,29 +4696,10 @@ DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_corrupt); DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy); DEFINE_INODE_CORRUPT_EVENT(xfs_inode_unfixed_corruption); -TRACE_EVENT(xfs_iwalk_ag, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t startino), - TP_ARGS(mp, agno, startino), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_agnumber_t, agno) - __field(xfs_agino_t, startino) - ), - TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->startino = startino; - ), - TP_printk("dev %d:%d agno 0x%x startino 0x%x", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->startino) -) - TRACE_EVENT(xfs_iwalk_ag_rec, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + TP_PROTO(const struct xfs_perag *pag, \ struct xfs_inobt_rec_incore *irec), - TP_ARGS(mp, agno, irec), + TP_ARGS(pag, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -4273,8 +4707,8 @@ TRACE_EVENT(xfs_iwalk_ag_rec, __field(uint64_t, freemask) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->startino = irec->ir_startino; __entry->freemask = irec->ir_free; ), @@ -4336,7 +4770,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot, TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __assign_str(name); - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->agbno = cur->bc_ag.afake->af_root; __entry->levels = cur->bc_ag.afake->af_levels; __entry->blocks = cur->bc_ag.afake->af_blocks; @@ -4451,7 +4885,7 @@ 
TRACE_EVENT(xfs_btree_bload_block, __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb); __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb); } else { - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->agbno = be32_to_cpu(ptr->s); } __entry->nr_records = nr_records; @@ -4676,35 +5110,39 @@ TRACE_EVENT(xfs_force_shutdown, ); #ifdef CONFIG_XFS_DRAIN_INTENTS -DECLARE_EVENT_CLASS(xfs_perag_intents_class, - TP_PROTO(struct xfs_perag *pag, void *caller_ip), - TP_ARGS(pag, caller_ip), +DECLARE_EVENT_CLASS(xfs_group_intents_class, + TP_PROTO(const struct xfs_group *xg, void *caller_ip), + TP_ARGS(xg, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_agnumber_t, agno) + __field(enum xfs_group_type, type) + __field(uint32_t, index) __field(long, nr_intents) __field(void *, caller_ip) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; - __entry->nr_intents = atomic_read(&pag->pag_intents_drain.dr_count); + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->index = xg->xg_gno; + __entry->nr_intents = + atomic_read(&xg->xg_intents_drain.dr_count); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno 0x%x intents %ld caller %pS", + TP_printk("dev %d:%d %sno 0x%x intents %ld caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), + __entry->index, __entry->nr_intents, __entry->caller_ip) ); -#define DEFINE_PERAG_INTENTS_EVENT(name) \ -DEFINE_EVENT(xfs_perag_intents_class, name, \ - TP_PROTO(struct xfs_perag *pag, void *caller_ip), \ - TP_ARGS(pag, caller_ip)) -DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_hold); -DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_rele); -DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents); +#define DEFINE_GROUP_INTENTS_EVENT(name) \ +DEFINE_EVENT(xfs_group_intents_class, name, \ + TP_PROTO(const struct xfs_group *xg, void *caller_ip), \ + TP_ARGS(xg, caller_ip)) +DEFINE_GROUP_INTENTS_EVENT(xfs_group_intent_hold); +DEFINE_GROUP_INTENTS_EVENT(xfs_group_intent_rele); +DEFINE_GROUP_INTENTS_EVENT(xfs_group_wait_intents); #endif /* CONFIG_XFS_DRAIN_INTENTS */ @@ -4809,7 +5247,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class, __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; __entry->bno = xfs_buf_daddr(bp); __entry->nblks = bp->b_length; - __entry->hold = atomic_read(&bp->b_hold); + __entry->hold = bp->b_hold; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; @@ -5332,6 +5770,208 @@ DEFINE_EVENT(xfs_getparents_class, name, \ DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_begin); DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_end); +DECLARE_EVENT_CLASS(xfs_metadir_update_class, + TP_PROTO(const struct xfs_metadir_update *upd), + TP_ARGS(upd), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dp_ino) + __field(xfs_ino_t, ino) + __string(fname, upd->path) + ), + TP_fast_assign( + __entry->dev = upd->dp->i_mount->m_super->s_dev; + __entry->dp_ino = upd->dp->i_ino; + __entry->ino = upd->ip ? 
upd->ip->i_ino : NULLFSINO; + __assign_str(fname); + ), + TP_printk("dev %d:%d dp 0x%llx fname '%s' ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dp_ino, + __get_str(fname), + __entry->ino) +) + +#define DEFINE_METADIR_UPDATE_EVENT(name) \ +DEFINE_EVENT(xfs_metadir_update_class, name, \ + TP_PROTO(const struct xfs_metadir_update *upd), \ + TP_ARGS(upd)) +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_start_create); +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_start_link); +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_commit); +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_cancel); +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_try_create); +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_create); +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_link); + +DECLARE_EVENT_CLASS(xfs_metadir_update_error_class, + TP_PROTO(const struct xfs_metadir_update *upd, int error), + TP_ARGS(upd, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dp_ino) + __field(xfs_ino_t, ino) + __field(int, error) + __string(fname, upd->path) + ), + TP_fast_assign( + __entry->dev = upd->dp->i_mount->m_super->s_dev; + __entry->dp_ino = upd->dp->i_ino; + __entry->ino = upd->ip ? upd->ip->i_ino : NULLFSINO; + __entry->error = error; + __assign_str(fname); + ), + TP_printk("dev %d:%d dp 0x%llx fname '%s' ino 0x%llx error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dp_ino, + __get_str(fname), + __entry->ino, + __entry->error) +) + +#define DEFINE_METADIR_UPDATE_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_metadir_update_error_class, name, \ + TP_PROTO(const struct xfs_metadir_update *upd, int error), \ + TP_ARGS(upd, error)) +DEFINE_METADIR_UPDATE_ERROR_EVENT(xfs_metadir_teardown); + +DECLARE_EVENT_CLASS(xfs_metadir_class, + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name, + xfs_ino_t ino), + TP_ARGS(dp, name, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dp_ino) + __field(xfs_ino_t, ino) + __field(int, ftype) + __field(int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(dp)->i_sb->s_dev; + __entry->dp_ino = dp->i_ino; + __entry->ino = ino; + __entry->ftype = name->type; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dir 0x%llx type %s name '%.*s' ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dp_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __entry->namelen, + __get_str(name), + __entry->ino) +) + +#define DEFINE_METADIR_EVENT(name) \ +DEFINE_EVENT(xfs_metadir_class, name, \ + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name, \ + xfs_ino_t ino), \ + TP_ARGS(dp, name, ino)) +DEFINE_METADIR_EVENT(xfs_metadir_lookup); + +/* metadata inode space reservations */ + +DECLARE_EVENT_CLASS(xfs_metafile_resv_class, + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), + TP_ARGS(mp, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long long, freeblks) + __field(unsigned long long, reserved) + __field(unsigned long long, asked) + __field(unsigned long long, used) + __field(unsigned long long, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); + __entry->reserved = mp->m_metafile_resv_avail; + __entry->asked = mp->m_metafile_resv_target; + __entry->used = mp->m_metafile_resv_used; + __entry->len = len; + ), + TP_printk("dev %d:%d freeblks %llu resv %llu ask %llu used %llu len %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->freeblks,
__entry->reserved, + __entry->asked, + __entry->used, + __entry->len) +) +#define DEFINE_METAFILE_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_metafile_resv_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), \ + TP_ARGS(mp, len)) +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init); +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free); +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space); +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space); +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical); +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init_error); + +#ifdef CONFIG_XFS_RT +TRACE_EVENT(xfs_growfs_check_rtgeom, + TP_PROTO(const struct xfs_mount *mp, unsigned int min_logfsbs), + TP_ARGS(mp, min_logfsbs), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, logblocks) + __field(unsigned int, min_logfsbs) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->logblocks = mp->m_sb.sb_logblocks; + __entry->min_logfsbs = min_logfsbs; + ), + TP_printk("dev %d:%d logblocks %u min_logfsbs %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->logblocks, + __entry->min_logfsbs) +); +#endif /* CONFIG_XFS_RT */ + +TRACE_DEFINE_ENUM(XC_FREE_BLOCKS); +TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS); +TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE); + +DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class, + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, + uint64_t delta, unsigned long caller_ip), + TP_ARGS(mp, ctr, delta, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(enum xfs_free_counter, ctr) + __field(uint64_t, delta) + __field(uint64_t, avail) + __field(uint64_t, total) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ctr = ctr; + __entry->delta = delta; + __entry->avail = mp->m_free[ctr].res_avail; + __entry->total = mp->m_free[ctr].res_total; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ctr %s delta %llu avail %llu total %llu caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->ctr, XFS_FREECOUNTER_STR), + __entry->delta, + __entry->avail, + __entry->total, + (char *)__entry->caller_ip) +) +#define DEFINE_FREEBLOCKS_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_freeblocks_resv_class, name, \ + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, \ + uint64_t delta, unsigned long caller_ip), \ + TP_ARGS(mp, ctr, delta, caller_ip)) +DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved); +DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index bdf3704dc301..c6657072361a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -25,6 +25,8 @@ #include "xfs_dquot.h" #include "xfs_icache.h" #include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_sb.h" struct kmem_cache *xfs_trans_cache; @@ -67,7 +69,7 @@ xfs_trans_free( struct xfs_trans *tp) { xfs_extent_busy_sort(&tp->t_busy); - xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); + xfs_extent_busy_clear(&tp->t_busy, false); trace_xfs_trans_free(tp, _RET_IP_); xfs_trans_clear_context(tp); @@ -98,7 +100,6 @@ xfs_trans_dup( /* * Initialize the new transaction structure. 
*/ - ntp->t_magic = XFS_TRANS_HEADER_MAGIC; ntp->t_mountp = tp->t_mountp; INIT_LIST_HEAD(&ntp->t_items); INIT_LIST_HEAD(&ntp->t_busy); @@ -273,7 +274,6 @@ retry: ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || xfs_has_lazysbcount(mp)); - tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; tp->t_mountp = mp; INIT_LIST_HEAD(&tp->t_items); @@ -420,6 +420,8 @@ xfs_trans_mod_sb( ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res); } tp->t_frextents_delta += delta; + if (xfs_has_rtgroups(mp)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_RES_FREXTENTS: /* @@ -429,6 +431,8 @@ xfs_trans_mod_sb( */ ASSERT(delta < 0); tp->t_res_frextents_delta += delta; + if (xfs_has_rtgroups(mp)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_DBLOCKS: tp->t_dblocks_delta += delta; @@ -455,6 +459,10 @@ xfs_trans_mod_sb( case XFS_TRANS_SB_REXTSLOG: tp->t_rextslog_delta += delta; break; + case XFS_TRANS_SB_RGCOUNT: + ASSERT(delta > 0); + tp->t_rgcount_delta += delta; + break; default: ASSERT(0); return; @@ -497,20 +505,22 @@ xfs_trans_apply_sb_deltas( } /* - * Updating frextents requires careful handling because it does not - * behave like the lazysb counters because we cannot rely on log - * recovery in older kenels to recompute the value from the rtbitmap. - * This means that the ondisk frextents must be consistent with the - * rtbitmap. + * sb_frextents was added to the lazy sb counters when the rt groups + * feature was introduced. This is possible because we know that all + * kernels supporting rtgroups will also recompute frextents from the + * realtime bitmap. + * + * For older file systems, updating frextents requires careful handling + * because we cannot rely on log recovery in older kernels to recompute + * the value from the rtbitmap. This means that the ondisk frextents + * must be consistent with the rtbitmap. * * Therefore, log the frextents change to the ondisk superblock and * update the incore superblock so that future calls to xfs_log_sb * write the correct value ondisk. - * - * Don't touch m_frextents because it includes incore reservations, - * and those are handled by the unreserve function. */ - if (tp->t_frextents_delta || tp->t_res_frextents_delta) { + if ((tp->t_frextents_delta || tp->t_res_frextents_delta) && + !xfs_has_rtgroups(tp->t_mountp)) { struct xfs_mount *mp = tp->t_mountp; int64_t rtxdelta; @@ -536,6 +546,18 @@ xfs_trans_apply_sb_deltas( } if (tp->t_rextsize_delta) { be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta); + + /* + * Because the ondisk sb records rtgroup size in units of rt + * extents, any time we update the rt extent size we have to + * recompute the ondisk rtgroup block log. The incore values + * will be recomputed in xfs_trans_unreserve_and_mod_sb. 
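+ * (Doubling sb_rextsize doubles the group size in blocks, so the
+ * block log recomputed here goes up by one.)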
+ */ + if (xfs_has_rtgroups(tp->t_mountp)) { + sbp->sb_rgblklog = xfs_compute_rgblklog( + be32_to_cpu(sbp->sb_rgextents), + be32_to_cpu(sbp->sb_rextsize)); + } whole = 1; } if (tp->t_rbmblocks_delta) { @@ -554,6 +576,10 @@ xfs_trans_apply_sb_deltas( sbp->sb_rextslog += tp->t_rextslog_delta; whole = 1; } + if (tp->t_rgcount_delta) { + be32_add_cpu(&sbp->sb_rgcount, tp->t_rgcount_delta); + whole = 1; + } xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); if (whole) @@ -618,7 +644,7 @@ xfs_trans_unreserve_and_mod_sb( } ASSERT(tp->t_rtx_res || tp->t_frextents_delta >= 0); - if (tp->t_flags & XFS_TRANS_SB_DIRTY) { + if (xfs_has_rtgroups(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) { rtxdelta += tp->t_frextents_delta; ASSERT(rtxdelta >= 0); } @@ -651,23 +677,21 @@ xfs_trans_unreserve_and_mod_sb( mp->m_sb.sb_icount += idelta; mp->m_sb.sb_ifree += ifreedelta; /* - * Do not touch sb_frextents here because we are dealing with incore - * reservation. sb_frextents is not part of the lazy sb counters so it - * must be consistent with the ondisk rtbitmap and must never include - * incore reservations. + * Do not touch sb_frextents here because it is handled in + * xfs_trans_apply_sb_deltas for file systems where it isn't a lazy + * counter anyway. */ mp->m_sb.sb_dblocks += tp->t_dblocks_delta; mp->m_sb.sb_agcount += tp->t_agcount_delta; mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; - mp->m_sb.sb_rextsize += tp->t_rextsize_delta; - if (tp->t_rextsize_delta) { - mp->m_rtxblklog = log2_if_power2(mp->m_sb.sb_rextsize); - mp->m_rtxblkmask = mask64_if_power2(mp->m_sb.sb_rextsize); - } + if (tp->t_rextsize_delta) + xfs_mount_sb_set_rextsize(mp, &mp->m_sb, + mp->m_sb.sb_rextsize + tp->t_rextsize_delta); mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta; mp->m_sb.sb_rblocks += tp->t_rblocks_delta; mp->m_sb.sb_rextents += tp->t_rextents_delta; mp->m_sb.sb_rextslog += tp->t_rextslog_delta; + mp->m_sb.sb_rgcount += tp->t_rgcount_delta; spin_unlock(&mp->m_sb_lock); /* @@ -834,29 +858,17 @@ __xfs_trans_commit( trace_xfs_trans_commit(tp, _RET_IP_); - error = xfs_trans_run_precommits(tp); - if (error) { - if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) - xfs_defer_cancel(tp); - goto out_unreserve; - } - /* - * Finish deferred items on final commit. Only permanent transactions - * should ever have deferred ops. + * Commit per-transaction changes that are not already tracked through + * log items. This can add dirty log items to the transaction. */ - WARN_ON_ONCE(!list_empty(&tp->t_dfops) && - !(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); - if (!regrant && (tp->t_flags & XFS_TRANS_PERM_LOG_RES)) { - error = xfs_defer_finish_noroll(&tp); - if (error) - goto out_unreserve; + if (tp->t_flags & XFS_TRANS_SB_DIRTY) + xfs_trans_apply_sb_deltas(tp); + xfs_trans_apply_dquot_deltas(tp); - /* Run precommits from final tx in defer chain. */ - error = xfs_trans_run_precommits(tp); - if (error) - goto out_unreserve; - } + error = xfs_trans_run_precommits(tp); + if (error) + goto out_unreserve; /* * If there is nothing to be logged by the transaction, @@ -881,13 +893,6 @@ __xfs_trans_commit( ASSERT(tp->t_ticket != NULL); - /* - * If we need to update the superblock, then do it now. - */ - if (tp->t_flags & XFS_TRANS_SB_DIRTY) - xfs_trans_apply_sb_deltas(tp); - xfs_trans_apply_dquot_deltas(tp); - xlog_cil_commit(log, tp, &commit_seq, regrant); xfs_trans_free(tp); @@ -913,7 +918,7 @@ out_unreserve: * the dqinfo portion to be. All that means is that we have some * (non-persistent) quota reservations that need to be unreserved. 
*/ - xfs_trans_unreserve_and_mod_dquots(tp); + xfs_trans_unreserve_and_mod_dquots(tp, true); if (tp->t_ticket) { if (regrant && !xlog_is_shutdown(log)) xfs_log_ticket_regrant(log, tp->t_ticket); @@ -932,6 +937,20 @@ int xfs_trans_commit( struct xfs_trans *tp) { + /* + * Finish deferred items on final commit. Only permanent transactions + * should ever have deferred ops. + */ + WARN_ON_ONCE(!list_empty(&tp->t_dfops) && + !(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); + if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) { + int error = xfs_defer_finish_noroll(&tp); + if (error) { + xfs_trans_cancel(tp); + return error; + } + } + return __xfs_trans_commit(tp, false); } @@ -993,7 +1012,7 @@ xfs_trans_cancel( } #endif xfs_trans_unreserve_and_mod_sb(tp); - xfs_trans_unreserve_and_mod_dquots(tp); + xfs_trans_unreserve_and_mod_dquots(tp, false); if (tp->t_ticket) { xfs_log_ticket_ungrant(log, tp->t_ticket); @@ -1245,6 +1264,9 @@ retry: xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + if (xfs_is_metadir_inode(ip)) + goto out; + error = xfs_qm_dqattach_locked(ip, false); if (error) { /* Caller should have allocated the dquots! */ @@ -1262,11 +1284,26 @@ retry: gdqp = (new_gdqp != ip->i_gdquot) ? new_gdqp : NULL; pdqp = (new_pdqp != ip->i_pdquot) ? new_pdqp : NULL; if (udqp || gdqp || pdqp) { + xfs_filblks_t dblocks, rblocks; unsigned int qflags = XFS_QMOPT_RES_REGBLKS; + bool isrt = XFS_IS_REALTIME_INODE(ip); if (force) qflags |= XFS_QMOPT_FORCE_RES; + if (isrt) { + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); + if (error) + goto out_cancel; + } + + xfs_inode_count_blocks(tp, ip, &dblocks, &rblocks); + + if (isrt) + rblocks += ip->i_delayed_blks; + else + dblocks += ip->i_delayed_blks; + /* * Reserve enough quota to handle blocks on disk and reserved * for a delayed allocation. We'll actually transfer the @@ -1274,8 +1311,20 @@ retry: * though that part is only semi-transactional. */ error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, - pdqp, ip->i_nblocks + ip->i_delayed_blks, - 1, qflags); + pdqp, dblocks, 1, qflags); + if ((error == -EDQUOT || error == -ENOSPC) && !retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0); + retried = true; + goto retry; + } + if (error) + goto out_cancel; + + /* Do the same for realtime. */ + qflags = XFS_QMOPT_RES_RTBLKS | (qflags & XFS_QMOPT_FORCE_RES); + error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, + pdqp, rblocks, 0, qflags); if ((error == -EDQUOT || error == -ENOSPC) && !retried) { xfs_trans_cancel(tp); xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0); @@ -1286,6 +1335,7 @@ retry: goto out_cancel; } +out: *tpp = tp; return 0; @@ -1382,5 +1432,8 @@ done: out_cancel: xfs_trans_cancel(tp); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + if (dp != ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f06cc0f41665..2b366851e9a4 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -122,7 +122,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, * This is the structure maintained for every active transaction. 
*/ typedef struct xfs_trans { - unsigned int t_magic; /* magic number */ unsigned int t_log_res; /* amt of log space resvd */ unsigned int t_log_count; /* count for perm log res */ unsigned int t_blk_res; /* # of blocks resvd */ @@ -148,6 +147,7 @@ typedef struct xfs_trans { int64_t t_rblocks_delta;/* superblock rblocks change */ int64_t t_rextents_delta;/* superblocks rextents chg */ int64_t t_rextslog_delta;/* superblocks rextslog chg */ + int64_t t_rgcount_delta; /* realtime group count */ struct list_head t_items; /* log item descriptors */ struct list_head t_busy; /* list of busy extents */ struct list_head t_dfops; /* deferred operations */ @@ -214,6 +214,7 @@ xfs_trans_read_buf( } struct xfs_buf *xfs_trans_getsb(struct xfs_trans *); +struct xfs_buf *xfs_trans_getrtsb(struct xfs_trans *tp); void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 8ede9d099d1f..67c328d23e4a 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -315,7 +315,7 @@ xfs_ail_splice( } /* - * Delete the given item from the AIL. Return a pointer to the item. + * Delete the given item from the AIL. */ static void xfs_ail_delete( @@ -359,13 +359,8 @@ xfsaild_resubmit_item( } /* protected by ail_lock */ - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (bp->b_flags & _XBF_INODES) - clear_bit(XFS_LI_FAILED, &lip->li_flags); - else - xfs_clear_li_failed(lip); - } - + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) + clear_bit(XFS_LI_FAILED, &lip->li_flags); xfs_buf_unlock(bp); return XFS_ITEM_SUCCESS; } @@ -782,26 +777,28 @@ xfs_ail_update_finish( } /* - * xfs_trans_ail_update - bulk AIL insertion operation. + * xfs_trans_ail_update_bulk - bulk AIL insertion operation. * - * @xfs_trans_ail_update takes an array of log items that all need to be + * @xfs_trans_ail_update_bulk takes an array of log items that all need to be * positioned at the same LSN in the AIL. If an item is not in the AIL, it will - * be added. Otherwise, it will be repositioned by removing it and re-adding - * it to the AIL. If we move the first item in the AIL, update the log tail to - * match the new minimum LSN in the AIL. + * be added. Otherwise, it will be repositioned by removing it and re-adding + * it to the AIL. + * + * If we move the first item in the AIL, update the log tail to match the new + * minimum LSN in the AIL. * - * This function takes the AIL lock once to execute the update operations on - * all the items in the array, and as such should not be called with the AIL - * lock held. As a result, once we have the AIL lock, we need to check each log - * item LSN to confirm it needs to be moved forward in the AIL. + * This function should be called with the AIL lock held. * - * To optimise the insert operation, we delete all the items from the AIL in - * the first pass, moving them into a temporary list, then splice the temporary - * list into the correct position in the AIL. This avoids needing to do an - * insert operation on every item. + * To optimise the insert operation, we add all items to a temporary list, then + * splice this list into the correct position in the AIL. * - * This function must be called with the AIL lock held. The lock is dropped - * before returning. + * Items that are already in the AIL are first deleted from their current + * location before being added to the temporary list. + * + * This avoids needing to do an insert operation on every item. 
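A minimal sketch of the delete-then-splice bulk insertion the AIL comment above describes, using the kernel's list primitives; the demo_* type and helper are hypothetical stand-ins, not the real AIL structures.

#include <linux/list.h>
#include <linux/types.h>

/* Hypothetical item; the real AIL uses struct xfs_log_item. */
struct demo_item {
	struct list_head	li;
	u64			lsn;
};

/*
 * Unlink items from their old position (if any), batch them on a private
 * list, then splice that list in front of @cursor with a single operation
 * instead of nr individual sorted inserts. Assumes items are removed with
 * list_del_init() so list_empty() means "not currently in the list".
 */
static void demo_bulk_reposition(struct demo_item **items, int nr,
		struct list_head *cursor)
{
	LIST_HEAD(tmp);
	int i;

	for (i = 0; i < nr; i++) {
		if (!list_empty(&items[i]->li))
			list_del_init(&items[i]->li);
		list_add_tail(&items[i]->li, &tmp);
	}
	list_splice_tail(&tmp, cursor);
}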
+ * + * The AIL lock is dropped by xfs_ail_update_finish() before returning to + * the caller. */ void xfs_trans_ail_update_bulk( @@ -914,10 +911,9 @@ xfs_trans_ail_delete( return; } - /* xfs_ail_update_finish() drops the AIL lock */ - xfs_clear_li_failed(lip); + clear_bit(XFS_LI_FAILED, &lip->li_flags); tail_lsn = xfs_ail_delete_one(ailp, lip); - xfs_ail_update_finish(ailp, tail_lsn); + xfs_ail_update_finish(ailp, tail_lsn); /* drops the AIL lock */ } int diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index e28ab74af4f0..53af546c0b23 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -168,12 +168,11 @@ xfs_trans_get_buf_map( /* * Get and lock the superblock buffer for the given transaction. */ -struct xfs_buf * -xfs_trans_getsb( - struct xfs_trans *tp) +static struct xfs_buf * +__xfs_trans_getsb( + struct xfs_trans *tp, + struct xfs_buf *bp) { - struct xfs_buf *bp = tp->t_mountp->m_sb_bp; - /* * Just increment the lock recursion count if the buffer is already * attached to this transaction. @@ -197,6 +196,22 @@ xfs_trans_getsb( return bp; } +struct xfs_buf * +xfs_trans_getsb( + struct xfs_trans *tp) +{ + return __xfs_trans_getsb(tp, tp->t_mountp->m_sb_bp); +} + +struct xfs_buf * +xfs_trans_getrtsb( + struct xfs_trans *tp) +{ + if (!tp->t_mountp->m_rtsb_bp) + return NULL; + return __xfs_trans_getsb(tp, tp->t_mountp->m_rtsb_bp); +} + /* * Get and lock the buffer for the caller if it is not already * locked within the given transaction. If it has not yet been @@ -644,7 +659,7 @@ xfs_trans_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_BUF; - bp->b_flags |= _XBF_INODES; + bp->b_iodone = xfs_buf_inode_iodone; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } @@ -669,7 +684,7 @@ xfs_trans_stale_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_STALE_INODE; - bp->b_flags |= _XBF_INODES; + bp->b_iodone = xfs_buf_inode_iodone; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } @@ -694,7 +709,7 @@ xfs_trans_inode_alloc_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; - bp->b_flags |= _XBF_INODES; + bp->b_iodone = xfs_buf_inode_iodone; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } @@ -805,6 +820,6 @@ xfs_trans_dquot_buf( break; } - bp->b_flags |= _XBF_DQUOTS; + bp->b_iodone = xfs_buf_dquot_iodone; xfs_trans_buf_set_type(tp, bp, type); } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index b368e13424c4..765456bf3428 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -156,6 +156,9 @@ xfs_trans_mod_ino_dquot( unsigned int field, int64_t delta) { + if (xfs_is_metadir_inode(ip)) + return; + xfs_trans_mod_dquot(tp, dqp, field, delta); if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) { @@ -244,7 +247,8 @@ xfs_trans_mod_dquot_byino( xfs_mount_t *mp = tp->t_mountp; if (!XFS_IS_QUOTA_ON(mp) || - xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) + xfs_is_quota_inode(&mp->m_sb, ip->i_ino) || + xfs_is_metadir_inode(ip)) return; if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) @@ -602,6 +606,24 @@ xfs_trans_apply_dquot_deltas( ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count); ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count); ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count); + + /* + * We've applied the count changes and given back + * whatever reservation we didn't use. Zero out the + * dqtrx fields. 
+ */ + qtrx->qt_blk_res = 0; + qtrx->qt_bcount_delta = 0; + qtrx->qt_delbcnt_delta = 0; + + qtrx->qt_rtblk_res = 0; + qtrx->qt_rtblk_res_used = 0; + qtrx->qt_rtbcount_delta = 0; + qtrx->qt_delrtb_delta = 0; + + qtrx->qt_ino_res = 0; + qtrx->qt_ino_res_used = 0; + qtrx->qt_icount_delta = 0; } } } @@ -638,7 +660,8 @@ xfs_trans_unreserve_and_mod_dquots_hook( */ void xfs_trans_unreserve_and_mod_dquots( - struct xfs_trans *tp) + struct xfs_trans *tp, + bool already_locked) { int i, j; struct xfs_dquot *dqp; @@ -667,10 +690,12 @@ xfs_trans_unreserve_and_mod_dquots( * about the number of blocks used field, or deltas. * Also we don't bother to zero the fields. */ - locked = false; + locked = already_locked; if (qtrx->qt_blk_res) { - xfs_dqlock(dqp); - locked = true; + if (!locked) { + xfs_dqlock(dqp); + locked = true; + } dqp->q_blk.reserved -= (xfs_qcnt_t)qtrx->qt_blk_res; } @@ -691,7 +716,7 @@ xfs_trans_unreserve_and_mod_dquots( dqp->q_rtb.reserved -= (xfs_qcnt_t)qtrx->qt_rtblk_res; } - if (locked) + if (locked && !already_locked) xfs_dqunlock(dqp); } @@ -962,6 +987,8 @@ xfs_trans_reserve_quota_nblks( if (!XFS_IS_QUOTA_ON(mp)) return 0; + if (xfs_is_metadir_inode(ip)) + return 0; ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); @@ -1025,3 +1052,14 @@ xfs_trans_free_dqinfo( kmem_cache_free(xfs_dqtrx_cache, tp->t_dqinfo); tp->t_dqinfo = NULL; } + +int +xfs_quota_reserve_blkres( + struct xfs_inode *ip, + int64_t blocks) +{ + if (XFS_IS_REALTIME_INODE(ip)) + return xfs_trans_reserve_quota_nblks(NULL, ip, 0, blocks, + false); + return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); +} diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index bd841df93021..f945f0450b16 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -167,32 +167,4 @@ xfs_trans_ail_copy_lsn( } #endif -static inline void -xfs_clear_li_failed( - struct xfs_log_item *lip) -{ - struct xfs_buf *bp = lip->li_buf; - - ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); - lockdep_assert_held(&lip->li_ailp->ail_lock); - - if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) { - lip->li_buf = NULL; - xfs_buf_rele(bp); - } -} - -static inline void -xfs_set_li_failed( - struct xfs_log_item *lip, - struct xfs_buf *bp) -{ - lockdep_assert_held(&lip->li_ailp->ail_lock); - - if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) { - xfs_buf_hold(bp); - lip->li_buf = bp; - } -} - #endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index eaf849260bd6..0f641a9091ec 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -51,8 +51,7 @@ xfs_attr_grab_log_assist( return error; xfs_set_using_logged_xattrs(mp); - xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP, - "EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!"); + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_LARP); return 0; } diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c new file mode 100644 index 000000000000..80add26c0111 --- /dev/null +++ b/fs/xfs/xfs_zone_alloc.c @@ -0,0 +1,1336 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 
+ */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_error.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_iomap.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_refcount.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" +#include "xfs_trace.h" +#include "xfs_mru_cache.h" + +void +xfs_open_zone_put( + struct xfs_open_zone *oz) +{ + if (atomic_dec_and_test(&oz->oz_ref)) { + xfs_rtgroup_rele(oz->oz_rtg); + kfree(oz); + } +} + +static inline uint32_t +xfs_zone_bucket( + struct xfs_mount *mp, + uint32_t used_blocks) +{ + return XFS_ZONE_USED_BUCKETS * used_blocks / + mp->m_groups[XG_TYPE_RTG].blocks; +} + +static inline void +xfs_zone_add_to_bucket( + struct xfs_zone_info *zi, + xfs_rgnumber_t rgno, + uint32_t to_bucket) +{ + __set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]); + zi->zi_used_bucket_entries[to_bucket]++; +} + +static inline void +xfs_zone_remove_from_bucket( + struct xfs_zone_info *zi, + xfs_rgnumber_t rgno, + uint32_t from_bucket) +{ + __clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]); + zi->zi_used_bucket_entries[from_bucket]--; +} + +static void +xfs_zone_account_reclaimable( + struct xfs_rtgroup *rtg, + uint32_t freed) +{ + struct xfs_group *xg = &rtg->rtg_group; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; + xfs_rgnumber_t rgno = rtg_rgno(rtg); + uint32_t from_bucket = xfs_zone_bucket(mp, used + freed); + uint32_t to_bucket = xfs_zone_bucket(mp, used); + bool was_full = (used + freed == rtg_blocks(rtg)); + + /* + * This can be called from log recovery, where the zone_info structure + * hasn't been allocated yet. Skip all work as xfs_mount_zones will + * add the zones to the right buckets before the file systems becomes + * active. + */ + if (!zi) + return; + + if (!used) { + /* + * The zone is now empty, remove it from the bottom bucket and + * trigger a reset. + */ + trace_xfs_zone_emptied(rtg); + + if (!was_full) + xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE); + + spin_lock(&zi->zi_used_buckets_lock); + if (!was_full) + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + + spin_lock(&zi->zi_reset_list_lock); + xg->xg_next_reset = zi->zi_reset_list; + zi->zi_reset_list = xg; + spin_unlock(&zi->zi_reset_list_lock); + + if (zi->zi_gc_thread) + wake_up_process(zi->zi_gc_thread); + } else if (was_full) { + /* + * The zone transitioned from full, mark it up as reclaimable + * and wake up GC which might be waiting for zones to reclaim. + */ + spin_lock(&zi->zi_used_buckets_lock); + xfs_zone_add_to_bucket(zi, rgno, to_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + + xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE); + if (zi->zi_gc_thread && xfs_zoned_need_gc(mp)) + wake_up_process(zi->zi_gc_thread); + } else if (to_bucket != from_bucket) { + /* + * Move the zone to a new bucket if it dropped below the + * threshold. 
+ */ + spin_lock(&zi->zi_used_buckets_lock); + xfs_zone_add_to_bucket(zi, rgno, to_bucket); + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + } +} + +static void +xfs_open_zone_mark_full( + struct xfs_open_zone *oz) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; + + trace_xfs_zone_full(rtg); + + WRITE_ONCE(rtg->rtg_open_zone, NULL); + + spin_lock(&zi->zi_open_zones_lock); + if (oz->oz_is_gc) { + ASSERT(current == zi->zi_gc_thread); + zi->zi_open_gc_zone = NULL; + } else { + zi->zi_nr_open_zones--; + list_del_init(&oz->oz_entry); + } + spin_unlock(&zi->zi_open_zones_lock); + xfs_open_zone_put(oz); + + wake_up_all(&zi->zi_zone_wait); + if (used < rtg_blocks(rtg)) + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); +} + +static void +xfs_zone_record_blocks( + struct xfs_trans *tp, + xfs_fsblock_t fsbno, + xfs_filblks_t len, + struct xfs_open_zone *oz, + bool used) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_inode *rmapip = rtg_rmap(rtg); + + trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + if (used) { + rmapip->i_used_blocks += len; + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); + } else { + xfs_add_frextents(mp, len); + } + oz->oz_written += len; + if (oz->oz_written == rtg_blocks(rtg)) + xfs_open_zone_mark_full(oz); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); +} + +static int +xfs_zoned_map_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_bmbt_irec *new, + struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock) +{ + struct xfs_bmbt_irec data; + int nmaps = 1; + int error; + + /* Grab the corresponding mapping in the data fork. */ + error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data, + &nmaps, 0); + if (error) + return error; + + /* + * Cap the update to the existing extent in the data fork because we can + * only overwrite one extent at a time. + */ + ASSERT(new->br_blockcount >= data.br_blockcount); + new->br_blockcount = data.br_blockcount; + + /* + * If a data write raced with this GC write, keep the existing data in + * the data fork, mark our newly written GC extent as reclaimable, then + * move on to the next extent. + */ + if (old_startblock != NULLFSBLOCK && + old_startblock != data.br_startblock) + goto skip; + + trace_xfs_reflink_cow_remap_from(ip, new); + trace_xfs_reflink_cow_remap_to(ip, &data); + + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, + XFS_IEXT_REFLINK_END_COW_CNT); + if (error) + return error; + + if (data.br_startblock != HOLESTARTBLOCK) { + ASSERT(data.br_startblock != DELAYSTARTBLOCK); + ASSERT(!isnullstartblock(data.br_startblock)); + + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); + if (xfs_is_reflink_inode(ip)) { + xfs_refcount_decrease_extent(tp, true, &data); + } else { + error = xfs_free_extent_later(tp, data.br_startblock, + data.br_blockcount, NULL, + XFS_AG_RESV_NONE, + XFS_FREE_EXTENT_REALTIME); + if (error) + return error; + } + } + + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, + true); + + /* Map the new blocks into the data fork. 
*/ + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); + return 0; + +skip: + trace_xfs_reflink_cow_remap_skip(ip, new); + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, + false); + return 0; +} + +int +xfs_zoned_end_io( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count, + xfs_daddr_t daddr, + struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + struct xfs_bmbt_irec new = { + .br_startoff = XFS_B_TO_FSBT(mp, offset), + .br_startblock = xfs_daddr_to_rtb(mp, daddr), + .br_state = XFS_EXT_NORM, + }; + unsigned int resblks = + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + struct xfs_trans *tp; + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + + while (new.br_startoff < end_fsb) { + new.br_blockcount = end_fsb - new.br_startoff; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock); + if (error) + xfs_trans_cancel(tp); + else + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + new.br_startoff += new.br_blockcount; + new.br_startblock += new.br_blockcount; + if (old_startblock != NULLFSBLOCK) + old_startblock += new.br_blockcount; + } + + return 0; +} + +/* + * "Free" blocks allocated in a zone. + * + * Just decrement the used blocks counter and report the space as freed. + */ +int +xfs_zone_free_blocks( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg, + xfs_fsblock_t fsbno, + xfs_filblks_t len) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *rmapip = rtg_rmap(rtg); + + xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL); + + if (len > rmapip->i_used_blocks) { + xfs_err(mp, +"trying to free more blocks (%lld) than used counter (%u).", + len, rmapip->i_used_blocks); + ASSERT(len <= rmapip->i_used_blocks); + xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return -EFSCORRUPTED; + } + + trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len); + + rmapip->i_used_blocks -= len; + /* + * Don't add open zones to the reclaimable buckets. The I/O completion + * for writing the last block will take care of accounting for already + * unused blocks instead. + */ + if (!READ_ONCE(rtg->rtg_open_zone)) + xfs_zone_account_reclaimable(rtg, len); + xfs_add_frextents(mp, len); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); + return 0; +} + +/* + * Check if the zone containing the data just before the offset we are + * writing to is still open and has space. 
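Stepping back to xfs_zoned_end_io() above: it commits one small transaction per remapped extent and advances its cursor only by however much xfs_zoned_map_extent() actually accepted. A self-contained sketch of that cursor pattern, with a hypothetical demo_map_one() standing in for the per-extent work:

#include <stdint.h>

/* Hypothetical stand-in: trims *len to one "extent" and remaps it. */
static int demo_map_one(uint64_t start, uint64_t *len)
{
	(void)start;
	if (*len > 16)		/* pretend extents span at most 16 blocks */
		*len = 16;
	return 0;
}

static int demo_remap_range(uint64_t start, uint64_t end)
{
	while (start < end) {
		uint64_t len = end - start;
		int error = demo_map_one(start, &len);

		if (error)
			return error;
		start += len;	/* advance only by what was remapped */
	}
	return 0;
}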
+ */ +static struct xfs_open_zone * +xfs_last_used_zone( + struct iomap_ioend *ioend) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset); + struct xfs_rtgroup *rtg = NULL; + struct xfs_open_zone *oz = NULL; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb, + &icur, &got)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return NULL; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock)); + if (!rtg) + return NULL; + + xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED); + oz = READ_ONCE(rtg->rtg_open_zone); + if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref))) + oz = NULL; + xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED); + + xfs_rtgroup_rele(rtg); + return oz; +} + +static struct xfs_group * +xfs_find_free_zone( + struct xfs_mount *mp, + unsigned long start, + unsigned long end) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, start); + struct xfs_group *xg; + + xas_lock(&xas); + xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE) + if (atomic_inc_not_zero(&xg->xg_active_ref)) + goto found; + xas_unlock(&xas); + return NULL; + +found: + xas_clear_mark(&xas, XFS_RTG_FREE); + atomic_dec(&zi->zi_nr_free_zones); + zi->zi_free_zone_cursor = xg->xg_gno; + xas_unlock(&xas); + return xg; +} + +static struct xfs_open_zone * +xfs_init_open_zone( + struct xfs_rtgroup *rtg, + xfs_rgblock_t write_pointer, + enum rw_hint write_hint, + bool is_gc) +{ + struct xfs_open_zone *oz; + + oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL); + spin_lock_init(&oz->oz_alloc_lock); + atomic_set(&oz->oz_ref, 1); + oz->oz_rtg = rtg; + oz->oz_write_pointer = write_pointer; + oz->oz_written = write_pointer; + oz->oz_write_hint = write_hint; + oz->oz_is_gc = is_gc; + + /* + * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap + * inode, but we don't really want to take that here because we are + * under the zone_list_lock. Ensure the pointer is only set for a fully + * initialized open zone structure so that a racy lookup finding it is + * fine. + */ + WRITE_ONCE(rtg->rtg_open_zone, oz); + return oz; +} + +/* + * Find a completely free zone, open it, and return a reference. + */ +struct xfs_open_zone * +xfs_open_zone( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool is_gc) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_group *xg; + + xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX); + if (!xg) + xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor); + if (!xg) + return NULL; + + set_current_state(TASK_RUNNING); + return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc); +} + +static struct xfs_open_zone * +xfs_try_open_zone( + struct xfs_mount *mp, + enum rw_hint write_hint) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz; + + if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES) + return NULL; + if (atomic_read(&zi->zi_nr_free_zones) < + XFS_GC_ZONES - XFS_OPEN_GC_ZONES) + return NULL; + + /* + * Increment the open zone count to reserve our slot before dropping + * zi_open_zones_lock. 
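That reserve-then-drop-the-lock step is a generic pattern: bump the counter while locked so concurrent openers see the slot as taken, do the blocking allocation unlocked, and undo the bump on failure. A hedged, generic sketch follows (demo_* names are made up, and unlike xfs_try_open_zone() it takes and releases the lock itself so the example is self-contained):

#include <linux/spinlock.h>

struct demo_zone;
struct demo_state {
	spinlock_t	lock;
	unsigned int	nr_open;
	unsigned int	max_open;
};

struct demo_zone *demo_open_new_zone(void);	/* hypothetical, may sleep */

/* Reserve the slot under the lock, allocate unlocked, undo on failure. */
static struct demo_zone *demo_try_open(struct demo_state *st)
{
	struct demo_zone *z;

	spin_lock(&st->lock);
	if (st->nr_open >= st->max_open) {
		spin_unlock(&st->lock);
		return NULL;
	}
	st->nr_open++;			/* our slot is now reserved */
	spin_unlock(&st->lock);

	z = demo_open_new_zone();	/* blocking work, lock dropped */

	spin_lock(&st->lock);
	if (!z)
		st->nr_open--;		/* roll the reservation back */
	spin_unlock(&st->lock);
	return z;
}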
+ */ + zi->zi_nr_open_zones++; + spin_unlock(&zi->zi_open_zones_lock); + oz = xfs_open_zone(mp, write_hint, false); + spin_lock(&zi->zi_open_zones_lock); + if (!oz) { + zi->zi_nr_open_zones--; + return NULL; + } + + atomic_inc(&oz->oz_ref); + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); + + /* + * If this was the last free zone, other waiters might be waiting + * on us to write to it as well. + */ + wake_up_all(&zi->zi_zone_wait); + + if (xfs_zoned_need_gc(mp)) + wake_up_process(zi->zi_gc_thread); + + trace_xfs_zone_opened(oz->oz_rtg); + return oz; +} + +/* + * For data with short or medium lifetime, try to colocate it into an + * already open zone with a matching temperature. + */ +static bool +xfs_colocate_eagerly( + enum rw_hint file_hint) +{ + switch (file_hint) { + case WRITE_LIFE_MEDIUM: + case WRITE_LIFE_SHORT: + case WRITE_LIFE_NONE: + return true; + default: + return false; + } +} + +static bool +xfs_good_hint_match( + struct xfs_open_zone *oz, + enum rw_hint file_hint) +{ + switch (oz->oz_write_hint) { + case WRITE_LIFE_LONG: + case WRITE_LIFE_EXTREME: + /* colocate long and extreme */ + if (file_hint == WRITE_LIFE_LONG || + file_hint == WRITE_LIFE_EXTREME) + return true; + break; + case WRITE_LIFE_MEDIUM: + /* colocate medium with medium */ + if (file_hint == WRITE_LIFE_MEDIUM) + return true; + break; + case WRITE_LIFE_SHORT: + case WRITE_LIFE_NONE: + case WRITE_LIFE_NOT_SET: + /* colocate short and none */ + if (file_hint <= WRITE_LIFE_SHORT) + return true; + break; + } + return false; +} + +static bool +xfs_try_use_zone( + struct xfs_zone_info *zi, + enum rw_hint file_hint, + struct xfs_open_zone *oz, + bool lowspace) +{ + if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + return false; + if (!lowspace && !xfs_good_hint_match(oz, file_hint)) + return false; + if (!atomic_inc_not_zero(&oz->oz_ref)) + return false; + + /* + * If we have a hint set for the data, use that for the zone even if + * some data was written already without any hint set, but don't change + * the temperature after that as that would make little sense without + * tracking per-temperature class written block counts, which is + * probably overkill anyway. + */ + if (file_hint != WRITE_LIFE_NOT_SET && + oz->oz_write_hint == WRITE_LIFE_NOT_SET) + oz->oz_write_hint = file_hint; + + /* + * If we couldn't match by inode or lifetime, we just pick the first + * zone with enough space above. For that we want the least busy zone + * for some definition of "least" busy. For now, this simple LRU + * algorithm that rotates every zone to the end of the list will do it, + * even if it isn't exactly cache friendly.
+ */ + if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones)) + list_move_tail(&oz->oz_entry, &zi->zi_open_zones); + return true; +} + +static struct xfs_open_zone * +xfs_select_open_zone_lru( + struct xfs_zone_info *zi, + enum rw_hint file_hint, + bool lowspace) +{ + struct xfs_open_zone *oz; + + lockdep_assert_held(&zi->zi_open_zones_lock); + + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) + if (xfs_try_use_zone(zi, file_hint, oz, lowspace)) + return oz; + + cond_resched_lock(&zi->zi_open_zones_lock); + return NULL; +} + +static struct xfs_open_zone * +xfs_select_open_zone_mru( + struct xfs_zone_info *zi, + enum rw_hint file_hint) +{ + struct xfs_open_zone *oz; + + lockdep_assert_held(&zi->zi_open_zones_lock); + + list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry) + if (xfs_try_use_zone(zi, file_hint, oz, false)) + return oz; + + cond_resched_lock(&zi->zi_open_zones_lock); + return NULL; +} + +static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip) +{ + if (xfs_has_nolifetime(ip->i_mount)) + return WRITE_LIFE_NOT_SET; + return VFS_I(ip)->i_write_hint; +} + +/* + * Try to pack inodes that are written back after they were closed tight instead + * of trying to open new zones for them or spread them to the least recently + * used zone. This optimizes the data layout for workloads that untar or copy + * a lot of small files. Right now this does not separate multiple such + * streams. + */ +static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) +{ + return !inode_is_open_for_write(VFS_I(ip)) && + !(ip->i_diflags & XFS_DIFLAG_APPEND); +} + +/* + * Pick a new zone for writes. + * + * If we aren't using up our budget of open zones just open a new one from the + * freelist. Else try to find one that matches the expected data lifetime. If + * we don't find one that is good pick any zone that is available. + */ +static struct xfs_open_zone * +xfs_select_zone_nowait( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool pack_tight) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz = NULL; + + if (xfs_is_shutdown(mp)) + return NULL; + + /* + * Try to fill up open zones with matching temperature if available. It + * is better to try to co-locate data when this is favorable, so we can + * activate empty zones when it is statistically better to separate + * data. + */ + spin_lock(&zi->zi_open_zones_lock); + if (xfs_colocate_eagerly(write_hint)) + oz = xfs_select_open_zone_lru(zi, write_hint, false); + else if (pack_tight) + oz = xfs_select_open_zone_mru(zi, write_hint); + if (oz) + goto out_unlock; + + /* + * See if we can open a new zone and use that. + */ + oz = xfs_try_open_zone(mp, write_hint); + if (oz) + goto out_unlock; + + /* + * Try to colocate cold data with other cold data if we failed to open a + * new zone for it. 
+ */ + if (write_hint != WRITE_LIFE_NOT_SET && + !xfs_colocate_eagerly(write_hint)) + oz = xfs_select_open_zone_lru(zi, write_hint, false); + if (!oz) + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false); + if (!oz) + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true); +out_unlock: + spin_unlock(&zi->zi_open_zones_lock); + return oz; +} + +static struct xfs_open_zone * +xfs_select_zone( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool pack_tight) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + DEFINE_WAIT (wait); + struct xfs_open_zone *oz; + + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); + if (oz) + return oz; + + for (;;) { + prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE); + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); + if (oz) + break; + schedule(); + } + finish_wait(&zi->zi_zone_wait, &wait); + return oz; +} + +static unsigned int +xfs_zone_alloc_blocks( + struct xfs_open_zone *oz, + xfs_filblks_t count_fsb, + sector_t *sector, + bool *is_seq) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rgblock_t rgbno; + + spin_lock(&oz->oz_alloc_lock); + count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN, + (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer); + if (!count_fsb) { + spin_unlock(&oz->oz_alloc_lock); + return 0; + } + rgbno = oz->oz_write_pointer; + oz->oz_write_pointer += count_fsb; + spin_unlock(&oz->oz_alloc_lock); + + trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb); + + *sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector); + if (!*is_seq) + *sector += XFS_FSB_TO_BB(mp, rgbno); + return XFS_FSB_TO_B(mp, count_fsb); +} + +void +xfs_mark_rtg_boundary( + struct iomap_ioend *ioend) +{ + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + sector_t sector = ioend->io_bio.bi_iter.bi_sector; + + if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0) + ioend->io_flags |= IOMAP_IOEND_BOUNDARY; +} + +static void +xfs_submit_zoned_bio( + struct iomap_ioend *ioend, + struct xfs_open_zone *oz, + bool is_seq) +{ + ioend->io_bio.bi_iter.bi_sector = ioend->io_sector; + ioend->io_private = oz; + atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */ + + if (is_seq) { + ioend->io_bio.bi_opf &= ~REQ_OP_WRITE; + ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND; + } else { + xfs_mark_rtg_boundary(ioend); + } + + submit_bio(&ioend->io_bio); +} + +/* + * Cache the last zone written to for an inode so that it is considered first + * for subsequent writes. + */ +struct xfs_zone_cache_item { + struct xfs_mru_cache_elem mru; + struct xfs_open_zone *oz; +}; + +static inline struct xfs_zone_cache_item * +xfs_zone_cache_item(struct xfs_mru_cache_elem *mru) +{ + return container_of(mru, struct xfs_zone_cache_item, mru); +} + +static void +xfs_zone_cache_free_func( + void *data, + struct xfs_mru_cache_elem *mru) +{ + struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru); + + xfs_open_zone_put(item->oz); + kfree(item); +} + +/* + * Check if we have a cached last open zone available for the inode and + * if yes return a reference to it. + */ +static struct xfs_open_zone * +xfs_cached_zone( + struct xfs_mount *mp, + struct xfs_inode *ip) +{ + struct xfs_mru_cache_elem *mru; + struct xfs_open_zone *oz; + + mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); + if (!mru) + return NULL; + oz = xfs_zone_cache_item(mru)->oz; + if (oz) { + /* + * GC only steals open zones at mount time, so no GC zones + * should end up in the cache. 
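Returning to xfs_select_zone() above: it uses the classic open-coded waitqueue idiom, re-checking the condition after prepare_to_wait() so a wakeup cannot be lost between the check and the sleep. The shape in isolation, as a hedged sketch (demo_try_get() is a hypothetical non-blocking attempt):

#include <linux/wait.h>
#include <linux/sched.h>

void *demo_try_get(void);	/* hypothetical non-blocking attempt */

/*
 * Once the wait entry is armed, a concurrent wake_up_all() sets us back
 * to TASK_RUNNING, so a wakeup between the re-check and schedule()
 * cannot be lost.
 */
static void *demo_wait_for_resource(wait_queue_head_t *wq)
{
	DEFINE_WAIT(wait);
	void *res;

	res = demo_try_get();
	while (!res) {
		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		res = demo_try_get();
		if (res)
			break;
		schedule();
	}
	finish_wait(wq, &wait);
	return res;
}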
+ */ + ASSERT(!oz->oz_is_gc); + ASSERT(atomic_read(&oz->oz_ref) > 0); + atomic_inc(&oz->oz_ref); + } + xfs_mru_cache_done(mp->m_zone_cache); + return oz; +} + +/* + * Update the last used zone cache for a given inode. + * + * The caller must have a reference on the open zone. + */ +static void +xfs_zone_cache_create_association( + struct xfs_inode *ip, + struct xfs_open_zone *oz) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_zone_cache_item *item = NULL; + struct xfs_mru_cache_elem *mru; + + ASSERT(atomic_read(&oz->oz_ref) > 0); + atomic_inc(&oz->oz_ref); + + mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); + if (mru) { + /* + * If we have an association already, update it to point to the + * new zone. + */ + item = xfs_zone_cache_item(mru); + xfs_open_zone_put(item->oz); + item->oz = oz; + xfs_mru_cache_done(mp->m_zone_cache); + return; + } + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) { + xfs_open_zone_put(oz); + return; + } + item->oz = oz; + xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru); +} + +void +xfs_zone_alloc_and_submit( + struct iomap_ioend *ioend, + struct xfs_open_zone **oz) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + enum rw_hint write_hint = xfs_inode_write_hint(ip); + bool pack_tight = xfs_zoned_pack_tight(ip); + unsigned int alloc_len; + struct iomap_ioend *split; + bool is_seq; + + if (xfs_is_shutdown(mp)) + goto out_error; + + /* + * If we don't have a cached zone in this write context, see if the + * last extent before the one we are writing to points to an active + * zone. If so, just continue writing to it. + */ + if (!*oz && ioend->io_offset) + *oz = xfs_last_used_zone(ioend); + if (!*oz) + *oz = xfs_cached_zone(mp, ip); + + if (!*oz) { +select_zone: + *oz = xfs_select_zone(mp, write_hint, pack_tight); + if (!*oz) + goto out_error; + + xfs_zone_cache_create_association(ip, *oz); + } + + alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), + &ioend->io_sector, &is_seq); + if (!alloc_len) { + xfs_open_zone_put(*oz); + goto select_zone; + } + + while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) { + if (IS_ERR(split)) + goto out_split_error; + alloc_len -= split->io_bio.bi_iter.bi_size; + xfs_submit_zoned_bio(split, *oz, is_seq); + if (!alloc_len) { + xfs_open_zone_put(*oz); + goto select_zone; + } + } + + xfs_submit_zoned_bio(ioend, *oz, is_seq); + return; + +out_split_error: + ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split)); +out_error: + bio_io_error(&ioend->io_bio); +} + +/* + * Wake up all threads waiting for a zoned space allocation when the file system + * is shut down. + */ +void +xfs_zoned_wake_all( + struct xfs_mount *mp) +{ + /* + * Don't wake up if there is no m_zone_info. This is complicated by the + * fact that unmount can't atomically clear m_zone_info and thus we need + * to check SB_ACTIVE for that, but mount temporarily enables SB_ACTIVE + * during log recovery so we can't entirely rely on that either. + */ + if ((mp->m_super->s_flags & SB_ACTIVE) && mp->m_zone_info) + wake_up_all(&mp->m_zone_info->zi_zone_wait); +} + +/* + * Check if @rgbno in @rtg is a potentially valid block. It might still be + * unused, but that information is only found in the rmap.
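The zone cache functions above follow the xfs_mru_cache calling convention: a successful lookup returns with the cache's internal lock held, so a private reference must be taken before xfs_mru_cache_done() drops it. Condensed into one sketch, using only the calls already shown in this patch:

/*
 * Condensed version of the lookup protocol used by xfs_cached_zone():
 * grab a private reference while the MRU cache's internal lock is held,
 * then release the lock with xfs_mru_cache_done().
 */
static struct xfs_open_zone *
demo_zone_cache_get(struct xfs_mru_cache *cache, unsigned long ino)
{
	struct xfs_mru_cache_elem *mru;
	struct xfs_open_zone *oz;

	mru = xfs_mru_cache_lookup(cache, ino);
	if (!mru)
		return NULL;
	oz = xfs_zone_cache_item(mru)->oz;
	atomic_inc(&oz->oz_ref);	/* reference that outlives done() */
	xfs_mru_cache_done(cache);
	return oz;
}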
+ */ +bool +xfs_zone_rgbno_is_valid( + struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgbno) +{ + lockdep_assert_held(&rtg_rmap(rtg)->i_lock); + + if (rtg->rtg_open_zone) + return rgbno < rtg->rtg_open_zone->oz_write_pointer; + return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa, + rtg_rgno(rtg), XFS_RTG_FREE); +} + +static void +xfs_free_open_zones( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz; + + spin_lock(&zi->zi_open_zones_lock); + while ((oz = list_first_entry_or_null(&zi->zi_open_zones, + struct xfs_open_zone, oz_entry))) { + list_del(&oz->oz_entry); + xfs_open_zone_put(oz); + } + spin_unlock(&zi->zi_open_zones_lock); +} + +struct xfs_init_zones { + struct xfs_mount *mp; + uint64_t available; + uint64_t reclaimable; +}; + +static int +xfs_init_zone( + struct xfs_init_zones *iz, + struct xfs_rtgroup *rtg, + struct blk_zone *zone) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint64_t used = rtg_rmap(rtg)->i_used_blocks; + xfs_rgblock_t write_pointer, highest_rgbno; + int error; + + if (zone && !xfs_zone_validate(zone, rtg, &write_pointer)) + return -EFSCORRUPTED; + + /* + * For sequential write required zones we retrieved the hardware write + * pointer above. + * + * For conventional zones or conventional devices we don't have that + * luxury. Instead query the rmap to find the highest recorded block + * and set the write pointer to the block after that. In case of a + * power loss this misses blocks where the data I/O has completed but + * has not been recorded in the rmap yet, and it also rewrites blocks if the most + * recently written ones got deleted again before unmount, but this is + * the best we can do without hardware support. + */ + if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); + if (highest_rgbno == NULLRGBLOCK) + write_pointer = 0; + else + write_pointer = highest_rgbno + 1; + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + } + + /* + * If there are no used blocks, but the zone is not in the empty state + * yet, we lost power before the zone reset. In that case finish the work + * here.
+ */ + if (write_pointer == rtg_blocks(rtg) && used == 0) { + error = xfs_zone_gc_reset_sync(rtg); + if (error) + return error; + write_pointer = 0; + } + + if (write_pointer == 0) { + /* zone is empty */ + atomic_inc(&zi->zi_nr_free_zones); + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); + iz->available += rtg_blocks(rtg); + } else if (write_pointer < rtg_blocks(rtg)) { + /* zone is open */ + struct xfs_open_zone *oz; + + atomic_inc(&rtg_group(rtg)->xg_active_ref); + oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET, + false); + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); + zi->zi_nr_open_zones++; + + iz->available += (rtg_blocks(rtg) - write_pointer); + iz->reclaimable += write_pointer - used; + } else if (used < rtg_blocks(rtg)) { + /* zone fully written, but has freed blocks */ + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); + iz->reclaimable += (rtg_blocks(rtg) - used); + } + + return 0; +} + +static int +xfs_get_zone_info_cb( + struct blk_zone *zone, + unsigned int idx, + void *data) +{ + struct xfs_init_zones *iz = data; + struct xfs_mount *mp = iz->mp; + xfs_fsblock_t zsbno = xfs_daddr_to_rtb(mp, zone->start); + xfs_rgnumber_t rgno; + struct xfs_rtgroup *rtg; + int error; + + if (xfs_rtb_to_rgbno(mp, zsbno) != 0) { + xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno); + return -EFSCORRUPTED; + } + + rgno = xfs_rtb_to_rgno(mp, zsbno); + rtg = xfs_rtgroup_grab(mp, rgno); + if (!rtg) { + xfs_warn(mp, "realtime group not found for zone %u.", rgno); + return -EFSCORRUPTED; + } + error = xfs_init_zone(iz, rtg, zone); + xfs_rtgroup_rele(rtg); + return error; +} + +/* + * Calculate the max open zone limit based on the number of + * backing zones available. + */ +static inline uint32_t +xfs_max_open_zones( + struct xfs_mount *mp) +{ + unsigned int max_open, max_open_data_zones; + /* + * We need two zones for every open data zone, + * one in reserve as we don't reclaim open zones. One data zone + * and its spare is included in XFS_MIN_ZONES. + */ + max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1; + max_open = max_open_data_zones + XFS_OPEN_GC_ZONES; + + /* + * Cap the max open limit to 1/4 of available space. + */ + max_open = min(max_open, mp->m_sb.sb_rgcount / 4); + + return max(XFS_MIN_OPEN_ZONES, max_open); +} + +/* + * Normally we use the open zone limit that the device reports. If there is + * none, let the user pick one from the command line. + * + * If the device doesn't report an open zone limit and there is no override, + * allow holding about a quarter of the zones open. In theory we could allow + * all to be open, but at that point we run into GC deadlocks because we can't + * reclaim open zones. + * + * When used on conventional SSDs, a lower open limit is advisable as we'll + * otherwise overwhelm the FTL just as much as a conventional block allocator. + * + * Note: To debug the open zone management code, force max_open to 1 here.
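To make the sizing heuristics above concrete, here is the same arithmetic as a self-contained model; the XFS_* constants are parameters because their values are not part of this patch, and the sample numbers are purely illustrative.

/*
 * Model of xfs_max_open_zones() with the constants parameterized.
 * Example: rgcount = 256, min_zones = 8, gc_zones = 1, min_open = 4:
 * data zones = (256 - 8) / 2 + 1 = 125, plus GC = 126, capped to
 * 256 / 4 = 64, and 64 >= min_open, so the result is 64.
 */
static unsigned int demo_max_open_zones(unsigned int rgcount,
		unsigned int min_zones, unsigned int gc_zones,
		unsigned int min_open)
{
	unsigned int max_open = (rgcount - min_zones) / 2 + 1 + gc_zones;

	if (max_open > rgcount / 4)
		max_open = rgcount / 4;
	return max_open > min_open ? max_open : min_open;
}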
+ */ +static int +xfs_calc_open_zones( + struct xfs_mount *mp) +{ + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + unsigned int bdev_open_zones = bdev_max_open_zones(bdev); + + if (!mp->m_max_open_zones) { + if (bdev_open_zones) + mp->m_max_open_zones = bdev_open_zones; + else + mp->m_max_open_zones = xfs_max_open_zones(mp); + } + + if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) { + xfs_notice(mp, "need at least %u open zones.", + XFS_MIN_OPEN_ZONES); + return -EIO; + } + + if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) { + mp->m_max_open_zones = bdev_open_zones; + xfs_info(mp, "limiting open zones to %u due to hardware limit.", + bdev_open_zones); + } + + if (mp->m_max_open_zones > xfs_max_open_zones(mp)) { + mp->m_max_open_zones = xfs_max_open_zones(mp); + xfs_info(mp, +"limiting open zones to %u due to total zone count (%u)", + mp->m_max_open_zones, mp->m_sb.sb_rgcount); + } + + return 0; +} + +static unsigned long * +xfs_alloc_bucket_bitmap( + struct xfs_mount *mp) +{ + return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount), + sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO); +} + +static struct xfs_zone_info * +xfs_alloc_zone_info( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi; + int i; + + zi = kzalloc(sizeof(*zi), GFP_KERNEL); + if (!zi) + return NULL; + INIT_LIST_HEAD(&zi->zi_open_zones); + INIT_LIST_HEAD(&zi->zi_reclaim_reservations); + spin_lock_init(&zi->zi_reset_list_lock); + spin_lock_init(&zi->zi_open_zones_lock); + spin_lock_init(&zi->zi_reservation_lock); + init_waitqueue_head(&zi->zi_zone_wait); + spin_lock_init(&zi->zi_used_buckets_lock); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { + zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp); + if (!zi->zi_used_bucket_bitmap[i]) + goto out_free_bitmaps; + } + return zi; + +out_free_bitmaps: + while (--i >= 0) + kvfree(zi->zi_used_bucket_bitmap[i]); + kfree(zi); + return NULL; +} + +static void +xfs_free_zone_info( + struct xfs_zone_info *zi) +{ + int i; + + xfs_free_open_zones(zi); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) + kvfree(zi->zi_used_bucket_bitmap[i]); + kfree(zi); +} + +int +xfs_mount_zones( + struct xfs_mount *mp) +{ + struct xfs_init_zones iz = { + .mp = mp, + }; + struct xfs_buftarg *bt = mp->m_rtdev_targp; + int error; + + if (!bt) { + xfs_notice(mp, "RT device missing."); + return -EINVAL; + } + + if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) { + xfs_notice(mp, "invalid flag combination."); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rextsize != 1) { + xfs_notice(mp, "zoned file systems do not support rextsize."); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) { + xfs_notice(mp, +"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES); + return -EFSCORRUPTED; + } + + error = xfs_calc_open_zones(mp); + if (error) + return error; + + mp->m_zone_info = xfs_alloc_zone_info(mp); + if (!mp->m_zone_info) + return -ENOMEM; + + xfs_info(mp, "%u zones of %u blocks each (%u max open)", + mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, + mp->m_max_open_zones); + trace_xfs_zones_mount(mp); + + if (bdev_is_zoned(bt->bt_bdev)) { + error = blkdev_report_zones(bt->bt_bdev, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), + mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz); + if (error < 0) + goto out_free_zone_info; + } else { + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + error = xfs_init_zone(&iz, rtg, NULL); + if (error) + goto out_free_zone_info; + } + } + + xfs_set_freecounter(mp,
XC_FREE_RTAVAILABLE, iz.available); + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + iz.available + iz.reclaimable); + + /* + * The user may configure GC to free up a percentage of unused blocks. + * By default this is 0. GC will always trigger at the minimum level + * for keeping max_open_zones available for data placement. + */ + mp->m_zonegc_low_space = 0; + + error = xfs_zone_gc_mount(mp); + if (error) + goto out_free_zone_info; + + /* + * Set up an MRU cache to track the inode to open zone association for + * data placement purposes. The magic values for group count and + * lifetime are the same as the defaults for filestreams, which seems + * sane enough. + */ + xfs_mru_cache_create(&mp->m_zone_cache, mp, + 5000, 10, xfs_zone_cache_free_func); + return 0; + +out_free_zone_info: + xfs_free_zone_info(mp->m_zone_info); + return error; +} + +void +xfs_unmount_zones( + struct xfs_mount *mp) +{ + xfs_zone_gc_unmount(mp); + xfs_free_zone_info(mp->m_zone_info); + xfs_mru_cache_destroy(mp->m_zone_cache); +} diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h new file mode 100644 index 000000000000..ecf39106704c --- /dev/null +++ b/fs/xfs/xfs_zone_alloc.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XFS_ZONE_ALLOC_H +#define _XFS_ZONE_ALLOC_H + +struct iomap_ioend; +struct xfs_open_zone; + +struct xfs_zone_alloc_ctx { + struct xfs_open_zone *open_zone; + xfs_filblks_t reserved_blocks; +}; + +/* + * Grab any available space, even if it is less than what the caller asked for. + */ +#define XFS_ZR_GREEDY (1U << 0) +/* + * Only grab instantly available space, don't wait or GC. + */ +#define XFS_ZR_NOWAIT (1U << 1) +/* + * Dip into the reserved pool. + */ +#define XFS_ZR_RESERVED (1U << 2) + +int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb, + unsigned int flags, struct xfs_zone_alloc_ctx *ac); +void xfs_zoned_space_unreserve(struct xfs_inode *ip, + struct xfs_zone_alloc_ctx *ac); +void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb); + +void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend, + struct xfs_open_zone **oz); +int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_fsblock_t fsbno, xfs_filblks_t len); +int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, + xfs_daddr_t daddr, struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock); +void xfs_open_zone_put(struct xfs_open_zone *oz); + +void xfs_zoned_wake_all(struct xfs_mount *mp); +bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno); +void xfs_mark_rtg_boundary(struct iomap_ioend *ioend); + +uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp, + enum xfs_free_counter ctr); +void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp); + +#ifdef CONFIG_XFS_RT +int xfs_mount_zones(struct xfs_mount *mp); +void xfs_unmount_zones(struct xfs_mount *mp); +void xfs_zone_gc_start(struct xfs_mount *mp); +void xfs_zone_gc_stop(struct xfs_mount *mp); +#else +static inline int xfs_mount_zones(struct xfs_mount *mp) +{ + return -EIO; +} +static inline void xfs_unmount_zones(struct xfs_mount *mp) +{ +} +static inline void xfs_zone_gc_start(struct xfs_mount *mp) +{ +} +static inline void xfs_zone_gc_stop(struct xfs_mount *mp) +{ +} +#endif /* CONFIG_XFS_RT */ + +#endif /* _XFS_ZONE_ALLOC_H */ diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c new file mode 100644 index 000000000000..9c00fc5baa30 --- /dev/null +++ b/fs/xfs/xfs_zone_gc.c @@ -0,0 +1,1182 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + *
Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" +#include "xfs_trace.h" + +/* + * Implement Garbage Collection (GC) of partially used zones. + * + * To support the purely sequential writes in each zone, zoned XFS needs to be + * able to move data remaining in a zone out of it to reset the zone to prepare + * for writing to it again. + * + * This is done by the GC thread implemented in this file. To support that, a + * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to + * write the garbage collected data into. + * + * Whenever the available space is below the chosen threshold, the GC thread + * looks for potential non-empty but not fully used zones that are worth + * reclaiming. Once found, the rmap for the victim zone is queried, and after + * a bit of sorting to reduce fragmentation, the still live extents are read + * into memory and written to the GC target zone, and the bmap btree of the + * files is updated to point to the new location. To avoid taking the IOLOCK + * and MMAPLOCK for the entire GC process and thus affecting the latency of + * user reads and writes to the files, the GC writes are speculative and the + * I/O completion checks that no other writes happened for the affected regions + * before remapping. + * + * Once a zone does not contain any valid data, be that through GC or user + * block removal, it is queued for a zone reset. The reset operation + * carefully ensures that the RT device cache is flushed and all transactions + * referencing the rmap have been committed to disk. + */ + +/* + * Size of each GC scratch pad. This is also the upper bound for each + * GC I/O, which helps to keep latency down. + */ +#define XFS_GC_CHUNK_SIZE SZ_1M + +/* + * Scratchpad data to read GCed data into. + * + * The offset member tracks where the next allocation starts, and freed tracks + * the amount of space that is not used anymore. + */ +#define XFS_ZONE_GC_NR_SCRATCH 2 +struct xfs_zone_scratch { + struct folio *folio; + unsigned int offset; + unsigned int freed; +}; + +/* + * Chunk that is read and written for each GC operation. + * + * Note that for writes to actual zoned devices, the chunk can be split when + * reaching the hardware limit. + */ +struct xfs_gc_bio { + struct xfs_zone_gc_data *data; + + /* + * Entry into the reading/writing/resetting list. Only accessed from + * the GC thread, so no locking needed. + */ + struct list_head entry; + + /* + * State of this gc_bio. Done means the current I/O completed. + * Set from the bio end I/O handler, read from the GC thread. + */ + enum { + XFS_GC_BIO_NEW, + XFS_GC_BIO_DONE, + } state; + + /* + * Pointer to the inode and byte range in the inode that this + * GC chunk is operating on. + */ + struct xfs_inode *ip; + loff_t offset; + unsigned int len; + + /* + * Existing startblock (in the zone to be freed) and newly assigned + * daddr in the zone GCed into. + */ + xfs_fsblock_t old_startblock; + xfs_daddr_t new_daddr; + struct xfs_zone_scratch *scratch; + + /* Are we writing to a sequential write required zone?
*/ + bool is_seq; + + /* Open Zone being written to */ + struct xfs_open_zone *oz; + + /* Bio used for reads and writes, including the bvec used by it */ + struct bio_vec bv; + struct bio bio; /* must be last */ +}; + +#define XFS_ZONE_GC_RECS 1024 + +/* iterator, needs to be reinitialized for each victim zone */ +struct xfs_zone_gc_iter { + struct xfs_rtgroup *victim_rtg; + unsigned int rec_count; + unsigned int rec_idx; + xfs_agblock_t next_startblock; + struct xfs_rmap_irec *recs; +}; + +/* + * Per-mount GC state. + */ +struct xfs_zone_gc_data { + struct xfs_mount *mp; + + /* bioset used to allocate the gc_bios */ + struct bio_set bio_set; + + /* + * Scratchpads used, and an index to indicate which one is in use. + */ + struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH]; + unsigned int scratch_idx; + + /* + * Lists of bios currently being read, written and reset. + * These lists are only accessed by the GC thread itself, and must only + * be processed in order. + */ + struct list_head reading; + struct list_head writing; + struct list_head resetting; + + /* + * Iterator for the victim zone. + */ + struct xfs_zone_gc_iter iter; +}; + +/* + * We aim to keep enough zones free in stock to fully use the open zone limit + * for data placement purposes. Additionally, the m_zonegc_low_space tunable + * can be set to make sure a fraction of the unused blocks are available for + * writing. + */ +bool +xfs_zoned_need_gc( + struct xfs_mount *mp) +{ + s64 available, free, threshold; + s32 remainder; + + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) + return false; + + available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); + + if (available < + mp->m_groups[XG_TYPE_RTG].blocks * + (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) + return true; + + free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); + + threshold = div_s64_rem(free, 100, &remainder); + threshold = threshold * mp->m_zonegc_low_space + + remainder * div_s64(mp->m_zonegc_low_space, 100); + + if (available < threshold) + return true; + + return false; +} + +static struct xfs_zone_gc_data * +xfs_zone_gc_data_alloc( + struct xfs_mount *mp) +{ + struct xfs_zone_gc_data *data; + int i; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs), + GFP_KERNEL); + if (!data->iter.recs) + goto out_free_data; + + /* + * We actually only need a single bio_vec. It would be nice to have + * a flag that only allocates the inline bvecs and not the separate + * bvec pool.
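The percentage math in xfs_zoned_need_gc() above splits free into free/100 and free%100 so the multiplication by m_zonegc_low_space cannot overflow a 64-bit value; the remainder term only contributes when the percentage is 100, making that case exact. A small model with worked values:

#include <stdint.h>
#include <assert.h>

/* Model of the threshold computation in xfs_zoned_need_gc(). */
static int64_t demo_gc_threshold(int64_t free, int64_t pct)
{
	int64_t q = free / 100;
	int64_t r = free % 100;

	return q * pct + r * (pct / 100);
}

int main(void)
{
	assert(demo_gc_threshold(1050, 10) == 100);	/* exact value: 105 */
	assert(demo_gc_threshold(1050, 100) == 1050);	/* pct = 100 is exact */
	return 0;
}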
+ */ + if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio), + BIOSET_NEED_BVECS)) + goto out_free_recs; + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) { + data->scratch[i].folio = + folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE)); + if (!data->scratch[i].folio) + goto out_free_scratch; + } + INIT_LIST_HEAD(&data->reading); + INIT_LIST_HEAD(&data->writing); + INIT_LIST_HEAD(&data->resetting); + data->mp = mp; + return data; + +out_free_scratch: + while (--i >= 0) + folio_put(data->scratch[i].folio); + bioset_exit(&data->bio_set); +out_free_recs: + kfree(data->iter.recs); +out_free_data: + kfree(data); + return NULL; +} + +static void +xfs_zone_gc_data_free( + struct xfs_zone_gc_data *data) +{ + int i; + + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) + folio_put(data->scratch[i].folio); + bioset_exit(&data->bio_set); + kfree(data->iter.recs); + kfree(data); +} + +static void +xfs_zone_gc_iter_init( + struct xfs_zone_gc_iter *iter, + struct xfs_rtgroup *victim_rtg) + +{ + iter->next_startblock = 0; + iter->rec_count = 0; + iter->rec_idx = 0; + iter->victim_rtg = victim_rtg; +} + +/* + * Query the rmap of the victim zone to gather the records to evacuate. + */ +static int +xfs_zone_gc_query_cb( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec, + void *private) +{ + struct xfs_zone_gc_iter *iter = private; + + ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)); + ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner)); + ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))); + + iter->recs[iter->rec_count] = *irec; + if (++iter->rec_count == XFS_ZONE_GC_RECS) { + iter->next_startblock = + irec->rm_startblock + irec->rm_blockcount; + return 1; + } + return 0; +} + +static int +xfs_zone_gc_rmap_rec_cmp( + const void *a, + const void *b) +{ + const struct xfs_rmap_irec *reca = a; + const struct xfs_rmap_irec *recb = b; + int diff; + + diff = cmp_int(reca->rm_owner, recb->rm_owner); + if (diff) + return diff; + return cmp_int(reca->rm_offset, recb->rm_offset); +} + +static int +xfs_zone_gc_query( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter) +{ + struct xfs_rtgroup *rtg = iter->victim_rtg; + struct xfs_rmap_irec ri_low = { }; + struct xfs_rmap_irec ri_high; + struct xfs_btree_cur *cur; + struct xfs_trans *tp; + int error; + + ASSERT(iter->next_startblock <= rtg_blocks(rtg)); + if (iter->next_startblock == rtg_blocks(rtg)) + goto done; + + ASSERT(iter->next_startblock < rtg_blocks(rtg)); + ri_low.rm_startblock = iter->next_startblock; + memset(&ri_high, 0xFF, sizeof(ri_high)); + + iter->rec_idx = 0; + iter->rec_count = 0; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + cur = xfs_rtrmapbt_init_cursor(tp, rtg); + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, + xfs_zone_gc_query_cb, iter); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + xfs_btree_del_cursor(cur, error < 0 ? error : 0); + xfs_trans_cancel(tp); + + if (error < 0) + return error; + + /* + * Sort the rmap records by inode number and increasing offset to + * defragment the mappings. + * + * This could be further enhanced by an even bigger look ahead window, + * but that's better left until we have better detection of changes to + * inode mapping to avoid the potential of GCing already dead data. + */ + sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]), + xfs_zone_gc_rmap_rec_cmp, NULL); + + if (error == 0) { + /* + * We finished iterating through the zone. 
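The two-key comparator used for that sort (owner first, then file offset) means each inode's surviving extents are evacuated in ascending offset order, which is what makes the rewrite defragmenting. The same shape in a self-contained user-space form; cmp_int() is modeled locally since only its use, not its definition, appears in this patch:

#include <stdlib.h>

/* Local model of the kernel's cmp_int(): returns -1/0/1 without overflow. */
#define demo_cmp(a, b)	(((a) > (b)) - ((a) < (b)))

struct demo_rec {
	unsigned long long	owner;	/* primary key   */
	unsigned long long	offset;	/* secondary key */
};

static int demo_rec_cmp(const void *a, const void *b)
{
	const struct demo_rec *ra = a, *rb = b;
	int diff = demo_cmp(ra->owner, rb->owner);

	return diff ? diff : demo_cmp(ra->offset, rb->offset);
}

/* Usage: qsort(recs, nr, sizeof(*recs), demo_rec_cmp); */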
+ */ + iter->next_startblock = rtg_blocks(rtg); + if (iter->rec_count == 0) + goto done; + } + + return 0; +done: + xfs_rtgroup_rele(iter->victim_rtg); + iter->victim_rtg = NULL; + return 0; +} + +static bool +xfs_zone_gc_iter_next( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter, + struct xfs_rmap_irec *chunk_rec, + struct xfs_inode **ipp) +{ + struct xfs_rmap_irec *irec; + int error; + + if (!iter->victim_rtg) + return false; + +retry: + if (iter->rec_idx == iter->rec_count) { + error = xfs_zone_gc_query(mp, iter); + if (error) + goto fail; + if (!iter->victim_rtg) + return false; + } + + irec = &iter->recs[iter->rec_idx]; + error = xfs_iget(mp, NULL, irec->rm_owner, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp); + if (error) { + /* + * If the inode was already deleted, skip over it. + */ + if (error == -ENOENT) { + iter->rec_idx++; + goto retry; + } + goto fail; + } + + if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) { + iter->rec_idx++; + xfs_irele(*ipp); + goto retry; + } + + *chunk_rec = *irec; + return true; + +fail: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + return false; +} + +static void +xfs_zone_gc_iter_advance( + struct xfs_zone_gc_iter *iter, + xfs_extlen_t count_fsb) +{ + struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx]; + + irec->rm_offset += count_fsb; + irec->rm_startblock += count_fsb; + irec->rm_blockcount -= count_fsb; + if (!irec->rm_blockcount) + iter->rec_idx++; +} + +static struct xfs_rtgroup * +xfs_zone_gc_pick_victim_from( + struct xfs_mount *mp, + uint32_t bucket) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t victim_used = U32_MAX; + struct xfs_rtgroup *victim_rtg = NULL; + uint32_t bit; + + if (!zi->zi_used_bucket_entries[bucket]) + return NULL; + + for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket], + mp->m_sb.sb_rgcount) { + struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit); + + if (!rtg) + continue; + + /* skip zones that are just waiting for a reset */ + if (rtg_rmap(rtg)->i_used_blocks == 0 || + rtg_rmap(rtg)->i_used_blocks >= victim_used) { + xfs_rtgroup_rele(rtg); + continue; + } + + if (victim_rtg) + xfs_rtgroup_rele(victim_rtg); + victim_rtg = rtg; + victim_used = rtg_rmap(rtg)->i_used_blocks; + + /* + * Any zone that is less than 1 percent used is fair game for + * instant reclaim. All of these zones are in the last + * bucket, so avoid the expensive division for the zones + * in the other buckets. + */ + if (bucket == 0 && + rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100) + break; + } + + return victim_rtg; +} + +/* + * Iterate through all zones marked as reclaimable and find a candidate to + * reclaim. + */ +static bool +xfs_zone_gc_select_victim( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_rtgroup *victim_rtg = NULL; + unsigned int bucket; + + if (xfs_is_shutdown(mp)) + return false; + + if (iter->victim_rtg) + return true; + + /* + * Don't start new work if we are asked to stop or park. 
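+ * + * xfs_zone_gc_stop() parks the thread and xfs_zone_gc_unmount() + * stops it, and neither should have to wait for a whole new GC pass + * to finish first.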
+ */ + if (kthread_should_stop() || kthread_should_park()) + return false; + + if (!xfs_zoned_need_gc(mp)) + return false; + + spin_lock(&zi->zi_used_buckets_lock); + for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { + victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); + if (victim_rtg) + break; + } + spin_unlock(&zi->zi_used_buckets_lock); + + if (!victim_rtg) + return false; + + trace_xfs_zone_gc_select_victim(victim_rtg, bucket); + xfs_zone_gc_iter_init(iter, victim_rtg); + return true; +} + +static struct xfs_open_zone * +xfs_zone_gc_steal_open( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz, *found = NULL; + + spin_lock(&zi->zi_open_zones_lock); + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { + if (!found || + oz->oz_write_pointer < found->oz_write_pointer) + found = oz; + } + + if (found) { + found->oz_is_gc = true; + list_del_init(&found->oz_entry); + zi->zi_nr_open_zones--; + } + + spin_unlock(&zi->zi_open_zones_lock); + return found; +} + +static struct xfs_open_zone * +xfs_zone_gc_select_target( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz = zi->zi_open_gc_zone; + + /* + * We need to wait for pending writes to finish. + */ + if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg)) + return NULL; + + ASSERT(zi->zi_nr_open_zones <= + mp->m_max_open_zones - XFS_OPEN_GC_ZONES); + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); + if (oz) + trace_xfs_zone_gc_target_opened(oz->oz_rtg); + spin_lock(&zi->zi_open_zones_lock); + zi->zi_open_gc_zone = oz; + spin_unlock(&zi->zi_open_zones_lock); + return oz; +} + +/* + * Ensure we have a valid open zone to write the GC data to. + * + * If the current target zone has space keep writing to it, else first wait for + * all pending writes and then pick a new one. + */ +static struct xfs_open_zone * +xfs_zone_gc_ensure_target( + struct xfs_mount *mp) +{ + struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone; + + if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + return xfs_zone_gc_select_target(mp); + return oz; +} + +static unsigned int +xfs_zone_gc_scratch_available( + struct xfs_zone_gc_data *data) +{ + return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset; +} + +static bool +xfs_zone_gc_space_available( + struct xfs_zone_gc_data *data) +{ + struct xfs_open_zone *oz; + + oz = xfs_zone_gc_ensure_target(data->mp); + if (!oz) + return false; + return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) && + xfs_zone_gc_scratch_available(data); +} + +static void +xfs_zone_gc_end_io( + struct bio *bio) +{ + struct xfs_gc_bio *chunk = + container_of(bio, struct xfs_gc_bio, bio); + struct xfs_zone_gc_data *data = chunk->data; + + WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE); + wake_up_process(data->mp->m_zone_info->zi_gc_thread); +} + +static struct xfs_open_zone * +xfs_zone_gc_alloc_blocks( + struct xfs_zone_gc_data *data, + xfs_extlen_t *count_fsb, + xfs_daddr_t *daddr, + bool *is_seq) +{ + struct xfs_mount *mp = data->mp; + struct xfs_open_zone *oz; + + oz = xfs_zone_gc_ensure_target(mp); + if (!oz) + return NULL; + + *count_fsb = min(*count_fsb, + XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data))); + + /* + * Directly allocate GC blocks from the reserved pool. + * + * If we'd take them from the normal pool we could be stealing blocks + * from a regular writer, which would then have to wait for GC and + * deadlock. 
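+ * + * The reservations are dipped into under m_sb_lock below, with the + * allocation clamped both to the space left in the target zone and + * to the res_avail of the two free space pools.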
+ */ + spin_lock(&mp->m_sb_lock); + *count_fsb = min(*count_fsb, + rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer); + *count_fsb = min3(*count_fsb, + mp->m_free[XC_FREE_RTEXTENTS].res_avail, + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); + mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb; + mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb; + spin_unlock(&mp->m_sb_lock); + + if (!*count_fsb) + return NULL; + + *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0); + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); + if (!*is_seq) + *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer); + oz->oz_write_pointer += *count_fsb; + atomic_inc(&oz->oz_ref); + return oz; +} + +static bool +xfs_zone_gc_start_chunk( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + struct xfs_open_zone *oz; + struct xfs_rmap_irec irec; + struct xfs_gc_bio *chunk; + struct xfs_inode *ip; + struct bio *bio; + xfs_daddr_t daddr; + bool is_seq; + + if (xfs_is_shutdown(mp)) + return false; + + if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip)) + return false; + oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, + &is_seq); + if (!oz) { + xfs_irele(ip); + return false; + } + + bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set); + + chunk = container_of(bio, struct xfs_gc_bio, bio); + chunk->ip = ip; + chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset); + chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount); + chunk->old_startblock = + xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock); + chunk->new_daddr = daddr; + chunk->is_seq = is_seq; + chunk->scratch = &data->scratch[data->scratch_idx]; + chunk->data = data; + chunk->oz = oz; + + bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); + bio->bi_end_io = xfs_zone_gc_end_io; + bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len, + chunk->scratch->offset); + chunk->scratch->offset += chunk->len; + if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) { + data->scratch_idx = + (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH; + } + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&chunk->entry, &data->reading); + xfs_zone_gc_iter_advance(iter, irec.rm_blockcount); + + submit_bio(bio); + return true; +} + +static void +xfs_zone_gc_free_chunk( + struct xfs_gc_bio *chunk) +{ + list_del(&chunk->entry); + xfs_open_zone_put(chunk->oz); + xfs_irele(chunk->ip); + bio_put(&chunk->bio); +} + +static void +xfs_zone_gc_submit_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + if (chunk->is_seq) { + chunk->bio.bi_opf &= ~REQ_OP_WRITE; + chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND; + } + chunk->bio.bi_iter.bi_sector = chunk->new_daddr; + chunk->bio.bi_end_io = xfs_zone_gc_end_io; + submit_bio(&chunk->bio); +} + +static struct xfs_gc_bio * +xfs_zone_gc_split_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + struct queue_limits *lim = + &bdev_get_queue(chunk->bio.bi_bdev)->limits; + struct xfs_gc_bio *split_chunk; + int split_sectors; + unsigned int split_len; + struct bio *split; + unsigned int nsegs; + + if (!chunk->is_seq) + return NULL; + + split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs, + lim->max_zone_append_sectors << SECTOR_SHIFT); + if (!split_sectors) + return NULL; + + /* ensure the split chunk is still block size aligned */ + split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT, + data->mp->m_sb.sb_blocksize) >> 
SECTOR_SHIFT; + split_len = split_sectors << SECTOR_SHIFT; + + split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set); + split_chunk = container_of(split, struct xfs_gc_bio, bio); + split_chunk->data = data; + ihold(VFS_I(chunk->ip)); + split_chunk->ip = chunk->ip; + split_chunk->is_seq = chunk->is_seq; + split_chunk->scratch = chunk->scratch; + split_chunk->offset = chunk->offset; + split_chunk->len = split_len; + split_chunk->old_startblock = chunk->old_startblock; + split_chunk->new_daddr = chunk->new_daddr; + split_chunk->oz = chunk->oz; + atomic_inc(&chunk->oz->oz_ref); + + chunk->offset += split_len; + chunk->len -= split_len; + chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); + + /* add right before the original chunk */ + WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&split_chunk->entry, &chunk->entry); + return split_chunk; +} + +static void +xfs_zone_gc_write_chunk( + struct xfs_gc_bio *chunk) +{ + struct xfs_zone_gc_data *data = chunk->data; + struct xfs_mount *mp = chunk->ip->i_mount; + phys_addr_t bvec_paddr = + bvec_phys(bio_first_bvec_all(&chunk->bio)); + struct xfs_gc_bio *split_chunk; + + if (chunk->bio.bi_status) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_is_shutdown(mp)) { + xfs_zone_gc_free_chunk(chunk); + return; + } + + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_move_tail(&chunk->entry, &data->writing); + + bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE); + bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len, + offset_in_folio(chunk->scratch->folio, bvec_paddr)); + + while ((split_chunk = xfs_zone_gc_split_write(data, chunk))) + xfs_zone_gc_submit_write(data, split_chunk); + xfs_zone_gc_submit_write(data, chunk); +} + +static void +xfs_zone_gc_finish_chunk( + struct xfs_gc_bio *chunk) +{ + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + struct xfs_inode *ip = chunk->ip; + struct xfs_mount *mp = ip->i_mount; + int error; + + if (chunk->bio.bi_status) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_is_shutdown(mp)) { + xfs_zone_gc_free_chunk(chunk); + return; + } + + chunk->scratch->freed += chunk->len; + if (chunk->scratch->freed == chunk->scratch->offset) { + chunk->scratch->offset = 0; + chunk->scratch->freed = 0; + } + + /* + * Cycle through the iolock and wait for direct I/O and layouts to + * ensure no one is reading from the old mapping before it goes away. + * + * Note that xfs_zoned_end_io() below checks that no other writer raced + * with us to update the mapping by checking that the old startblock + * didn't change. 
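+ * + * If the mapping did change while we were copying, xfs_zoned_end_io() + * skips the remap and the relocated copy is simply never mapped into + * the file.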
+ */ + xfs_ilock(ip, iolock); + error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP); + if (!error) + inode_dio_wait(VFS_I(ip)); + xfs_iunlock(ip, iolock); + if (error) + goto free; + + if (chunk->is_seq) + chunk->new_daddr = chunk->bio.bi_iter.bi_sector; + error = xfs_zoned_end_io(ip, chunk->offset, chunk->len, + chunk->new_daddr, chunk->oz, chunk->old_startblock); +free: + if (error) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_zone_gc_free_chunk(chunk); +} + +static void +xfs_zone_gc_finish_reset( + struct xfs_gc_bio *chunk) +{ + struct xfs_rtgroup *rtg = chunk->bio.bi_private; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + + if (chunk->bio.bi_status) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + goto out; + } + + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); + atomic_inc(&zi->zi_nr_free_zones); + + xfs_zoned_add_available(mp, rtg_blocks(rtg)); + + wake_up_all(&zi->zi_zone_wait); +out: + list_del(&chunk->entry); + bio_put(&chunk->bio); +} + +static bool +xfs_zone_gc_prepare_reset( + struct bio *bio, + struct xfs_rtgroup *rtg) +{ + trace_xfs_zone_reset(rtg); + + ASSERT(rtg_rmap(rtg)->i_used_blocks == 0); + bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); + if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { + if (!bdev_max_discard_sectors(bio->bi_bdev)) + return false; + bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC; + bio->bi_iter.bi_size = + XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg)); + } + + return true; +} + +int +xfs_zone_gc_reset_sync( + struct xfs_rtgroup *rtg) +{ + int error = 0; + struct bio bio; + + bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0, + REQ_OP_ZONE_RESET); + if (xfs_zone_gc_prepare_reset(&bio, rtg)) + error = submit_bio_wait(&bio); + bio_uninit(&bio); + + return error; +} + +static void +xfs_zone_gc_reset_zones( + struct xfs_zone_gc_data *data, + struct xfs_group *reset_list) +{ + struct xfs_group *next = reset_list; + + if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) { + xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR); + return; + } + + do { + struct xfs_rtgroup *rtg = to_rtg(next); + struct xfs_gc_bio *chunk; + struct bio *bio; + + xfs_log_force_inode(rtg_rmap(rtg)); + + next = rtg_group(rtg)->xg_next_reset; + rtg_group(rtg)->xg_next_reset = NULL; + + bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev, + 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set); + bio->bi_private = rtg; + bio->bi_end_io = xfs_zone_gc_end_io; + + chunk = container_of(bio, struct xfs_gc_bio, bio); + chunk->data = data; + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&chunk->entry, &data->resetting); + + /* + * Also use the bio to drive the state machine when neither + * zone reset nor discard is supported to keep things simple. + */ + if (xfs_zone_gc_prepare_reset(bio, rtg)) + submit_bio(bio); + else + bio_endio(bio); + } while (next); +} + +/* + * Handle the work to read and write data for GC and to reset the zones, + * including handling all completions. + * + * Note that the order of the chunks is preserved so that we don't undo the + * optimal order established by xfs_zone_gc_query(). 
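+ * + * Completions are therefore reaped strictly in list order below: first + * resets, then writes, then reads that are turned into writes, before + * any new chunks are started.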
+ */ +static bool +xfs_zone_gc_handle_work( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_info *zi = data->mp->m_zone_info; + struct xfs_gc_bio *chunk, *next; + struct xfs_group *reset_list; + struct blk_plug plug; + + spin_lock(&zi->zi_reset_list_lock); + reset_list = zi->zi_reset_list; + zi->zi_reset_list = NULL; + spin_unlock(&zi->zi_reset_list_lock); + + if (!xfs_zone_gc_select_victim(data) || + !xfs_zone_gc_space_available(data)) { + if (list_empty(&data->reading) && + list_empty(&data->writing) && + list_empty(&data->resetting) && + !reset_list) + return false; + } + + __set_current_state(TASK_RUNNING); + try_to_freeze(); + + if (reset_list) + xfs_zone_gc_reset_zones(data, reset_list); + + list_for_each_entry_safe(chunk, next, &data->resetting, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_finish_reset(chunk); + } + + list_for_each_entry_safe(chunk, next, &data->writing, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_finish_chunk(chunk); + } + + blk_start_plug(&plug); + list_for_each_entry_safe(chunk, next, &data->reading, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_write_chunk(chunk); + } + blk_finish_plug(&plug); + + blk_start_plug(&plug); + while (xfs_zone_gc_start_chunk(data)) + ; + blk_finish_plug(&plug); + return true; +} + +/* + * Note that the current GC algorithm would break reflinks and thus duplicate + * data that was shared by multiple owners before. Because of that, reflinks + * are currently not supported on zoned file systems: they can't be created + * there, and file systems containing them can't be mounted. + */ +static int +xfs_zoned_gcd( + void *private) +{ + struct xfs_zone_gc_data *data = private; + struct xfs_mount *mp = data->mp; + struct xfs_zone_info *zi = mp->m_zone_info; + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); + set_freezable(); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); + xfs_set_zonegc_running(mp); + if (xfs_zone_gc_handle_work(data)) + continue; + + if (list_empty(&data->reading) && + list_empty(&data->writing) && + list_empty(&data->resetting) && + !zi->zi_reset_list) { + xfs_clear_zonegc_running(mp); + xfs_zoned_resv_wake_all(mp); + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + break; + } + + if (kthread_should_park()) { + __set_current_state(TASK_RUNNING); + kthread_parkme(); + continue; + } + } + + schedule(); + } + xfs_clear_zonegc_running(mp); + + if (data->iter.victim_rtg) + xfs_rtgroup_rele(data->iter.victim_rtg); + + memalloc_nofs_restore(nofs_flag); + xfs_zone_gc_data_free(data); + return 0; +} + +void +xfs_zone_gc_start( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + kthread_unpark(mp->m_zone_info->zi_gc_thread); +} + +void +xfs_zone_gc_stop( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + kthread_park(mp->m_zone_info->zi_gc_thread); +} + +int +xfs_zone_gc_mount( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_gc_data *data; + struct xfs_open_zone *oz; + int error; + + /* + * If there are no free zones available for GC, pick the open zone with + * the least used space to GC into. This should only happen after an + * unclean shutdown near ENOSPC while GC was ongoing. + * + * We also need to do this for the first gc zone allocation if we + * unmounted while at the open limit.
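+ * + * xfs_zone_gc_steal_open() picks the open zone with the lowest write + * pointer, removes it from the open zone list and dedicates it to GC.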
+ */ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) || + zi->zi_nr_open_zones == mp->m_max_open_zones) + oz = xfs_zone_gc_steal_open(zi); + else + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); + if (!oz) { + xfs_warn(mp, "unable to allocate a zone for gc"); + error = -EIO; + goto out; + } + + trace_xfs_zone_gc_target_opened(oz->oz_rtg); + zi->zi_open_gc_zone = oz; + + data = xfs_zone_gc_data_alloc(mp); + if (!data) { + error = -ENOMEM; + goto out_put_gc_zone; + } + + mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, + "xfs-zone-gc/%s", mp->m_super->s_id); + if (IS_ERR(mp->m_zone_info->zi_gc_thread)) { + xfs_warn(mp, "unable to create zone gc thread"); + error = PTR_ERR(mp->m_zone_info->zi_gc_thread); + goto out_free_gc_data; + } + + /* xfs_zone_gc_start will unpark for rw mounts */ + kthread_park(mp->m_zone_info->zi_gc_thread); + return 0; + +out_free_gc_data: + kfree(data); +out_put_gc_zone: + xfs_open_zone_put(zi->zi_open_gc_zone); +out: + return error; +} + +void +xfs_zone_gc_unmount( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + + kthread_stop(zi->zi_gc_thread); + if (zi->zi_open_gc_zone) + xfs_open_zone_put(zi->zi_open_gc_zone); +} diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c new file mode 100644 index 000000000000..733bcc2f8645 --- /dev/null +++ b/fs/xfs/xfs_zone_info.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" + +static const char xfs_write_hint_shorthand[6][16] = { + "NOT_SET", "NONE", "SHORT", "MEDIUM", "LONG", "EXTREME"}; + +static inline const char * +xfs_write_hint_to_str( + uint8_t write_hint) +{ + if (write_hint > WRITE_LIFE_EXTREME) + return "UNKNOWN"; + return xfs_write_hint_shorthand[write_hint]; +} + +static void +xfs_show_open_zone( + struct seq_file *m, + struct xfs_open_zone *oz) +{ + seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n", + rtg_rgno(oz->oz_rtg), + oz->oz_write_pointer, oz->oz_written, + rtg_rmap(oz->oz_rtg)->i_used_blocks, + xfs_write_hint_to_str(oz->oz_write_hint)); +} + +static void +xfs_show_full_zone_used_distribution( + struct seq_file *m, + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + unsigned int reclaimable = 0, full, i; + + spin_lock(&zi->zi_used_buckets_lock); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { + unsigned int entries = zi->zi_used_bucket_entries[i]; + + seq_printf(m, "\t %2u..%2u%%: %u\n", + i * (100 / XFS_ZONE_USED_BUCKETS), + (i + 1) * (100 / XFS_ZONE_USED_BUCKETS) - 1, + entries); + reclaimable += entries; + } + spin_unlock(&zi->zi_used_buckets_lock); + + full = mp->m_sb.sb_rgcount; + if (zi->zi_open_gc_zone) + full--; + full -= zi->zi_nr_open_zones; + full -= atomic_read(&zi->zi_nr_free_zones); + full -= reclaimable; + + seq_printf(m, "\t 100%%: %u\n", full); +} + +void +xfs_zoned_show_stats( + struct seq_file *m, + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz; + + seq_puts(m, "\n"); + + seq_printf(m, "\tuser free RT blocks: %lld\n", + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + seq_printf(m, "\treserved free RT blocks: %lld\n", + mp->m_free[XC_FREE_RTEXTENTS].res_avail); + seq_printf(m, 
"\tuser available RT blocks: %lld\n", + xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE)); + seq_printf(m, "\treserved available RT blocks: %lld\n", + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); + seq_printf(m, "\tRT reservations required: %d\n", + !list_empty_careful(&zi->zi_reclaim_reservations)); + seq_printf(m, "\tRT GC required: %d\n", + xfs_zoned_need_gc(mp)); + + seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones)); + seq_puts(m, "\topen zones:\n"); + spin_lock(&zi->zi_open_zones_lock); + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) + xfs_show_open_zone(m, oz); + if (zi->zi_open_gc_zone) { + seq_puts(m, "\topen gc zone:\n"); + xfs_show_open_zone(m, zi->zi_open_gc_zone); + } + spin_unlock(&zi->zi_open_zones_lock); + seq_puts(m, "\tused blocks distribution (fully written zones):\n"); + xfs_show_full_zone_used_distribution(m, mp); +} diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h new file mode 100644 index 000000000000..ab696975a993 --- /dev/null +++ b/fs/xfs/xfs_zone_priv.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XFS_ZONE_PRIV_H +#define _XFS_ZONE_PRIV_H + +struct xfs_open_zone { + /* + * Entry in the open zone list and refcount. Protected by + * zi_open_zones_lock in struct xfs_zone_info. + */ + struct list_head oz_entry; + atomic_t oz_ref; + + /* + * oz_write_pointer is the write pointer at which space is handed out + * for conventional zones, or simple the count of blocks handed out + * so far for sequential write required zones and is protected by + * oz_alloc_lock/ + */ + spinlock_t oz_alloc_lock; + xfs_rgblock_t oz_write_pointer; + + /* + * oz_written is the number of blocks for which we've received a + * write completion. oz_written must always be <= oz_write_pointer + * and is protected by the ILOCK of the rmap inode. + */ + xfs_rgblock_t oz_written; + + /* + * Write hint (data temperature) assigned to this zone, or + * WRITE_LIFE_NOT_SET if none was set. + */ + enum rw_hint oz_write_hint; + + /* + * Is this open zone used for garbage collection? There can only be a + * single open GC zone, which is pointed to by zi_open_gc_zone in + * struct xfs_zone_info. Constant over the life time of an open zone. + */ + bool oz_is_gc; + + /* + * Pointer to the RT groups structure for this open zone. Constant over + * the life time of an open zone. + */ + struct xfs_rtgroup *oz_rtg; +}; + +/* + * Number of bitmap buckets to track reclaimable zones. There are 10 buckets + * so that each 10% of the usable capacity get their own bucket and GC can + * only has to walk the bitmaps of the lesser used zones if there are any. + */ +#define XFS_ZONE_USED_BUCKETS 10u + +struct xfs_zone_info { + /* + * List of pending space reservations: + */ + spinlock_t zi_reservation_lock; + struct list_head zi_reclaim_reservations; + + /* + * List and number of open zones: + */ + spinlock_t zi_open_zones_lock; + struct list_head zi_open_zones; + unsigned int zi_nr_open_zones; + + /* + * Free zone search cursor and number of free zones: + */ + unsigned long zi_free_zone_cursor; + atomic_t zi_nr_free_zones; + + /* + * Wait queue to wait for free zones or open zone resources to become + * available: + */ + wait_queue_head_t zi_zone_wait; + + /* + * Pointer to the GC thread, and the current open zone used by GC + * (if any). + * + * zi_open_gc_zone is mostly private to the GC thread, but can be read + * for debugging from other threads, in which case zi_open_zones_lock + * must be taken to access it. 
+ */ + struct task_struct *zi_gc_thread; + struct xfs_open_zone *zi_open_gc_zone; + + /* + * List of zones that need a reset: + */ + spinlock_t zi_reset_list_lock; + struct xfs_group *zi_reset_list; + + /* + * A set of bitmaps to bucket-sort reclaimable zones by used blocks to help + * garbage collection quickly find the best candidate for reclaim. + */ + spinlock_t zi_used_buckets_lock; + unsigned int zi_used_bucket_entries[XFS_ZONE_USED_BUCKETS]; + unsigned long *zi_used_bucket_bitmap[XFS_ZONE_USED_BUCKETS]; + +}; + +struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, + enum rw_hint write_hint, bool is_gc); + +int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg); +bool xfs_zoned_need_gc(struct xfs_mount *mp); +int xfs_zone_gc_mount(struct xfs_mount *mp); +void xfs_zone_gc_unmount(struct xfs_mount *mp); + +void xfs_zoned_resv_wake_all(struct xfs_mount *mp); + +#endif /* _XFS_ZONE_PRIV_H */ diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c new file mode 100644 index 000000000000..93c9a7721139 --- /dev/null +++ b/fs/xfs/xfs_zone_space_resv.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtbitmap.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" + +/* + * Note: the zoned allocator does not support a rtextsize > 1, so this code and + * the allocator itself uses file system blocks interchangeably with realtime + * extents without doing the otherwise required conversions. + */ + +/* + * Per-task space reservation. + * + * Tasks that need to wait for GC to free up space allocate one of these + * on the stack and add it to the per-mount zi_reclaim_reservations list. + * The GC thread will then wake the tasks in order when space becomes available. + */ +struct xfs_zone_reservation { + struct list_head entry; + struct task_struct *task; + xfs_filblks_t count_fsb; +}; + +/* + * Calculate the number of reserved blocks. + * + * XC_FREE_RTEXTENTS counts the user-available capacity, up to which the file + * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly + * available for writes without waiting for GC. + * + * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and + * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS + * is further restricted by at least one zone as well as the optional + * persistently reserved blocks. This allows the allocator to run more + * smoothly by not always triggering GC.
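+ * + * Roughly speaking: blocks counted in XC_FREE_RTEXTENTS but not (yet) + * in XC_FREE_RTAVAILABLE are those that GC still has to make writable + * again.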
+ */ +uint64_t +xfs_zoned_default_resblks( + struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + switch (ctr) { + case XC_FREE_RTEXTENTS: + return (uint64_t)XFS_RESERVED_ZONES * + mp->m_groups[XG_TYPE_RTG].blocks + + mp->m_sb.sb_rtreserved; + case XC_FREE_RTAVAILABLE: + return (uint64_t)XFS_GC_ZONES * + mp->m_groups[XG_TYPE_RTG].blocks; + default: + ASSERT(0); + return 0; + } +} + +void +xfs_zoned_resv_wake_all( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation *reservation; + + spin_lock(&zi->zi_reservation_lock); + list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) + wake_up_process(reservation->task); + spin_unlock(&zi->zi_reservation_lock); +} + +void +xfs_zoned_add_available( + struct xfs_mount *mp, + xfs_filblks_t count_fsb) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation *reservation; + + if (list_empty_careful(&zi->zi_reclaim_reservations)) { + xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb); + return; + } + + spin_lock(&zi->zi_reservation_lock); + xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb); + count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE); + list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) { + if (reservation->count_fsb > count_fsb) + break; + wake_up_process(reservation->task); + count_fsb -= reservation->count_fsb; + + } + spin_unlock(&zi->zi_reservation_lock); +} + +static int +xfs_zoned_space_wait_error( + struct xfs_mount *mp) +{ + if (xfs_is_shutdown(mp)) + return -EIO; + if (fatal_signal_pending(current)) + return -EINTR; + return 0; +} + +static int +xfs_zoned_reserve_available( + struct xfs_inode *ip, + xfs_filblks_t count_fsb, + unsigned int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation reservation = { + .task = current, + .count_fsb = count_fsb, + }; + int error; + + /* + * If there are no waiters, try to directly grab the available blocks + * from the percpu counter. + * + * If the caller wants to dip into the reserved pool also bypass the + * wait list. This relies on the fact that we have a very generously + * sized reserved pool that always has enough space. If the reserved + * allocations fail we're in trouble. + */ + if (likely(list_empty_careful(&zi->zi_reclaim_reservations) || + (flags & XFS_ZR_RESERVED))) { + error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb, + flags & XFS_ZR_RESERVED); + if (error != -ENOSPC) + return error; + } + + if (flags & XFS_ZR_NOWAIT) + return -EAGAIN; + + spin_lock(&zi->zi_reservation_lock); + list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations); + while ((error = xfs_zoned_space_wait_error(mp)) == 0) { + set_current_state(TASK_KILLABLE); + + error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb, + flags & XFS_ZR_RESERVED); + if (error != -ENOSPC) + break; + + /* + * Make sure to start GC if it is not running already. As we + * check the rtavailable count when filling up zones, GC is + * normally already running at this point, but in some setups + * with very few zones we may completely run out of non- + * reserved blocks in between filling zones. + */ + if (!xfs_is_zonegc_running(mp)) + wake_up_process(zi->zi_gc_thread); + + /* + * If there is no reclaimable group left and we aren't still + * processing a pending GC request, give up as we're fully out + * of space.
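+ * + * As long as GC is still flagged as running it may yet free up + * blocks, so only give up once it has gone idle with no reclaimable + * zones left.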
+ */ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) && + !xfs_is_zonegc_running(mp)) + break; + + spin_unlock(&zi->zi_reservation_lock); + schedule(); + spin_lock(&zi->zi_reservation_lock); + } + list_del(&reservation.entry); + spin_unlock(&zi->zi_reservation_lock); + + __set_current_state(TASK_RUNNING); + return error; +} + +/* + * Implement greedy space allocation for short writes by trying to grab all + * that is left after locking out other threads from trying to do the same. + * + * This isn't exactly optimal and can hopefully be replaced by a proper + * percpu_counter primitive one day. + */ +static int +xfs_zoned_reserve_extents_greedy( + struct xfs_inode *ip, + xfs_filblks_t *count_fsb, + unsigned int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_zone_info *zi = mp->m_zone_info; + s64 len = *count_fsb; + int error = -ENOSPC; + + spin_lock(&zi->zi_reservation_lock); + len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + if (len > 0) { + *count_fsb = len; + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb, + flags & XFS_ZR_RESERVED); + } + spin_unlock(&zi->zi_reservation_lock); + return error; +} + +int +xfs_zoned_space_reserve( + struct xfs_inode *ip, + xfs_filblks_t count_fsb, + unsigned int flags, + struct xfs_zone_alloc_ctx *ac) +{ + struct xfs_mount *mp = ip->i_mount; + int error; + + ASSERT(ac->reserved_blocks == 0); + ASSERT(ac->open_zone == NULL); + + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb, + flags & XFS_ZR_RESERVED); + if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1) + error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags); + if (error) + return error; + + error = xfs_zoned_reserve_available(ip, count_fsb, flags); + if (error) { + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb); + return error; + } + ac->reserved_blocks = count_fsb; + return 0; +} + +void +xfs_zoned_space_unreserve( + struct xfs_inode *ip, + struct xfs_zone_alloc_ctx *ac) +{ + if (ac->reserved_blocks > 0) { + struct xfs_mount *mp = ip->i_mount; + + xfs_zoned_add_available(mp, ac->reserved_blocks); + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks); + } + if (ac->open_zone) + xfs_open_zone_put(ac->open_zone); +}
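For orientation, a minimal sketch of how a caller might drive the reservation interface added above. The caller name and the elided write-out step are hypothetical; only xfs_zoned_space_reserve(), xfs_zoned_space_unreserve(), the XFS_ZR_* flags and struct xfs_zone_alloc_ctx come from this patch:

static int
example_zoned_write(			/* hypothetical caller */
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb)
{
	struct xfs_zone_alloc_ctx ac = { };
	int			error;

	/*
	 * Reserve the worst-case block count up front. This may sleep
	 * waiting for GC to free space unless XFS_ZR_NOWAIT is passed.
	 */
	error = xfs_zoned_space_reserve(ip, count_fsb, 0, &ac);
	if (error)
		return error;

	/* ... allocate from ac and submit the zoned write here ... */

	/* Return the unused reservation and drop the open zone reference. */
	xfs_zoned_space_unreserve(ip, &ac);
	return 0;
}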