summaryrefslogtreecommitdiff
path: root/fs/xfs/libxfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/libxfs')
-rw-r--r--fs/xfs/libxfs/xfs_ag.c258
-rw-r--r--fs/xfs/libxfs/xfs_ag.h205
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c25
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c129
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h19
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c30
-rw-r--r--fs/xfs/libxfs/xfs_attr.c3
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c487
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h15
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c111
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_btree.c482
-rw-r--r--fs/xfs/libxfs/xfs_btree.h33
-rw-r--r--fs/xfs/libxfs/xfs_btree_mem.c7
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.c10
-rw-r--r--fs/xfs/libxfs/xfs_defer.c6
-rw-r--r--fs/xfs/libxfs/xfs_defer.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c9
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h1
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c190
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h4
-rw-r--r--fs/xfs/libxfs/xfs_exchmaps.c4
-rw-r--r--fs/xfs/libxfs/xfs_format.h262
-rw-r--r--fs/xfs/libxfs/xfs_fs.h71
-rw-r--r--fs/xfs/libxfs/xfs_group.c225
-rw-r--r--fs/xfs/libxfs/xfs_group.h183
-rw-r--r--fs/xfs/libxfs/xfs_health.h93
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c195
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c35
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c178
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h3
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c201
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h6
-rw-r--r--fs/xfs/libxfs/xfs_inode_util.c7
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h29
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h6
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c4
-rw-r--r--fs/xfs/libxfs/xfs_metadir.c485
-rw-r--r--fs/xfs/libxfs/xfs_metadir.h47
-rw-r--r--fs/xfs/libxfs/xfs_metafile.c322
-rw-r--r--fs/xfs/libxfs/xfs_metafile.h44
-rw-r--r--fs/xfs/libxfs/xfs_ondisk.h192
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h43
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c297
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h25
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c17
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c206
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h18
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c28
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c397
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.h256
-rw-r--r--fs/xfs/libxfs/xfs_rtgroup.c750
-rw-r--r--fs/xfs/libxfs/xfs_rtgroup.h368
-rw-r--r--fs/xfs/libxfs/xfs_rtrefcount_btree.c757
-rw-r--r--fs/xfs/libxfs/xfs_rtrefcount_btree.h189
-rw-r--r--fs/xfs/libxfs/xfs_rtrmap_btree.c1054
-rw-r--r--fs/xfs/libxfs/xfs_rtrmap_btree.h212
-rw-r--r--fs/xfs/libxfs/xfs_sb.c375
-rw-r--r--fs/xfs/libxfs/xfs_sb.h6
-rw-r--r--fs/xfs/libxfs/xfs_shared.h25
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c4
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c6
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c364
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h25
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h13
-rw-r--r--fs/xfs/libxfs/xfs_types.c44
-rw-r--r--fs/xfs/libxfs/xfs_types.h51
-rw-r--r--fs/xfs/libxfs/xfs_zones.c186
-rw-r--r--fs/xfs/libxfs/xfs_zones.h35
69 files changed, 8726 insertions, 1647 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 5ca8d0106827..e6ba914f6d06 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -30,86 +30,7 @@
#include "xfs_trace.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
-
-
-/*
- * Passive reference counting access wrappers to the perag structures. If the
- * per-ag structure is to be freed, the freeing code is responsible for cleaning
- * up objects with passive references before freeing the structure. This is
- * things like cached buffers.
- */
-struct xfs_perag *
-xfs_perag_get(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
-
- rcu_read_lock();
- pag = xa_load(&mp->m_perags, agno);
- if (pag) {
- trace_xfs_perag_get(pag, _RET_IP_);
- ASSERT(atomic_read(&pag->pag_ref) >= 0);
- atomic_inc(&pag->pag_ref);
- }
- rcu_read_unlock();
- return pag;
-}
-
-/* Get a passive reference to the given perag. */
-struct xfs_perag *
-xfs_perag_hold(
- struct xfs_perag *pag)
-{
- ASSERT(atomic_read(&pag->pag_ref) > 0 ||
- atomic_read(&pag->pag_active_ref) > 0);
-
- trace_xfs_perag_hold(pag, _RET_IP_);
- atomic_inc(&pag->pag_ref);
- return pag;
-}
-
-void
-xfs_perag_put(
- struct xfs_perag *pag)
-{
- trace_xfs_perag_put(pag, _RET_IP_);
- ASSERT(atomic_read(&pag->pag_ref) > 0);
- atomic_dec(&pag->pag_ref);
-}
-
-/*
- * Active references for perag structures. This is for short term access to the
- * per ag structures for walking trees or accessing state. If an AG is being
- * shrunk or is offline, then this will fail to find that AG and return NULL
- * instead.
- */
-struct xfs_perag *
-xfs_perag_grab(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
-
- rcu_read_lock();
- pag = xa_load(&mp->m_perags, agno);
- if (pag) {
- trace_xfs_perag_grab(pag, _RET_IP_);
- if (!atomic_inc_not_zero(&pag->pag_active_ref))
- pag = NULL;
- }
- rcu_read_unlock();
- return pag;
-}
-
-void
-xfs_perag_rele(
- struct xfs_perag *pag)
-{
- trace_xfs_perag_rele(pag, _RET_IP_);
- if (atomic_dec_and_test(&pag->pag_active_ref))
- wake_up(&pag->pag_active_wq);
-}
+#include "xfs_group.h"
/*
* xfs_initialize_perag_data
@@ -184,6 +105,18 @@ out:
return error;
}
+static void
+xfs_perag_uninit(
+ struct xfs_group *xg)
+{
+#ifdef __KERNEL__
+ struct xfs_perag *pag = to_perag(xg);
+
+ cancel_delayed_work_sync(&pag->pag_blockgc_work);
+ xfs_buf_cache_destroy(&pag->pag_bcache);
+#endif
+}
+
/*
* Free up the per-ag resources within the specified AG range.
*/
@@ -196,22 +129,8 @@ xfs_free_perag_range(
{
xfs_agnumber_t agno;
- for (agno = first_agno; agno < end_agno; agno++) {
- struct xfs_perag *pag = xa_erase(&mp->m_perags, agno);
-
- ASSERT(pag);
- XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
- xfs_defer_drain_free(&pag->pag_intents_drain);
-
- cancel_delayed_work_sync(&pag->pag_blockgc_work);
- xfs_buf_cache_destroy(&pag->pag_bcache);
-
- /* drop the mount's active reference */
- xfs_perag_rele(pag);
- XFS_IS_CORRUPT(pag->pag_mount,
- atomic_read(&pag->pag_active_ref) != 0);
- kfree_rcu_mightsleep(pag);
- }
+ for (agno = first_agno; agno < end_agno; agno++)
+ xfs_group_free(mp, agno, XG_TYPE_AG, xfs_perag_uninit);
}
/* Find the size of the AG, in blocks. */
@@ -273,6 +192,10 @@ xfs_agino_range(
return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
}
+/*
+ * Update the perag of the previous tail AG if it has been changed during
+ * recovery (i.e. recovery of a growfs).
+ */
int
xfs_update_last_ag_size(
struct xfs_mount *mp,
@@ -282,88 +205,88 @@ xfs_update_last_ag_size(
if (!pag)
return -EFSCORRUPTED;
- pag->block_count = __xfs_ag_block_count(mp, prev_agcount - 1,
- mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks);
- __xfs_agino_range(mp, pag->block_count, &pag->agino_min,
+ pag_group(pag)->xg_block_count = __xfs_ag_block_count(mp,
+ prev_agcount - 1, mp->m_sb.sb_agcount,
+ mp->m_sb.sb_dblocks);
+ __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min,
&pag->agino_max);
xfs_perag_rele(pag);
return 0;
}
-int
-xfs_initialize_perag(
+static int
+xfs_perag_alloc(
struct xfs_mount *mp,
- xfs_agnumber_t old_agcount,
- xfs_agnumber_t new_agcount,
- xfs_rfsblock_t dblocks,
- xfs_agnumber_t *maxagi)
+ xfs_agnumber_t index,
+ xfs_agnumber_t agcount,
+ xfs_rfsblock_t dblocks)
{
struct xfs_perag *pag;
- xfs_agnumber_t index;
int error;
- for (index = old_agcount; index < new_agcount; index++) {
- pag = kzalloc(sizeof(*pag), GFP_KERNEL);
- if (!pag) {
- error = -ENOMEM;
- goto out_unwind_new_pags;
- }
- pag->pag_agno = index;
- pag->pag_mount = mp;
-
- error = xa_insert(&mp->m_perags, index, pag, GFP_KERNEL);
- if (error) {
- WARN_ON_ONCE(error == -EBUSY);
- goto out_free_pag;
- }
+ pag = kzalloc(sizeof(*pag), GFP_KERNEL);
+ if (!pag)
+ return -ENOMEM;
#ifdef __KERNEL__
- /* Place kernel structure only init below this point. */
- spin_lock_init(&pag->pag_ici_lock);
- spin_lock_init(&pag->pagb_lock);
- spin_lock_init(&pag->pag_state_lock);
- INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
- INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
- xfs_defer_drain_init(&pag->pag_intents_drain);
- init_waitqueue_head(&pag->pagb_wait);
- init_waitqueue_head(&pag->pag_active_wq);
- pag->pagb_count = 0;
- pag->pagb_tree = RB_ROOT;
- xfs_hooks_init(&pag->pag_rmap_update_hooks);
+ /* Place kernel structure only init below this point. */
+ spin_lock_init(&pag->pag_ici_lock);
+ INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
+ INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
#endif /* __KERNEL__ */
- error = xfs_buf_cache_init(&pag->pag_bcache);
- if (error)
- goto out_remove_pag;
-
- /* Active ref owned by mount indicates AG is online. */
- atomic_set(&pag->pag_active_ref, 1);
+ error = xfs_buf_cache_init(&pag->pag_bcache);
+ if (error)
+ goto out_free_perag;
- /*
- * Pre-calculated geometry
- */
- pag->block_count = __xfs_ag_block_count(mp, index, new_agcount,
+ /*
+ * Pre-calculated geometry
+ */
+ pag_group(pag)->xg_block_count = __xfs_ag_block_count(mp, index, agcount,
dblocks);
- pag->min_block = XFS_AGFL_BLOCK(mp);
- __xfs_agino_range(mp, pag->block_count, &pag->agino_min,
- &pag->agino_max);
- }
+ pag_group(pag)->xg_min_gbno = XFS_AGFL_BLOCK(mp) + 1;
+ __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min,
+ &pag->agino_max);
- index = xfs_set_inode_alloc(mp, new_agcount);
+ error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG);
+ if (error)
+ goto out_buf_cache_destroy;
- if (maxagi)
- *maxagi = index;
+ return 0;
+
+out_buf_cache_destroy:
+ xfs_buf_cache_destroy(&pag->pag_bcache);
+out_free_perag:
+ kfree(pag);
+ return error;
+}
+int
+xfs_initialize_perag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t orig_agcount,
+ xfs_agnumber_t new_agcount,
+ xfs_rfsblock_t dblocks,
+ xfs_agnumber_t *maxagi)
+{
+ xfs_agnumber_t index;
+ int error;
+
+ if (orig_agcount >= new_agcount)
+ return 0;
+
+ for (index = orig_agcount; index < new_agcount; index++) {
+ error = xfs_perag_alloc(mp, index, new_agcount, dblocks);
+ if (error)
+ goto out_unwind_new_pags;
+ }
+
+ *maxagi = xfs_set_inode_alloc(mp, new_agcount);
mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
return 0;
-out_remove_pag:
- xfs_defer_drain_free(&pag->pag_intents_drain);
- pag = xa_erase(&mp->m_perags, index);
-out_free_pag:
- kfree(pag);
out_unwind_new_pags:
- xfs_free_perag_range(mp, old_agcount, index);
+ xfs_free_perag_range(mp, orig_agcount, index);
return error;
}
@@ -378,7 +301,7 @@ xfs_get_aghdr_buf(
struct xfs_buf *bp;
int error;
- error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp);
+ error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, &bp);
if (error)
return error;
@@ -818,7 +741,7 @@ xfs_ag_shrink_space(
struct xfs_trans **tpp,
xfs_extlen_t delta)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_alloc_arg args = {
.tp = *tpp,
.mp = mp,
@@ -835,7 +758,7 @@ xfs_ag_shrink_space(
xfs_agblock_t aglen;
int error, err2;
- ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1);
+ ASSERT(pag_agno(pag) == mp->m_sb.sb_agcount - 1);
error = xfs_ialloc_read_agi(pag, *tpp, 0, &agibp);
if (error)
return error;
@@ -872,7 +795,7 @@ xfs_ag_shrink_space(
/* internal log shouldn't also show up in the free space btrees */
error = xfs_alloc_vextent_exact_bno(&args,
- XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta));
+ xfs_agbno_to_fsb(pag, aglen - delta));
if (!error && args.agbno == NULLAGBLOCK)
error = -ENOSPC;
@@ -931,9 +854,9 @@ xfs_ag_shrink_space(
}
/* Update perag geometry */
- pag->block_count -= delta;
- __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min,
- &pag->agino_max);
+ pag_group(pag)->xg_block_count -= delta;
+ __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min,
+ &pag->agino_max);
xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH);
xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH);
@@ -958,12 +881,13 @@ xfs_ag_extend_space(
struct xfs_trans *tp,
xfs_extlen_t len)
{
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_buf *bp;
struct xfs_agi *agi;
struct xfs_agf *agf;
int error;
- ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1);
+ ASSERT(pag_agno(pag) == mp->m_sb.sb_agcount - 1);
error = xfs_ialloc_read_agi(pag, tp, 0, &bp);
if (error)
@@ -1002,9 +926,9 @@ xfs_ag_extend_space(
return error;
/* Update perag geometry */
- pag->block_count = be32_to_cpu(agf->agf_length);
- __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min,
- &pag->agino_max);
+ pag_group(pag)->xg_block_count = be32_to_cpu(agf->agf_length);
+ __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min,
+ &pag->agino_max);
return 0;
}
@@ -1031,7 +955,7 @@ xfs_ag_get_geometry(
/* Fill out form. */
memset(ageo, 0, sizeof(*ageo));
- ageo->ag_number = pag->pag_agno;
+ ageo->ag_number = pag_agno(pag);
agi = agi_bp->b_addr;
ageo->ag_icount = be32_to_cpu(agi->agi_count);
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 9edfe0e96439..1f24cfa27321 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -7,6 +7,8 @@
#ifndef __LIBXFS_AG_H
#define __LIBXFS_AG_H 1
+#include "xfs_group.h"
+
struct xfs_mount;
struct xfs_trans;
struct xfs_perag;
@@ -30,11 +32,7 @@ struct xfs_ag_resv {
* performance of allocation group selection.
*/
struct xfs_perag {
- struct xfs_mount *pag_mount; /* owner filesystem */
- xfs_agnumber_t pag_agno; /* AG this structure belongs to */
- atomic_t pag_ref; /* passive reference count */
- atomic_t pag_active_ref; /* active reference count */
- wait_queue_head_t pag_active_wq;/* woken active_ref falls to zero */
+ struct xfs_group pag_group;
unsigned long pag_opstate;
uint8_t pagf_bno_level; /* # of levels in bno btree */
uint8_t pagf_cnt_level; /* # of levels in cnt btree */
@@ -55,7 +53,6 @@ struct xfs_perag {
xfs_agino_t pagl_leftrec;
xfs_agino_t pagl_rightrec;
- int pagb_count; /* pagb slots in use */
uint8_t pagf_refcount_level; /* recount btree height */
/* Blocks reserved for all kinds of metadata. */
@@ -64,21 +61,12 @@ struct xfs_perag {
struct xfs_ag_resv pag_rmapbt_resv;
/* Precalculated geometry info */
- xfs_agblock_t block_count;
- xfs_agblock_t min_block;
xfs_agino_t agino_min;
xfs_agino_t agino_max;
#ifdef __KERNEL__
/* -- kernel only structures below this line -- */
- /*
- * Bitsets of per-ag metadata that have been checked and/or are sick.
- * Callers should hold pag_state_lock before accessing this field.
- */
- uint16_t pag_checked;
- uint16_t pag_sick;
-
#ifdef CONFIG_XFS_ONLINE_REPAIR
/*
* Alternate btree heights so that online repair won't trip the write
@@ -90,13 +78,6 @@ struct xfs_perag {
uint8_t pagf_repair_rmap_level;
#endif
- spinlock_t pag_state_lock;
-
- spinlock_t pagb_lock; /* lock for pagb_tree */
- struct rb_root pagb_tree; /* ordered tree of busy extents */
- unsigned int pagb_gen; /* generation count for pagb_tree */
- wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */
-
atomic_t pagf_fstrms; /* # of filestreams active in this AG */
spinlock_t pag_ici_lock; /* incore inode cache lock */
@@ -108,21 +89,29 @@ struct xfs_perag {
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
-
- /*
- * We use xfs_drain to track the number of deferred log intent items
- * that have been queued (but not yet processed) so that waiters (e.g.
- * scrub) will not lock resources when other threads are in the middle
- * of processing a chain of intent items only to find momentary
- * inconsistencies.
- */
- struct xfs_defer_drain pag_intents_drain;
-
- /* Hook to feed rmapbt updates to an active online repair. */
- struct xfs_hooks pag_rmap_update_hooks;
#endif /* __KERNEL__ */
};
+static inline struct xfs_perag *to_perag(struct xfs_group *xg)
+{
+ return container_of(xg, struct xfs_perag, pag_group);
+}
+
+static inline struct xfs_group *pag_group(struct xfs_perag *pag)
+{
+ return &pag->pag_group;
+}
+
+static inline struct xfs_mount *pag_mount(const struct xfs_perag *pag)
+{
+ return pag->pag_group.xg_mount;
+}
+
+static inline xfs_agnumber_t pag_agno(const struct xfs_perag *pag)
+{
+ return pag->pag_group.xg_gno;
+}
+
/*
* Per-AG operational state. These are atomic flag bits.
*/
@@ -144,8 +133,8 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA)
__XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES)
__XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET)
-int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t old_agcount,
- xfs_agnumber_t agcount, xfs_rfsblock_t dcount,
+int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t orig_agcount,
+ xfs_agnumber_t new_agcount, xfs_rfsblock_t dcount,
xfs_agnumber_t *maxagi);
void xfs_free_perag_range(struct xfs_mount *mp, xfs_agnumber_t first_agno,
xfs_agnumber_t end_agno);
@@ -153,13 +142,71 @@ int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
int xfs_update_last_ag_size(struct xfs_mount *mp, xfs_agnumber_t prev_agcount);
/* Passive AG references */
-struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
-struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag);
-void xfs_perag_put(struct xfs_perag *pag);
+static inline struct xfs_perag *
+xfs_perag_get(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ return to_perag(xfs_group_get(mp, agno, XG_TYPE_AG));
+}
+
+static inline struct xfs_perag *
+xfs_perag_hold(
+ struct xfs_perag *pag)
+{
+ return to_perag(xfs_group_hold(pag_group(pag)));
+}
+
+static inline void
+xfs_perag_put(
+ struct xfs_perag *pag)
+{
+ xfs_group_put(pag_group(pag));
+}
/* Active AG references */
-struct xfs_perag *xfs_perag_grab(struct xfs_mount *, xfs_agnumber_t);
-void xfs_perag_rele(struct xfs_perag *pag);
+static inline struct xfs_perag *
+xfs_perag_grab(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ return to_perag(xfs_group_grab(mp, agno, XG_TYPE_AG));
+}
+
+static inline void
+xfs_perag_rele(
+ struct xfs_perag *pag)
+{
+ xfs_group_rele(pag_group(pag));
+}
+
+static inline struct xfs_perag *
+xfs_perag_next_range(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ xfs_agnumber_t start_agno,
+ xfs_agnumber_t end_agno)
+{
+ return to_perag(xfs_group_next_range(mp, pag ? pag_group(pag) : NULL,
+ start_agno, end_agno, XG_TYPE_AG));
+}
+
+static inline struct xfs_perag *
+xfs_perag_next_from(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ xfs_agnumber_t start_agno)
+{
+ return xfs_perag_next_range(mp, pag, start_agno, mp->m_sb.sb_agcount - 1);
+}
+
+static inline struct xfs_perag *
+xfs_perag_next(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag)
+{
+ return xfs_perag_next_from(mp, pag, 0);
+}
/*
* Per-ag geometry infomation and validation
@@ -171,11 +218,7 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
static inline bool
xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno)
{
- if (agbno >= pag->block_count)
- return false;
- if (agbno <= pag->min_block)
- return false;
- return true;
+ return xfs_verify_gbno(pag_group(pag), agbno);
}
static inline bool
@@ -184,13 +227,7 @@ xfs_verify_agbext(
xfs_agblock_t agbno,
xfs_agblock_t len)
{
- if (agbno + len <= agbno)
- return false;
-
- if (!xfs_verify_agbno(pag, agbno))
- return false;
-
- return xfs_verify_agbno(pag, agbno + len - 1);
+ return xfs_verify_gbext(pag_group(pag), agbno, len);
}
/*
@@ -226,40 +263,6 @@ xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno)
agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
}
-/*
- * Perag iteration APIs
- */
-static inline struct xfs_perag *
-xfs_perag_next(
- struct xfs_perag *pag,
- xfs_agnumber_t *agno,
- xfs_agnumber_t end_agno)
-{
- struct xfs_mount *mp = pag->pag_mount;
-
- *agno = pag->pag_agno + 1;
- xfs_perag_rele(pag);
- while (*agno <= end_agno) {
- pag = xfs_perag_grab(mp, *agno);
- if (pag)
- return pag;
- (*agno)++;
- }
- return NULL;
-}
-
-#define for_each_perag_range(mp, agno, end_agno, pag) \
- for ((pag) = xfs_perag_grab((mp), (agno)); \
- (pag) != NULL; \
- (pag) = xfs_perag_next((pag), &(agno), (end_agno)))
-
-#define for_each_perag_from(mp, agno, pag) \
- for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag))
-
-#define for_each_perag(mp, agno, pag) \
- (agno) = 0; \
- for_each_perag_from((mp), (agno), (pag))
-
static inline struct xfs_perag *
xfs_perag_next_wrap(
struct xfs_perag *pag,
@@ -268,9 +271,9 @@ xfs_perag_next_wrap(
xfs_agnumber_t restart_agno,
xfs_agnumber_t wrap_agno)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
- *agno = pag->pag_agno + 1;
+ *agno = pag_agno(pag) + 1;
xfs_perag_rele(pag);
while (*agno != stop_agno) {
if (*agno >= wrap_agno) {
@@ -332,4 +335,28 @@ int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp,
xfs_extlen_t len);
int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
+static inline xfs_fsblock_t
+xfs_agbno_to_fsb(
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno)
+{
+ return XFS_AGB_TO_FSB(pag_mount(pag), pag_agno(pag), agbno);
+}
+
+static inline xfs_daddr_t
+xfs_agbno_to_daddr(
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno)
+{
+ return XFS_AGB_TO_DADDR(pag_mount(pag), pag_agno(pag), agbno);
+}
+
+static inline xfs_ino_t
+xfs_agino_to_ino(
+ struct xfs_perag *pag,
+ xfs_agino_t agino)
+{
+ return XFS_AGINO_TO_INO(pag_mount(pag), pag_agno(pag), agino);
+}
+
#endif /* __LIBXFS_AG_H */
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 216423df939e..fb79215a509d 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -70,6 +70,7 @@ xfs_ag_resv_critical(
struct xfs_perag *pag,
enum xfs_ag_resv_type type)
{
+ struct xfs_mount *mp = pag_mount(pag);
xfs_extlen_t avail;
xfs_extlen_t orig;
@@ -92,8 +93,8 @@ xfs_ag_resv_critical(
/* Critically low if less than 10% or max btree height remains. */
return XFS_TEST_ERROR(avail < orig / 10 ||
- avail < pag->pag_mount->m_agbtree_maxlevels,
- pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
+ avail < mp->m_agbtree_maxlevels,
+ mp, XFS_ERRTAG_AG_RESV_CRITICAL);
}
/*
@@ -113,6 +114,7 @@ xfs_ag_resv_needed(
case XFS_AG_RESV_RMAPBT:
len -= xfs_perag_resv(pag, type)->ar_reserved;
break;
+ case XFS_AG_RESV_METAFILE:
case XFS_AG_RESV_NONE:
/* empty */
break;
@@ -137,8 +139,8 @@ __xfs_ag_resv_free(
trace_xfs_ag_resv_free(pag, type, 0);
resv = xfs_perag_resv(pag, type);
- if (pag->pag_agno == 0)
- pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+ if (pag_agno(pag) == 0)
+ pag_mount(pag)->m_ag_max_usable += resv->ar_asked;
/*
* RMAPBT blocks come from the AGFL and AGFL blocks are always
* considered "free", so whatever was reserved at mount time must be
@@ -148,7 +150,7 @@ __xfs_ag_resv_free(
oldresv = resv->ar_orig_reserved;
else
oldresv = resv->ar_reserved;
- xfs_add_fdblocks(pag->pag_mount, oldresv);
+ xfs_add_fdblocks(pag_mount(pag), oldresv);
resv->ar_reserved = 0;
resv->ar_asked = 0;
resv->ar_orig_reserved = 0;
@@ -170,7 +172,7 @@ __xfs_ag_resv_init(
xfs_extlen_t ask,
xfs_extlen_t used)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_ag_resv *resv;
int error;
xfs_extlen_t hidden_space;
@@ -206,11 +208,10 @@ __xfs_ag_resv_init(
else
error = xfs_dec_fdblocks(mp, hidden_space, true);
if (error) {
- trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
- error, _RET_IP_);
+ trace_xfs_ag_resv_init_error(pag, error, _RET_IP_);
xfs_warn(mp,
"Per-AG reservation for AG %u failed. Filesystem may run out of space.",
- pag->pag_agno);
+ pag_agno(pag));
return error;
}
@@ -220,7 +221,7 @@ __xfs_ag_resv_init(
* counter, we only make the adjustment for AG 0. This assumes that
* there aren't any AGs hungrier for per-AG reservation than AG 0.
*/
- if (pag->pag_agno == 0)
+ if (pag_agno(pag) == 0)
mp->m_ag_max_usable -= ask;
resv = xfs_perag_resv(pag, type);
@@ -238,7 +239,7 @@ xfs_ag_resv_init(
struct xfs_perag *pag,
struct xfs_trans *tp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
xfs_extlen_t ask;
xfs_extlen_t used;
int error = 0, error2;
@@ -347,6 +348,7 @@ xfs_ag_resv_alloc_extent(
switch (type) {
case XFS_AG_RESV_AGFL:
+ case XFS_AG_RESV_METAFILE:
return;
case XFS_AG_RESV_METADATA:
case XFS_AG_RESV_RMAPBT:
@@ -389,6 +391,7 @@ xfs_ag_resv_free_extent(
switch (type) {
case XFS_AG_RESV_AGFL:
+ case XFS_AG_RESV_METAFILE:
return;
case XFS_AG_RESV_METADATA:
case XFS_AG_RESV_RMAPBT:
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 04f64cf9777e..7839efe050bf 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -33,8 +33,6 @@ struct kmem_cache *xfs_extfree_item_cache;
struct workqueue_struct *xfs_alloc_wq;
-#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
-
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
@@ -275,7 +273,7 @@ xfs_alloc_complain_bad_rec(
xfs_warn(mp,
"%sbt record corruption in AG %d detected at %pS!",
- cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
+ cur->bc_ops->name, cur->bc_group->xg_gno, fa);
xfs_warn(mp,
"start block 0x%x block count 0x%x", irec->ar_startblock,
irec->ar_blockcount);
@@ -303,7 +301,7 @@ xfs_alloc_get_rec(
return error;
xfs_alloc_btrec_to_irec(rec, &irec);
- fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
+ fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec);
if (fa)
return xfs_alloc_complain_bad_rec(cur, fa, &irec);
@@ -331,7 +329,8 @@ xfs_alloc_compute_aligned(
bool busy;
/* Trim busy sections out of found extent */
- busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
+ busy = xfs_extent_busy_trim(pag_group(args->pag), args->minlen,
+ args->maxlen, &bno, &len, busy_gen);
/*
* If we have a largish extent that happens to start before min_agbno,
@@ -409,8 +408,8 @@ xfs_alloc_compute_diff(
if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
if (newlen1 < newlen2 ||
(newlen1 == newlen2 &&
- XFS_ABSDIFF(newbno1, wantbno) >
- XFS_ABSDIFF(newbno2, wantbno)))
+ abs_diff(newbno1, wantbno) >
+ abs_diff(newbno2, wantbno)))
newbno1 = newbno2;
} else if (newbno2 != NULLAGBLOCK)
newbno1 = newbno2;
@@ -426,7 +425,7 @@ xfs_alloc_compute_diff(
} else
newbno1 = freeend - wantlen;
*newbnop = newbno1;
- return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+ return newbno1 == NULLAGBLOCK ? 0 : abs_diff(newbno1, wantbno);
}
/*
@@ -539,7 +538,7 @@ static int
xfs_alloc_fixup_longest(
struct xfs_btree_cur *cnt_cur)
{
- struct xfs_perag *pag = cnt_cur->bc_ag.pag;
+ struct xfs_perag *pag = to_perag(cnt_cur->bc_group);
struct xfs_buf *bp = cnt_cur->bc_ag.agbp;
struct xfs_agf *agf = bp->b_addr;
xfs_extlen_t longest = 0;
@@ -799,7 +798,7 @@ xfs_agfl_verify(
* use it by using uncached buffers that don't have the perag attached
* so we can detect and avoid this problem.
*/
- if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+ if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != pag_agno((bp->b_pag)))
return __this_address;
for (i = 0; i < xfs_agfl_size(mp); i++) {
@@ -879,13 +878,12 @@ xfs_alloc_read_agfl(
struct xfs_trans *tp,
struct xfs_buf **bpp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_buf *bp;
int error;
- error = xfs_trans_read_buf(
- mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
@@ -1252,14 +1250,14 @@ xfs_alloc_ag_vextent_small(
if (fbno == NULLAGBLOCK)
goto out;
- xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
+ xfs_extent_busy_reuse(pag_group(args->pag), fbno, 1,
(args->datatype & XFS_ALLOC_NOBUSY));
if (args->datatype & XFS_ALLOC_USERDATA) {
struct xfs_buf *bp;
error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
+ xfs_agbno_to_daddr(args->pag, fbno),
args->mp->m_bsize, 0, &bp);
if (error)
goto error;
@@ -1365,7 +1363,8 @@ xfs_alloc_ag_vextent_exact(
*/
tbno = fbno;
tlen = flen;
- xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
+ xfs_extent_busy_trim(pag_group(args->pag), args->minlen, args->maxlen,
+ &tbno, &tlen, &busy_gen);
/*
* Give up if the start of the extent is busy, or the freespace isn't
@@ -1758,8 +1757,9 @@ restart:
* the allocation can be retried.
*/
trace_xfs_alloc_near_busy(args);
- error = xfs_extent_busy_flush(args->tp, args->pag,
- acur.busy_gen, alloc_flags);
+ error = xfs_extent_busy_flush(args->tp,
+ pag_group(args->pag), acur.busy_gen,
+ alloc_flags);
if (error)
goto out;
@@ -1874,8 +1874,9 @@ restart:
* the allocation can be retried.
*/
trace_xfs_alloc_size_busy(args);
- error = xfs_extent_busy_flush(args->tp, args->pag,
- busy_gen, alloc_flags);
+ error = xfs_extent_busy_flush(args->tp,
+ pag_group(args->pag), busy_gen,
+ alloc_flags);
if (error)
goto error0;
@@ -1923,7 +1924,7 @@ restart:
error = -EFSCORRUPTED;
goto error0;
}
- if (flen < bestrlen)
+ if (flen <= bestrlen)
break;
busy = xfs_alloc_compute_aligned(args, fbno, flen,
&rbno, &rlen, &busy_gen);
@@ -1973,8 +1974,9 @@ restart:
* the allocation can be retried.
*/
trace_xfs_alloc_size_busy(args);
- error = xfs_extent_busy_flush(args->tp, args->pag,
- busy_gen, alloc_flags);
+ error = xfs_extent_busy_flush(args->tp,
+ pag_group(args->pag), busy_gen,
+ alloc_flags);
if (error)
goto error0;
@@ -2037,7 +2039,6 @@ int
xfs_free_ag_extent(
struct xfs_trans *tp,
struct xfs_buf *agbp,
- xfs_agnumber_t agno,
xfs_agblock_t bno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo,
@@ -2358,19 +2359,19 @@ xfs_free_ag_extent(
* Update the freespace totals in the ag and superblock.
*/
error = xfs_alloc_update_counters(tp, agbp, len);
- xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
+ xfs_ag_resv_free_extent(pag, type, tp, len);
if (error)
goto error0;
XFS_STATS_INC(mp, xs_freex);
XFS_STATS_ADD(mp, xs_freeb, len);
- trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
+ trace_xfs_free_extent(pag, bno, len, type, haveleft, haveright);
return 0;
error0:
- trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
+ trace_xfs_free_extent(pag, bno, len, type, -1, -1);
if (bno_cur)
xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
if (cnt_cur)
@@ -2429,7 +2430,7 @@ xfs_alloc_longest_free_extent(
* reservations and AGFL rules in place, we can return this extent.
*/
if (pag->pagf_longest > delta)
- return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
+ return min_t(xfs_extlen_t, pag_mount(pag)->m_ag_max_usable,
pag->pagf_longest - delta);
/* Otherwise, let the caller try for 1 block if there's space. */
@@ -2612,7 +2613,7 @@ xfs_agfl_reset(
xfs_warn(mp,
"WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
"Please unmount and run xfs_repair.",
- pag->pag_agno, pag->pagf_flcount);
+ pag_agno(pag), pag->pagf_flcount);
agf->agf_flfirst = 0;
agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
@@ -2645,8 +2646,17 @@ xfs_defer_extent_free(
ASSERT(!isnullstartblock(bno));
ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS));
- if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
- return -EFSCORRUPTED;
+ if (free_flags & XFS_FREE_EXTENT_REALTIME) {
+ if (type != XFS_AG_RESV_NONE) {
+ ASSERT(type == XFS_AG_RESV_NONE);
+ return -EFSCORRUPTED;
+ }
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_rtbext(mp, bno, len)))
+ return -EFSCORRUPTED;
+ } else {
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
+ return -EFSCORRUPTED;
+ }
xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
GFP_KERNEL | __GFP_NOFAIL);
@@ -2655,6 +2665,8 @@ xfs_defer_extent_free(
xefi->xefi_agresv = type;
if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD)
xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+ if (free_flags & XFS_FREE_EXTENT_REALTIME)
+ xefi->xefi_flags |= XFS_EFI_REALTIME;
if (oinfo) {
ASSERT(oinfo->oi_offset == 0);
@@ -2934,9 +2946,8 @@ xfs_alloc_fix_freelist(
* Deferring the free disconnects freeing up the AGFL slot from
* freeing the block.
*/
- error = xfs_free_extent_later(tp,
- XFS_AGB_TO_FSB(mp, args->agno, bno), 1,
- &targs.oinfo, XFS_AG_RESV_AGFL, 0);
+ error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, bno),
+ 1, &targs.oinfo, XFS_AG_RESV_AGFL, 0);
if (error)
goto out_agbp_relse;
}
@@ -3156,8 +3167,6 @@ xfs_alloc_put_freelist(
logflags |= XFS_AGF_BTREEBLKS;
}
- xfs_alloc_log_agf(tp, agbp, logflags);
-
ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
agfl_bno = xfs_buf_to_agfl_bno(agflbp);
@@ -3190,7 +3199,7 @@ xfs_validate_ag_length(
* use it by using uncached buffers that don't have the perag attached
* so we can detect and avoid this problem.
*/
- if (bp->b_pag && seqno != bp->b_pag->pag_agno)
+ if (bp->b_pag && seqno != pag_agno(bp->b_pag))
return __this_address;
/*
@@ -3359,13 +3368,13 @@ xfs_read_agf(
int flags,
struct xfs_buf **agfbpp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
int error;
- trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
+ trace_xfs_read_agf(pag);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
+ XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
@@ -3388,12 +3397,13 @@ xfs_alloc_read_agf(
int flags,
struct xfs_buf **agfbpp)
{
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_buf *agfbp;
struct xfs_agf *agf;
int error;
int allocbt_blks;
- trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
+ trace_xfs_alloc_read_agf(pag);
/* We don't support trylock when freeing. */
ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
@@ -3414,7 +3424,7 @@ xfs_alloc_read_agf(
pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
- if (xfs_agfl_needs_reset(pag->pag_mount, agf))
+ if (xfs_agfl_needs_reset(mp, agf))
set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
else
clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
@@ -3427,16 +3437,15 @@ xfs_alloc_read_agf(
* counter only tracks non-root blocks.
*/
allocbt_blks = pag->pagf_btreeblks;
- if (xfs_has_rmapbt(pag->pag_mount))
+ if (xfs_has_rmapbt(mp))
allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
if (allocbt_blks > 0)
- atomic64_add(allocbt_blks,
- &pag->pag_mount->m_allocbt_blks);
+ atomic64_add(allocbt_blks, &mp->m_allocbt_blks);
set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
}
#ifdef DEBUG
- else if (!xfs_is_shutdown(pag->pag_mount)) {
+ else if (!xfs_is_shutdown(mp)) {
ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
@@ -3597,7 +3606,7 @@ xfs_alloc_vextent_finish(
goto out_drop_perag;
}
- args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+ args->fsbno = xfs_agbno_to_fsb(args->pag, args->agbno);
ASSERT(args->len >= args->minlen);
ASSERT(args->len <= args->maxlen);
@@ -3618,8 +3627,8 @@ xfs_alloc_vextent_finish(
if (error)
goto out_drop_perag;
- ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno,
- args->len));
+ ASSERT(!xfs_extent_busy_search(pag_group(args->pag),
+ args->agbno, args->len));
}
xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
@@ -3649,21 +3658,20 @@ xfs_alloc_vextent_this_ag(
struct xfs_alloc_arg *args,
xfs_agnumber_t agno)
{
- struct xfs_mount *mp = args->mp;
xfs_agnumber_t minimum_agno;
uint32_t alloc_flags = 0;
int error;
ASSERT(args->pag != NULL);
- ASSERT(args->pag->pag_agno == agno);
+ ASSERT(pag_agno(args->pag) == agno);
args->agno = agno;
args->agbno = 0;
trace_xfs_alloc_vextent_this_ag(args);
- error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0),
- &minimum_agno);
+ error = xfs_alloc_vextent_check_args(args,
+ xfs_agbno_to_fsb(args->pag, 0), &minimum_agno);
if (error) {
if (error == -ENOSPC)
return 0;
@@ -3868,7 +3876,7 @@ xfs_alloc_vextent_exact_bno(
int error;
ASSERT(args->pag != NULL);
- ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
+ ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));
args->agno = XFS_FSB_TO_AGNO(mp, target);
args->agbno = XFS_FSB_TO_AGBNO(mp, target);
@@ -3907,7 +3915,7 @@ xfs_alloc_vextent_near_bno(
int error;
if (!needs_perag)
- ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
+ ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));
args->agno = XFS_FSB_TO_AGNO(mp, target);
args->agbno = XFS_FSB_TO_AGBNO(mp, target);
@@ -3944,7 +3952,7 @@ xfs_free_extent_fix_freelist(
memset(&args, 0, sizeof(struct xfs_alloc_arg));
args.tp = tp;
args.mp = tp->t_mountp;
- args.agno = pag->pag_agno;
+ args.agno = pag_agno(pag);
args.pag = pag;
/*
@@ -4012,14 +4020,13 @@ __xfs_free_extent(
goto err_release;
}
- error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo,
- type);
+ error = xfs_free_ag_extent(tp, agbp, agbno, len, oinfo, type);
if (error)
goto err_release;
if (skip_discard)
busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
- xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
+ xfs_extent_busy_insert(tp, pag_group(pag), agbno, len, busy_flags);
return 0;
err_release:
@@ -4044,7 +4051,7 @@ xfs_alloc_query_range_helper(
xfs_failaddr_t fa;
xfs_alloc_btrec_to_irec(rec, &irec);
- fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
+ fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec);
if (fa)
return xfs_alloc_complain_bad_rec(cur, fa, &irec);
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 0165452e7cd0..50ef79a1ed41 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -79,9 +79,8 @@ int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
struct xfs_buf *agfbp, struct xfs_buf *agflbp,
xfs_agblock_t bno, int btreeblk);
int xfs_free_ag_extent(struct xfs_trans *tp, struct xfs_buf *agbp,
- xfs_agnumber_t agno, xfs_agblock_t bno,
- xfs_extlen_t len, const struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type type);
+ xfs_agblock_t bno, xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
/*
* Compute and fill in value of m_alloc_maxlevels.
@@ -238,7 +237,11 @@ int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
/* Don't issue a discard for the blocks freed. */
#define XFS_FREE_EXTENT_SKIP_DISCARD (1U << 0)
-#define XFS_FREE_EXTENT_ALL_FLAGS (XFS_FREE_EXTENT_SKIP_DISCARD)
+/* Free blocks on the realtime device. */
+#define XFS_FREE_EXTENT_REALTIME (1U << 1)
+
+#define XFS_FREE_EXTENT_ALL_FLAGS (XFS_FREE_EXTENT_SKIP_DISCARD | \
+ XFS_FREE_EXTENT_REALTIME)
/*
* List of extents to be free "later".
@@ -249,7 +252,7 @@ struct xfs_extent_free_item {
uint64_t xefi_owner;
xfs_fsblock_t xefi_startblock;/* starting fs block number */
xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
- struct xfs_perag *xefi_pag;
+ struct xfs_group *xefi_group;
unsigned int xefi_flags;
enum xfs_ag_resv_type xefi_agresv;
};
@@ -258,6 +261,12 @@ struct xfs_extent_free_item {
#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
#define XFS_EFI_CANCELLED (1U << 3) /* dont actually free the space */
+#define XFS_EFI_REALTIME (1U << 4) /* freeing realtime extent */
+
+static inline bool xfs_efi_is_realtime(const struct xfs_extent_free_item *xefi)
+{
+ return xefi->xefi_flags & XFS_EFI_REALTIME;
+}
struct xfs_alloc_autoreap {
struct xfs_defer_pending *dfp;
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index aada676eee51..a4ac37ba5d51 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -28,7 +28,7 @@ xfs_bnobt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_bnobt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp,
- cur->bc_ag.pag);
+ to_perag(cur->bc_group));
}
STATIC struct xfs_btree_cur *
@@ -36,29 +36,29 @@ xfs_cntbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_cntbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp,
- cur->bc_ag.pag);
+ to_perag(cur->bc_group));
}
-
STATIC void
xfs_allocbt_set_root(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr,
int inc)
{
- struct xfs_buf *agbp = cur->bc_ag.agbp;
- struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
ASSERT(ptr->s != 0);
if (xfs_btree_is_bno(cur->bc_ops)) {
agf->agf_bno_root = ptr->s;
be32_add_cpu(&agf->agf_bno_level, inc);
- cur->bc_ag.pag->pagf_bno_level += inc;
+ pag->pagf_bno_level += inc;
} else {
agf->agf_cnt_root = ptr->s;
be32_add_cpu(&agf->agf_cnt_level, inc);
- cur->bc_ag.pag->pagf_cnt_level += inc;
+ pag->pagf_cnt_level += inc;
}
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
@@ -75,7 +75,7 @@ xfs_allocbt_alloc_block(
xfs_agblock_t bno;
/* Allocate the new block from the freelist. If we can't, give up. */
- error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp,
+ error = xfs_alloc_get_freelist(to_perag(cur->bc_group), cur->bc_tp,
cur->bc_ag.agbp, &bno, 1);
if (error)
return error;
@@ -86,7 +86,7 @@ xfs_allocbt_alloc_block(
}
atomic64_inc(&cur->bc_mp->m_allocbt_blks);
- xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false);
+ xfs_extent_busy_reuse(cur->bc_group, bno, 1, false);
new->s = cpu_to_be32(bno);
@@ -104,13 +104,13 @@ xfs_allocbt_free_block(
int error;
bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
- error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL,
- bno, 1);
+ error = xfs_alloc_put_freelist(to_perag(cur->bc_group), cur->bc_tp,
+ agbp, NULL, bno, 1);
if (error)
return error;
atomic64_dec(&cur->bc_mp->m_allocbt_blks);
- xfs_extent_busy_insert(cur->bc_tp, agbp->b_pag, bno, 1,
+ xfs_extent_busy_insert(cur->bc_tp, pag_group(agbp->b_pag), bno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
return 0;
}
@@ -178,7 +178,7 @@ xfs_allocbt_init_ptr_from_cur(
{
struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno));
if (xfs_btree_is_bno(cur->bc_ops))
ptr->s = agf->agf_bno_root;
@@ -492,7 +492,7 @@ xfs_bnobt_init_cursor(
cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bnobt_ops,
mp->m_alloc_maxlevels, xfs_allocbt_cur_cache);
- cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
cur->bc_ag.agbp = agbp;
if (agbp) {
struct xfs_agf *agf = agbp->b_addr;
@@ -518,7 +518,7 @@ xfs_cntbt_init_cursor(
cur = xfs_btree_alloc_cursor(mp, tp, &xfs_cntbt_ops,
mp->m_alloc_maxlevels, xfs_allocbt_cur_cache);
- cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
cur->bc_ag.agbp = agbp;
if (agbp) {
struct xfs_agf *agf = agbp->b_addr;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index c63da14eee04..8c04acd30d48 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -1004,7 +1004,8 @@ xfs_attr_add_fork(
unsigned int blks; /* space reservation */
int error; /* error return value */
- ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+ if (!xfs_is_metadir_inode(ip))
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
blks = XFS_ADDAFORK_SPACE_RES(mp);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 36dd08d13293..d954f9b8071f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -34,12 +34,13 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
-#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_health.h"
#include "xfs_bmap_item.h"
#include "xfs_symlink_remote.h"
#include "xfs_inode_util.h"
+#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_bmap_intent_cache;
@@ -170,18 +171,16 @@ xfs_bmbt_update(
* Compute the worst-case number of indirect blocks that will be used
* for ip's delayed extent of length "len".
*/
-STATIC xfs_filblks_t
+xfs_filblks_t
xfs_bmap_worst_indlen(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_filblks_t len) /* delayed extent length */
+ struct xfs_inode *ip, /* incore inode pointer */
+ xfs_filblks_t len) /* delayed extent length */
{
- int level; /* btree level number */
- int maxrecs; /* maximum record count at this level */
- xfs_mount_t *mp; /* mount structure */
- xfs_filblks_t rval; /* return value */
+ struct xfs_mount *mp = ip->i_mount;
+ int maxrecs = mp->m_bmap_dmxr[0];
+ int level;
+ xfs_filblks_t rval;
- mp = ip->i_mount;
- maxrecs = mp->m_bmap_dmxr[0];
for (level = 0, rval = 0;
level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
level++) {
@@ -614,7 +613,7 @@ xfs_bmap_btree_to_extents(
xfs_trans_binval(tp, cbp);
if (cur->bc_levels[0].bp == cbp)
cur->bc_levels[0].bp = NULL;
- xfs_iroot_realloc(ip, -1, whichfork);
+ xfs_bmap_broot_realloc(ip, whichfork, 0);
ASSERT(ifp->if_broot == NULL);
ifp->if_format = XFS_DINODE_FMT_EXTENTS;
*logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
@@ -658,12 +657,11 @@ xfs_bmap_extents_to_btree(
* Make space in the inode incore. This needs to be undone if we fail
* to expand the root.
*/
- xfs_iroot_realloc(ip, 1, whichfork);
+ block = xfs_bmap_broot_realloc(ip, whichfork, 1);
/*
* Fill in the root.
*/
- block = ifp->if_broot;
xfs_bmbt_init_block(ip, block, NULL, 1, 1);
/*
* Need a cursor. Can't allocate until bb_level is filled in.
@@ -745,7 +743,7 @@ xfs_bmap_extents_to_btree(
out_unreserve_dquot:
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
out_root_realloc:
- xfs_iroot_realloc(ip, -1, whichfork);
+ xfs_bmap_broot_realloc(ip, whichfork, 0);
ifp->if_format = XFS_DINODE_FMT_EXTENTS;
ASSERT(ifp->if_broot == NULL);
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -1042,7 +1040,8 @@ xfs_bmap_add_attrfork(
int error; /* error return value */
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+ if (!xfs_is_metadir_inode(ip))
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
ASSERT(!xfs_inode_has_attr_fork(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -1423,6 +1422,24 @@ xfs_bmap_last_offset(
* Extent tree manipulation functions used during allocation.
*/
+static inline bool
+xfs_bmap_same_rtgroup(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *left,
+ struct xfs_bmbt_irec *right)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (xfs_ifork_is_realtime(ip, whichfork) && xfs_has_rtgroups(mp)) {
+ if (xfs_rtb_to_rgno(mp, left->br_startblock) !=
+ xfs_rtb_to_rgno(mp, right->br_startblock))
+ return false;
+ }
+
+ return true;
+}
+
/*
* Convert a delayed allocation to a real allocation.
*/
@@ -1492,7 +1509,8 @@ xfs_bmap_add_extent_delay_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ xfs_bmap_same_rtgroup(bma->ip, whichfork, &LEFT, new))
state |= BMAP_LEFT_CONTIG;
/*
@@ -1516,7 +1534,8 @@ xfs_bmap_add_extent_delay_real(
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= XFS_MAX_BMBT_EXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN) &&
+ xfs_bmap_same_rtgroup(bma->ip, whichfork, new, &RIGHT))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -2061,7 +2080,8 @@ xfs_bmap_add_extent_unwritten_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ xfs_bmap_same_rtgroup(ip, whichfork, &LEFT, new))
state |= BMAP_LEFT_CONTIG;
/*
@@ -2085,7 +2105,8 @@ xfs_bmap_add_extent_unwritten_real(
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= XFS_MAX_BMBT_EXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN) &&
+ xfs_bmap_same_rtgroup(ip, whichfork, new, &RIGHT))
state |= BMAP_RIGHT_CONTIG;
/*
@@ -2549,146 +2570,6 @@ done:
}
/*
- * Convert a hole to a delayed allocation.
- */
-STATIC void
-xfs_bmap_add_extent_hole_delay(
- xfs_inode_t *ip, /* incore inode pointer */
- int whichfork,
- struct xfs_iext_cursor *icur,
- xfs_bmbt_irec_t *new) /* new data to add to file extents */
-{
- struct xfs_ifork *ifp; /* inode fork pointer */
- xfs_bmbt_irec_t left; /* left neighbor extent entry */
- xfs_filblks_t newlen=0; /* new indirect size */
- xfs_filblks_t oldlen=0; /* old indirect size */
- xfs_bmbt_irec_t right; /* right neighbor extent entry */
- uint32_t state = xfs_bmap_fork_to_state(whichfork);
- xfs_filblks_t temp; /* temp for indirect calculations */
-
- ifp = xfs_ifork_ptr(ip, whichfork);
- ASSERT(isnullstartblock(new->br_startblock));
-
- /*
- * Check and set flags if this segment has a left neighbor
- */
- if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
- state |= BMAP_LEFT_VALID;
- if (isnullstartblock(left.br_startblock))
- state |= BMAP_LEFT_DELAY;
- }
-
- /*
- * Check and set flags if the current (right) segment exists.
- * If it doesn't exist, we're converting the hole at end-of-file.
- */
- if (xfs_iext_get_extent(ifp, icur, &right)) {
- state |= BMAP_RIGHT_VALID;
- if (isnullstartblock(right.br_startblock))
- state |= BMAP_RIGHT_DELAY;
- }
-
- /*
- * Set contiguity flags on the left and right neighbors.
- * Don't let extents get too large, even if the pieces are contiguous.
- */
- if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
- left.br_startoff + left.br_blockcount == new->br_startoff &&
- left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
- state |= BMAP_LEFT_CONTIG;
-
- if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
- new->br_startoff + new->br_blockcount == right.br_startoff &&
- new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
- (!(state & BMAP_LEFT_CONTIG) ||
- (left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
- state |= BMAP_RIGHT_CONTIG;
-
- /*
- * Switch out based on the contiguity flags.
- */
- switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
- case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
- /*
- * New allocation is contiguous with delayed allocations
- * on the left and on the right.
- * Merge all three into a single extent record.
- */
- temp = left.br_blockcount + new->br_blockcount +
- right.br_blockcount;
-
- oldlen = startblockval(left.br_startblock) +
- startblockval(new->br_startblock) +
- startblockval(right.br_startblock);
- newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- oldlen);
- left.br_startblock = nullstartblock(newlen);
- left.br_blockcount = temp;
-
- xfs_iext_remove(ip, icur, state);
- xfs_iext_prev(ifp, icur);
- xfs_iext_update_extent(ip, state, icur, &left);
- break;
-
- case BMAP_LEFT_CONTIG:
- /*
- * New allocation is contiguous with a delayed allocation
- * on the left.
- * Merge the new allocation with the left neighbor.
- */
- temp = left.br_blockcount + new->br_blockcount;
-
- oldlen = startblockval(left.br_startblock) +
- startblockval(new->br_startblock);
- newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- oldlen);
- left.br_blockcount = temp;
- left.br_startblock = nullstartblock(newlen);
-
- xfs_iext_prev(ifp, icur);
- xfs_iext_update_extent(ip, state, icur, &left);
- break;
-
- case BMAP_RIGHT_CONTIG:
- /*
- * New allocation is contiguous with a delayed allocation
- * on the right.
- * Merge the new allocation with the right neighbor.
- */
- temp = new->br_blockcount + right.br_blockcount;
- oldlen = startblockval(new->br_startblock) +
- startblockval(right.br_startblock);
- newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- oldlen);
- right.br_startoff = new->br_startoff;
- right.br_startblock = nullstartblock(newlen);
- right.br_blockcount = temp;
- xfs_iext_update_extent(ip, state, icur, &right);
- break;
-
- case 0:
- /*
- * New allocation is not contiguous with another
- * delayed allocation.
- * Insert a new entry.
- */
- oldlen = newlen = 0;
- xfs_iext_insert(ip, icur, new, state);
- break;
- }
- if (oldlen != newlen) {
- ASSERT(oldlen > newlen);
- xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
-
- /*
- * Nothing to do for disk quota accounting here.
- */
- xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
- }
-}
-
-/*
* Convert a hole to a real allocation.
*/
STATIC int /* error */
@@ -2745,7 +2626,8 @@ xfs_bmap_add_extent_hole_real(
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_startblock + left.br_blockcount == new->br_startblock &&
left.br_state == new->br_state &&
- left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ xfs_bmap_same_rtgroup(ip, whichfork, &left, new))
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
@@ -2755,7 +2637,8 @@ xfs_bmap_add_extent_hole_real(
new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN) &&
+ xfs_bmap_same_rtgroup(ip, whichfork, new, &right))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -3121,8 +3004,15 @@ xfs_bmap_adjacent_valid(
struct xfs_mount *mp = ap->ip->i_mount;
if (XFS_IS_REALTIME_INODE(ap->ip) &&
- (ap->datatype & XFS_ALLOC_USERDATA))
- return x < mp->m_sb.sb_rblocks;
+ (ap->datatype & XFS_ALLOC_USERDATA)) {
+ if (!xfs_has_rtgroups(mp))
+ return x < mp->m_sb.sb_rblocks;
+
+ return xfs_rtb_to_rgno(mp, x) == xfs_rtb_to_rgno(mp, y) &&
+ xfs_rtb_to_rgno(mp, x) < mp->m_sb.sb_rgcount &&
+ xfs_rtb_to_rtx(mp, x) < mp->m_sb.sb_rgextents;
+
+ }
return XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) &&
XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount &&
@@ -3280,7 +3170,7 @@ xfs_bmap_longest_free_extent(
}
longest = xfs_alloc_longest_free_extent(pag,
- xfs_alloc_min_freelist(pag->pag_mount, pag),
+ xfs_alloc_min_freelist(pag_mount(pag), pag),
xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
if (*blen < longest)
*blen = longest;
@@ -3422,6 +3312,11 @@ xfs_bmap_compute_alignments(
align = xfs_get_cowextsz_hint(ap->ip);
else if (ap->datatype & XFS_ALLOC_USERDATA)
align = xfs_get_extsz_hint(ap->ip);
+
+ /* Try to align start block to any minimum allocation alignment */
+ if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN))
+ args->alignment = align;
+
if (align) {
if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
ap->eof, 0, ap->conv, &ap->offset,
@@ -3531,12 +3426,12 @@ xfs_bmap_btalloc_at_eof(
int error;
/*
- * If there are already extents in the file, try an exact EOF block
- * allocation to extend the file as a contiguous extent. If that fails,
- * or it's the first allocation in a file, just try for a stripe aligned
- * allocation.
+ * If there are already extents in the file, and xfs_bmap_adjacent() has
+ * given a better blkno, try an exact EOF block allocation to extend the
+ * file as a contiguous extent. If that fails, or it's the first
+ * allocation in a file, just try for a stripe aligned allocation.
*/
- if (ap->offset) {
+ if (ap->eof) {
xfs_extlen_t nextminlen = 0;
/*
@@ -3704,7 +3599,8 @@ xfs_bmap_btalloc_best_length(
int error;
ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino);
- xfs_bmap_adjacent(ap);
+ if (!xfs_bmap_adjacent(ap))
+ ap->eof = false;
/*
* Search for an allocation group with a single extent large enough for
@@ -4006,144 +3902,6 @@ xfs_bmapi_read(
return 0;
}
-/*
- * Add a delayed allocation extent to an inode. Blocks are reserved from the
- * global pool and the extent inserted into the inode in-core extent tree.
- *
- * On entry, got refers to the first extent beyond the offset of the extent to
- * allocate or eof is specified if no such extent exists. On return, got refers
- * to the extent record that was inserted to the inode fork.
- *
- * Note that the allocated extent may have been merged with contiguous extents
- * during insertion into the inode fork. Thus, got does not reflect the current
- * state of the inode fork on return. If necessary, the caller can use lastx to
- * look up the updated record in the inode fork.
- */
-int
-xfs_bmapi_reserve_delalloc(
- struct xfs_inode *ip,
- int whichfork,
- xfs_fileoff_t off,
- xfs_filblks_t len,
- xfs_filblks_t prealloc,
- struct xfs_bmbt_irec *got,
- struct xfs_iext_cursor *icur,
- int eof)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
- xfs_extlen_t alen;
- xfs_extlen_t indlen;
- uint64_t fdblocks;
- int error;
- xfs_fileoff_t aoff;
- bool use_cowextszhint =
- whichfork == XFS_COW_FORK && !prealloc;
-
-retry:
- /*
- * Cap the alloc length. Keep track of prealloc so we know whether to
- * tag the inode before we return.
- */
- aoff = off;
- alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
- if (!eof)
- alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
- if (prealloc && alen >= len)
- prealloc = alen - len;
-
- /*
- * If we're targetting the COW fork but aren't creating a speculative
- * posteof preallocation, try to expand the reservation to align with
- * the COW extent size hint if there's sufficient free space.
- *
- * Unlike the data fork, the CoW cancellation functions will free all
- * the reservations at inactivation, so we don't require that every
- * delalloc reservation have a dirty pagecache.
- */
- if (use_cowextszhint) {
- struct xfs_bmbt_irec prev;
- xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
-
- if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
- prev.br_startoff = NULLFILEOFF;
-
- error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
- 1, 0, &aoff, &alen);
- ASSERT(!error);
- }
-
- /*
- * Make a transaction-less quota reservation for delayed allocation
- * blocks. This number gets adjusted later. We return if we haven't
- * allocated blocks already inside this loop.
- */
- error = xfs_quota_reserve_blkres(ip, alen);
- if (error)
- goto out;
-
- /*
- * Split changing sb for alen and indlen since they could be coming
- * from different places.
- */
- indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
- ASSERT(indlen > 0);
-
- fdblocks = indlen;
- if (XFS_IS_REALTIME_INODE(ip)) {
- error = xfs_dec_frextents(mp, xfs_rtb_to_rtx(mp, alen));
- if (error)
- goto out_unreserve_quota;
- } else {
- fdblocks += alen;
- }
-
- error = xfs_dec_fdblocks(mp, fdblocks, false);
- if (error)
- goto out_unreserve_frextents;
-
- ip->i_delayed_blks += alen;
- xfs_mod_delalloc(ip, alen, indlen);
-
- got->br_startoff = aoff;
- got->br_startblock = nullstartblock(indlen);
- got->br_blockcount = alen;
- got->br_state = XFS_EXT_NORM;
-
- xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
-
- /*
- * Tag the inode if blocks were preallocated. Note that COW fork
- * preallocation can occur at the start or end of the extent, even when
- * prealloc == 0, so we must also check the aligned offset and length.
- */
- if (whichfork == XFS_DATA_FORK && prealloc)
- xfs_inode_set_eofblocks_tag(ip);
- if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
- xfs_inode_set_cowblocks_tag(ip);
-
- return 0;
-
-out_unreserve_frextents:
- if (XFS_IS_REALTIME_INODE(ip))
- xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, alen));
-out_unreserve_quota:
- if (XFS_IS_QUOTA_ON(mp))
- xfs_quota_unreserve_blkres(ip, alen);
-out:
- if (error == -ENOSPC || error == -EDQUOT) {
- trace_xfs_delalloc_enospc(ip, off, len);
-
- if (prealloc || use_cowextszhint) {
- /* retry without any preallocation */
- use_cowextszhint = false;
- prealloc = 0;
- goto retry;
- }
- }
- return error;
-}
-
static int
xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
@@ -4532,8 +4290,9 @@ xfs_bmapi_write(
* the refcount btree for orphan recovery.
*/
if (whichfork == XFS_COW_FORK)
- xfs_refcount_alloc_cow_extent(tp, bma.blkno,
- bma.length);
+ xfs_refcount_alloc_cow_extent(tp,
+ XFS_IS_REALTIME_INODE(ip),
+ bma.blkno, bma.length);
}
/* Deal with the allocated space we found. */
@@ -4708,7 +4467,8 @@ xfs_bmapi_convert_one_delalloc(
*seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
- xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
+ xfs_refcount_alloc_cow_extent(tp, XFS_IS_REALTIME_INODE(ip),
+ bma.blkno, bma.length);
error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
whichfork);
@@ -4913,7 +4673,8 @@ xfs_bmap_del_extent_delay(
int whichfork,
struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *got,
- struct xfs_bmbt_irec *del)
+ struct xfs_bmbt_irec *del,
+ uint32_t bflags) /* bmapi flags */
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -5033,10 +4794,18 @@ xfs_bmap_del_extent_delay(
da_diff = da_old - da_new;
fdblocks = da_diff;
- if (isrt)
- xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
- else
+ if (bflags & XFS_BMAPI_REMAP) {
+ ;
+ } else if (isrt) {
+ xfs_rtbxlen_t rtxlen;
+
+ rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_add_available(mp, rtxlen);
+ xfs_add_frextents(mp, rtxlen);
+ } else {
fdblocks += del->br_blockcount;
+ }
xfs_add_fdblocks(mp, fdblocks);
xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
@@ -5113,6 +4882,34 @@ xfs_bmap_del_extent_cow(
ip->i_delayed_blks -= del->br_blockcount;
}
+static int
+xfs_bmap_free_rtblocks(
+ struct xfs_trans *tp,
+ struct xfs_bmbt_irec *del)
+{
+ struct xfs_rtgroup *rtg;
+ int error;
+
+ rtg = xfs_rtgroup_grab(tp->t_mountp, 0);
+ if (!rtg)
+ return -EIO;
+
+ /*
+ * Ensure the bitmap and summary inodes are locked and joined to the
+ * transaction before modifying them.
+ */
+ if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) {
+ tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED;
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_BITMAP);
+ }
+
+ error = xfs_rtfree_blocks(tp, rtg, del->br_startblock,
+ del->br_blockcount);
+ xfs_rtgroup_rele(rtg);
+ return error;
+}
+
/*
* Called by xfs_bmapi to update file extent records and the btree
* after removing space.
@@ -5325,20 +5122,12 @@ xfs_bmap_del_extent_real(
* If we need to, add to list of extents to delete.
*/
if (!(bflags & XFS_BMAPI_REMAP)) {
+ bool isrt = xfs_ifork_is_realtime(ip, whichfork);
+
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
- xfs_refcount_decrease_extent(tp, del);
- } else if (xfs_ifork_is_realtime(ip, whichfork)) {
- /*
- * Ensure the bitmap and summary inodes are locked
- * and joined to the transaction before modifying them.
- */
- if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) {
- tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED;
- xfs_rtbitmap_lock(mp);
- xfs_rtbitmap_trans_join(tp);
- }
- error = xfs_rtfree_blocks(tp, del->br_startblock,
- del->br_blockcount);
+ xfs_refcount_decrease_extent(tp, isrt, del);
+ } else if (isrt && !xfs_has_rtgroups(mp)) {
+ error = xfs_bmap_free_rtblocks(tp, del);
} else {
unsigned int efi_flags = 0;
@@ -5346,6 +5135,19 @@ xfs_bmap_del_extent_real(
del->br_state == XFS_EXT_UNWRITTEN)
efi_flags |= XFS_FREE_EXTENT_SKIP_DISCARD;
+ /*
+ * Historically, we did not use EFIs to free realtime
+ * extents. However, when reverse mapping is enabled,
+ * we must maintain the same order of operations as the
+ * data device, which is: Remove the file mapping,
+ * remove the reverse mapping, and then free the
+ * blocks. Reflink for realtime volumes requires the
+ * same sort of ordering. Both features rely on
+ * rtgroups, so let's gate rt EFI usage on rtgroups.
+ */
+ if (isrt)
+ efi_flags |= XFS_FREE_EXTENT_REALTIME;
+
error = xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
XFS_AG_RESV_NONE, efi_flags);
@@ -5602,7 +5404,8 @@ __xfs_bunmapi(
delete:
if (wasdel) {
- xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
+ &del, flags);
} else {
error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
&del, &tmp_logflags, whichfork,
@@ -5694,6 +5497,8 @@ xfs_bunmapi(
*/
STATIC bool
xfs_bmse_can_merge(
+ struct xfs_inode *ip,
+ int whichfork,
struct xfs_bmbt_irec *left, /* preceding extent */
struct xfs_bmbt_irec *got, /* current extent to shift */
xfs_fileoff_t shift) /* shift fsb */
@@ -5709,7 +5514,8 @@ xfs_bmse_can_merge(
if ((left->br_startoff + left->br_blockcount != startoff) ||
(left->br_startblock + left->br_blockcount != got->br_startblock) ||
(left->br_state != got->br_state) ||
- (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN))
+ (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN) ||
+ !xfs_bmap_same_rtgroup(ip, whichfork, left, got))
return false;
return true;
@@ -5745,7 +5551,7 @@ xfs_bmse_merge(
blockcount = left->br_blockcount + got->br_blockcount;
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
- ASSERT(xfs_bmse_can_merge(left, got, shift));
+ ASSERT(xfs_bmse_can_merge(ip, whichfork, left, got, shift));
new = *left;
new.br_blockcount = blockcount;
@@ -5907,7 +5713,8 @@ xfs_bmap_collapse_extents(
goto del_cursor;
}
- if (xfs_bmse_can_merge(&prev, &got, offset_shift_fsb)) {
+ if (xfs_bmse_can_merge(ip, whichfork, &prev, &got,
+ offset_shift_fsb)) {
error = xfs_bmse_merge(tp, ip, whichfork,
offset_shift_fsb, &icur, &got, &prev,
cur, &logflags);
@@ -6043,7 +5850,8 @@ xfs_bmap_insert_extents(
* never find mergeable extents in this scenario. Check anyways
* and warn if we encounter two extents that could be one.
*/
- if (xfs_bmse_can_merge(&got, &next, offset_shift_fsb))
+ if (xfs_bmse_can_merge(ip, whichfork, &got, &next,
+ offset_shift_fsb))
WARN_ON_ONCE(1);
}
@@ -6428,9 +6236,8 @@ xfs_get_extsz_hint(
* No point in aligning allocations if we need to COW to actually
* write to them.
*/
- if (xfs_is_always_cow_inode(ip))
- return 0;
- if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
+ if (!xfs_is_always_cow_inode(ip) &&
+ (ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
return ip->i_extsize;
if (XFS_IS_REALTIME_INODE(ip) &&
ip->i_mount->m_sb.sb_rextsize > 1)
@@ -6453,7 +6260,13 @@ xfs_get_cowextsz_hint(
a = 0;
if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
a = ip->i_cowextsize;
- b = xfs_get_extsz_hint(ip);
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ b = 0;
+ if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
+ b = ip->i_extsize;
+ } else {
+ b = xfs_get_extsz_hint(ip);
+ }
a = max(a, b);
if (a == 0)
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 7592d46e97c6..d5f2729305fa 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -87,6 +87,9 @@ struct xfs_bmalloca {
/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
#define XFS_BMAPI_NORMAP (1u << 10)
+/* Try to align allocations to the extent size hint */
+#define XFS_BMAPI_EXTSZALIGN (1u << 11)
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -98,7 +101,8 @@ struct xfs_bmalloca {
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
{ XFS_BMAPI_NODISCARD, "NODISCARD" }, \
- { XFS_BMAPI_NORMAP, "NORMAP" }
+ { XFS_BMAPI_NORMAP, "NORMAP" },\
+ { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" }
static inline int xfs_bmapi_aflag(int w)
@@ -204,7 +208,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extnum_t nexts, int *done);
void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
- struct xfs_bmbt_irec *del);
+ struct xfs_bmbt_irec *del, uint32_t bflags);
void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del);
@@ -219,10 +223,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
bool *done, xfs_fileoff_t stop_fsb);
int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t split_offset);
-int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
- xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
- struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
- int eof);
int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
@@ -233,6 +233,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip,
int fork);
int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap,
struct xfs_alloc_arg *args);
+xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len);
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
@@ -248,7 +249,7 @@ struct xfs_bmap_intent {
enum xfs_bmap_intent_type bi_type;
int bi_whichfork;
struct xfs_inode *bi_owner;
- struct xfs_perag *bi_pag;
+ struct xfs_group *bi_group;
struct xfs_bmbt_irec bi_bmap;
};
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 3464be771f95..908d7b050e9c 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -516,6 +516,116 @@ xfs_bmbt_keys_contiguous(
be64_to_cpu(key2->bmbt.br_startoff));
}
+static inline void
+xfs_bmbt_move_ptrs(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *broot,
+ short old_size,
+ size_t new_size,
+ unsigned int numrecs)
+{
+ void *dptr;
+ void *sptr;
+
+ sptr = xfs_bmap_broot_ptr_addr(mp, broot, 1, old_size);
+ dptr = xfs_bmap_broot_ptr_addr(mp, broot, 1, new_size);
+ memmove(dptr, sptr, numrecs * sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Reallocate the space for if_broot based on the number of records. Move the
+ * records and pointers in if_broot to fit the new size. When shrinking this
+ * will eliminate holes between the records and pointers created by the caller.
+ * When growing this will create holes to be filled in by the caller.
+ *
+ * The caller must not request to add more records than would fit in the
+ * on-disk inode root. If the if_broot is currently NULL, then if we are
+ * adding records, one will be allocated. The caller must also not request
+ * that the number of records go below zero, although it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * whichfork -- which inode fork to change
+ * new_numrecs -- the new number of records requested for the if_broot array
+ *
+ * Returns the incore btree root block.
+ */
+struct xfs_btree_block *
+xfs_bmap_broot_realloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ unsigned int new_numrecs)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_block *broot;
+ unsigned int new_size;
+ unsigned int old_size = ifp->if_broot_bytes;
+
+ /*
+ * Block mapping btrees do not support storing zero records; if this
+ * happens, the fork is being changed to FMT_EXTENTS. Free the broot
+ * and get out.
+ */
+ if (new_numrecs == 0)
+ return xfs_broot_realloc(ifp, 0);
+
+ new_size = xfs_bmap_broot_space_calc(mp, new_numrecs);
+
+ /* Handle the nop case quietly. */
+ if (new_size == old_size)
+ return ifp->if_broot;
+
+ if (new_size > old_size) {
+ unsigned int old_numrecs;
+
+ /*
+ * If there wasn't any memory allocated before, just
+ * allocate it now and get out.
+ */
+ if (old_size == 0)
+ return xfs_broot_realloc(ifp, new_size);
+
+ /*
+ * If there is already an existing if_broot, then we need
+ * to realloc() it and shift the pointers to their new
+ * location. The records don't change location because
+ * they are kept butted up against the btree block header.
+ */
+ old_numrecs = xfs_bmbt_maxrecs(mp, old_size, false);
+ broot = xfs_broot_realloc(ifp, new_size);
+ ASSERT(xfs_bmap_bmdr_space(broot) <=
+ xfs_inode_fork_size(ip, whichfork));
+ xfs_bmbt_move_ptrs(mp, broot, old_size, new_size, old_numrecs);
+ return broot;
+ }
+
+ /*
+ * We're reducing, but not totally eliminating, numrecs. In this case,
+ * we are shrinking the if_broot buffer, so it must already exist.
+ */
+ ASSERT(ifp->if_broot != NULL && old_size > 0 && new_size > 0);
+
+ /*
+ * Shrink the btree root by moving the bmbt pointers, since they are
+ * not butted up against the btree block header, then reallocating
+ * broot.
+ */
+ xfs_bmbt_move_ptrs(mp, ifp->if_broot, old_size, new_size, new_numrecs);
+ broot = xfs_broot_realloc(ifp, new_size);
+ ASSERT(xfs_bmap_bmdr_space(broot) <=
+ xfs_inode_fork_size(ip, whichfork));
+ return broot;
+}
+
+static struct xfs_btree_block *
+xfs_bmbt_broot_realloc(
+ struct xfs_btree_cur *cur,
+ unsigned int new_numrecs)
+{
+ return xfs_bmap_broot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork,
+ new_numrecs);
+}
+
const struct xfs_btree_ops xfs_bmbt_ops = {
.name = "bmap",
.type = XFS_BTREE_TYPE_INODE,
@@ -543,6 +653,7 @@ const struct xfs_btree_ops xfs_bmbt_ops = {
.keys_inorder = xfs_bmbt_keys_inorder,
.recs_inorder = xfs_bmbt_recs_inorder,
.keys_contiguous = xfs_bmbt_keys_contiguous,
+ .broot_realloc = xfs_bmbt_broot_realloc,
};
/*
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 49a3bae3f6ec..b238d559ab03 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -198,4 +198,7 @@ xfs_bmap_bmdr_space(struct xfs_btree_block *bb)
return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs));
}
+struct xfs_btree_block *xfs_bmap_broot_realloc(struct xfs_inode *ip,
+ int whichfork, unsigned int new_numrecs);
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index a5c4af148853..299ce7fd11b0 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -30,6 +30,12 @@
#include "xfs_health.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_rmap.h"
+#include "xfs_quota.h"
+#include "xfs_metafile.h"
+#include "xfs_rtrefcount_btree.h"
/*
* Btree magic numbers.
@@ -225,7 +231,7 @@ __xfs_btree_check_agblock(
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
- struct xfs_perag *pag = cur->bc_ag.pag;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
xfs_failaddr_t fa;
xfs_agblock_t agbno;
@@ -331,7 +337,7 @@ __xfs_btree_check_ptr(
return -EFSCORRUPTED;
break;
case XFS_BTREE_TYPE_AG:
- if (!xfs_verify_agbno(cur->bc_ag.pag,
+ if (!xfs_verify_agbno(to_perag(cur->bc_group),
be32_to_cpu((&ptr->s)[index])))
return -EFSCORRUPTED;
break;
@@ -372,7 +378,7 @@ xfs_btree_check_ptr(
case XFS_BTREE_TYPE_AG:
xfs_err(cur->bc_mp,
"AG %u: Corrupt %sbt pointer at level %d index %d.",
- cur->bc_ag.pag->pag_agno, cur->bc_ops->name,
+ cur->bc_group->xg_gno, cur->bc_ops->name,
level, index);
break;
}
@@ -523,20 +529,8 @@ xfs_btree_del_cursor(
ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 ||
xfs_is_shutdown(cur->bc_mp) || error != 0);
- switch (cur->bc_ops->type) {
- case XFS_BTREE_TYPE_AG:
- if (cur->bc_ag.pag)
- xfs_perag_put(cur->bc_ag.pag);
- break;
- case XFS_BTREE_TYPE_INODE:
- /* nothing to do */
- break;
- case XFS_BTREE_TYPE_MEM:
- if (cur->bc_mem.pag)
- xfs_perag_put(cur->bc_mem.pag);
- break;
- }
-
+ if (cur->bc_group)
+ xfs_group_put(cur->bc_group);
kmem_cache_free(cur->bc_cache, cur);
}
@@ -1017,22 +1011,22 @@ xfs_btree_readahead_agblock(
struct xfs_btree_block *block)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
int rval = 0;
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
xfs_buf_readahead(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, left),
- mp->m_bsize, cur->bc_ops->buf_ops);
+ xfs_agbno_to_daddr(pag, left), mp->m_bsize,
+ cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
xfs_buf_readahead(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, right),
- mp->m_bsize, cur->bc_ops->buf_ops);
+ xfs_agbno_to_daddr(pag, right), mp->m_bsize,
+ cur->bc_ops->buf_ops);
rval++;
}
@@ -1091,7 +1085,7 @@ xfs_btree_ptr_to_daddr(
switch (cur->bc_ops->type) {
case XFS_BTREE_TYPE_AG:
- *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ *daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group),
be32_to_cpu(ptr->s));
break;
case XFS_BTREE_TYPE_INODE:
@@ -1313,7 +1307,7 @@ xfs_btree_owner(
case XFS_BTREE_TYPE_INODE:
return cur->bc_ino.ip->i_ino;
case XFS_BTREE_TYPE_AG:
- return cur->bc_ag.pag->pag_agno;
+ return cur->bc_group->xg_gno;
default:
ASSERT(0);
return 0;
@@ -1549,12 +1543,16 @@ xfs_btree_log_recs(
int first,
int last)
{
+ if (!bp) {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
+ return;
+ }
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp,
xfs_btree_rec_offset(cur, first),
xfs_btree_rec_offset(cur, last + 1) - 1);
-
}
/*
@@ -3090,6 +3088,131 @@ xfs_btree_split(
#define xfs_btree_split __xfs_btree_split
#endif /* __KERNEL__ */
+/* Move the records from a root leaf block to a separate block. */
+STATIC void
+xfs_btree_promote_leaf_iroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ struct xfs_buf *cbp,
+ union xfs_btree_ptr *cptr,
+ struct xfs_btree_block *cblock)
+{
+ union xfs_btree_rec *rp;
+ union xfs_btree_rec *crp;
+ union xfs_btree_key *kp;
+ union xfs_btree_ptr *pp;
+ struct xfs_btree_block *broot;
+ int numrecs = xfs_btree_get_numrecs(block);
+
+ /* Copy the records from the leaf broot into the new child block. */
+ rp = xfs_btree_rec_addr(cur, 1, block);
+ crp = xfs_btree_rec_addr(cur, 1, cblock);
+ xfs_btree_copy_recs(cur, crp, rp, numrecs);
+
+ /*
+ * Increment the tree height.
+ *
+ * Trickery here: The amount of memory that we need per record for the
+ * ifork's btree root block may change when we convert the broot from a
+ * leaf to a node block. Free the existing leaf broot so that nobody
+ * thinks we need to migrate node pointers when we realloc the broot
+ * buffer after bumping nlevels.
+ */
+ cur->bc_ops->broot_realloc(cur, 0);
+ cur->bc_nlevels++;
+ cur->bc_levels[1].ptr = 1;
+
+ /*
+ * Allocate a new node broot and initialize it to point to the new
+ * child block.
+ */
+ broot = cur->bc_ops->broot_realloc(cur, 1);
+ xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops,
+ cur->bc_nlevels - 1, 1, cur->bc_ino.ip->i_ino);
+
+ pp = xfs_btree_ptr_addr(cur, 1, broot);
+ kp = xfs_btree_key_addr(cur, 1, broot);
+ xfs_btree_copy_ptrs(cur, pp, cptr, 1);
+ xfs_btree_get_keys(cur, cblock, kp);
+
+ /* Attach the new block to the cursor and log it. */
+ xfs_btree_setbuf(cur, 0, cbp);
+ xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+ xfs_btree_log_recs(cur, cbp, 1, numrecs);
+}
+
+/*
+ * Move the keys and pointers from a root block to a separate block.
+ *
+ * Since the keyptr size does not change, all we have to do is increase the
+ * tree height, copy the keyptrs to the new internal node (cblock), shrink
+ * the root, and copy the pointers there.
+ */
+STATIC int
+xfs_btree_promote_node_iroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *cbp,
+ union xfs_btree_ptr *cptr,
+ struct xfs_btree_block *cblock)
+{
+ union xfs_btree_key *ckp;
+ union xfs_btree_key *kp;
+ union xfs_btree_ptr *cpp;
+ union xfs_btree_ptr *pp;
+ int i;
+ int error;
+ int numrecs = xfs_btree_get_numrecs(block);
+
+ /*
+ * Increase tree height, adjusting the root block level to match.
+ * We cannot change the root btree node size until we've copied the
+ * block contents to the new child block.
+ */
+ be16_add_cpu(&block->bb_level, 1);
+ cur->bc_nlevels++;
+ cur->bc_levels[level + 1].ptr = 1;
+
+ /*
+ * Adjust the root btree record count, then copy the keys from the old
+ * root to the new child block.
+ */
+ xfs_btree_set_numrecs(block, 1);
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, ckp, kp, numrecs);
+
+ /* Check the pointers and copy them to the new child block. */
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+ for (i = 0; i < numrecs; i++) {
+ error = xfs_btree_debug_check_ptr(cur, pp, i, level);
+ if (error)
+ return error;
+ }
+ xfs_btree_copy_ptrs(cur, cpp, pp, numrecs);
+
+ /*
+ * Set the first keyptr to point to the new child block, then shrink
+ * the memory buffer for the root block.
+ */
+ error = xfs_btree_debug_check_ptr(cur, cptr, 0, level);
+ if (error)
+ return error;
+ xfs_btree_copy_ptrs(cur, pp, cptr, 1);
+ xfs_btree_get_keys(cur, cblock, kp);
+
+ cur->bc_ops->broot_realloc(cur, 1);
+
+ /* Attach the new block to the cursor and log it. */
+ xfs_btree_setbuf(cur, level, cbp);
+ xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+ xfs_btree_log_keys(cur, cbp, 1, numrecs);
+ xfs_btree_log_ptrs(cur, cbp, 1, numrecs);
+ return 0;
+}
+
/*
* Copy the old inode root contents into a real block and make the
* broot point to it.
@@ -3103,14 +3226,10 @@ xfs_btree_new_iroot(
struct xfs_buf *cbp; /* buffer for cblock */
struct xfs_btree_block *block; /* btree block */
struct xfs_btree_block *cblock; /* child btree block */
- union xfs_btree_key *ckp; /* child key pointer */
- union xfs_btree_ptr *cpp; /* child ptr pointer */
- union xfs_btree_key *kp; /* pointer to btree key */
- union xfs_btree_ptr *pp; /* pointer to block addr */
+ union xfs_btree_ptr aptr;
union xfs_btree_ptr nptr; /* new block addr */
int level; /* btree level */
int error; /* error return code */
- int i; /* loop counter */
XFS_BTREE_STATS_INC(cur, newroot);
@@ -3119,10 +3238,15 @@ xfs_btree_new_iroot(
level = cur->bc_nlevels - 1;
block = xfs_btree_get_iroot(cur);
- pp = xfs_btree_ptr_addr(cur, 1, block);
+ ASSERT(level > 0 || (cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS));
+ if (level > 0)
+ aptr = *xfs_btree_ptr_addr(cur, 1, block);
+ else
+ aptr.l = cpu_to_be64(XFS_INO_TO_FSB(cur->bc_mp,
+ cur->bc_ino.ip->i_ino));
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = xfs_btree_alloc_block(cur, pp, &nptr, stat);
+ error = xfs_btree_alloc_block(cur, &aptr, &nptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -3148,47 +3272,16 @@ xfs_btree_new_iroot(
cblock->bb_u.s.bb_blkno = bno;
}
- be16_add_cpu(&block->bb_level, 1);
- xfs_btree_set_numrecs(block, 1);
- cur->bc_nlevels++;
- ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
- cur->bc_levels[level + 1].ptr = 1;
-
- kp = xfs_btree_key_addr(cur, 1, block);
- ckp = xfs_btree_key_addr(cur, 1, cblock);
- xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
-
- cpp = xfs_btree_ptr_addr(cur, 1, cblock);
- for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
- error = xfs_btree_debug_check_ptr(cur, pp, i, level);
+ if (level > 0) {
+ error = xfs_btree_promote_node_iroot(cur, block, level, cbp,
+ &nptr, cblock);
if (error)
goto error0;
+ } else {
+ xfs_btree_promote_leaf_iroot(cur, block, cbp, &nptr, cblock);
}
- xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
-
- error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
- if (error)
- goto error0;
-
- xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
-
- xfs_iroot_realloc(cur->bc_ino.ip,
- 1 - xfs_btree_get_numrecs(cblock),
- cur->bc_ino.whichfork);
-
- xfs_btree_setbuf(cur, level, cbp);
-
- /*
- * Do all this logging at the end so that
- * the root is at the right level.
- */
- xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
- xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
- xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
-
- *logflags |=
- XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
+ *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
*stat = 1;
return 0;
error0:
@@ -3359,7 +3452,7 @@ xfs_btree_make_block_unfull(
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
/* A root block that can be made bigger. */
- xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
+ cur->bc_ops->broot_realloc(cur, numrecs + 1);
*stat = 1;
} else {
/* A root block that needs replacing */
@@ -3569,14 +3662,31 @@ xfs_btree_insrec(
xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
/*
- * If we just inserted into a new tree block, we have to
- * recalculate nkey here because nkey is out of date.
+ * Update btree keys to reflect the newly added record or keyptr.
+ * There are three cases here to be aware of. Normally, all we have to
+ * do is walk towards the root, updating keys as necessary.
+ *
+ * If the caller had us target a full block for the insertion, we dealt
+ * with that by calling the _make_block_unfull function. If the
+ * "make unfull" function splits the block, it'll hand us back the key
+ * and pointer of the new block. We haven't yet added the new block to
+ * the next level up, so if we decide to add the new record to the new
+ * block (bp->b_bn != old_bn), we have to update the caller's pointer
+ * so that the caller adds the new block with the correct key.
*
- * Otherwise we're just updating an existing block (having shoved
- * some records into the new tree block), so use the regular key
- * update mechanism.
+ * However, there is a third possibility-- if the selected block is the
+ * root block of an inode-rooted btree and cannot be expanded further,
+ * the "make unfull" function moves the root block contents to a new
+ * block and updates the root block to point to the new block. In this
+ * case, no block pointer is passed back because the block has already
+ * been added to the btree. In this case, we need to use the regular
+ * key update function, just like the first case. This is critical for
+ * overlapping btrees, because the high key must be updated to reflect
+ * the entire tree, not just the subtree accessible through the first
+ * child of the root (which is now two levels down from the root).
*/
- if (bp && xfs_buf_daddr(bp) != old_bn) {
+ if (!xfs_btree_ptr_is_null(cur, &nptr) &&
+ bp && xfs_buf_daddr(bp) != old_bn) {
xfs_btree_get_keys(cur, block, lkey);
} else if (xfs_btree_needs_key_update(cur, optr)) {
error = xfs_btree_update_keys(cur, level);
@@ -3688,6 +3798,97 @@ error0:
return error;
}
+/* Move the records from a child leaf block to the root block. */
+STATIC void
+xfs_btree_demote_leaf_child(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *cblock,
+ int numrecs)
+{
+ union xfs_btree_rec *rp;
+ union xfs_btree_rec *crp;
+ struct xfs_btree_block *broot;
+
+ /*
+ * Decrease the tree height.
+ *
+ * Trickery here: The amount of memory that we need per record for the
+ * ifork's btree root block may change when we convert the broot from a
+ * node to a leaf. Free the old node broot so that we can get a fresh
+ * leaf broot.
+ */
+ cur->bc_ops->broot_realloc(cur, 0);
+ cur->bc_nlevels--;
+
+ /*
+ * Allocate a new leaf broot and copy the records from the old child.
+ * Detach the old child from the cursor.
+ */
+ broot = cur->bc_ops->broot_realloc(cur, numrecs);
+ xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, 0, numrecs,
+ cur->bc_ino.ip->i_ino);
+
+ rp = xfs_btree_rec_addr(cur, 1, broot);
+ crp = xfs_btree_rec_addr(cur, 1, cblock);
+ xfs_btree_copy_recs(cur, rp, crp, numrecs);
+
+ cur->bc_levels[0].bp = NULL;
+}
+
+/*
+ * Move the keyptrs from a child node block to the root block.
+ *
+ * Since the keyptr size does not change, all we have to do is increase the
+ * tree height, copy the keyptrs to the new internal node (cblock), shrink
+ * the root, and copy the pointers there.
+ */
+STATIC int
+xfs_btree_demote_node_child(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *cblock,
+ int level,
+ int numrecs)
+{
+ struct xfs_btree_block *block;
+ union xfs_btree_key *ckp;
+ union xfs_btree_key *kp;
+ union xfs_btree_ptr *cpp;
+ union xfs_btree_ptr *pp;
+ int i;
+ int error;
+
+ /*
+ * Adjust the root btree node size and the record count to match the
+ * doomed child so that we can copy the keyptrs ahead of changing the
+ * tree shape.
+ */
+ block = cur->bc_ops->broot_realloc(cur, numrecs);
+
+ xfs_btree_set_numrecs(block, numrecs);
+ ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+ /* Copy keys from the doomed block. */
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+ /* Copy pointers from the doomed block. */
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+ for (i = 0; i < numrecs; i++) {
+ error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
+ if (error)
+ return error;
+ }
+ xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+ /* Decrease tree height, adjusting the root block level to match. */
+ cur->bc_levels[level - 1].bp = NULL;
+ be16_add_cpu(&block->bb_level, -1);
+ cur->bc_nlevels--;
+ return 0;
+}
+
/*
* Try to merge a non-leaf block back into the inode root.
*
@@ -3700,34 +3901,31 @@ STATIC int
xfs_btree_kill_iroot(
struct xfs_btree_cur *cur)
{
- int whichfork = cur->bc_ino.whichfork;
struct xfs_inode *ip = cur->bc_ino.ip;
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_block *block;
struct xfs_btree_block *cblock;
- union xfs_btree_key *kp;
- union xfs_btree_key *ckp;
- union xfs_btree_ptr *pp;
- union xfs_btree_ptr *cpp;
struct xfs_buf *cbp;
int level;
- int index;
int numrecs;
int error;
#ifdef DEBUG
union xfs_btree_ptr ptr;
#endif
- int i;
ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
- ASSERT(cur->bc_nlevels > 1);
+ ASSERT((cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS) ||
+ cur->bc_nlevels > 1);
/*
* Don't deal with the root block needs to be a leaf case.
* We're just going to turn the thing back into extents anyway.
*/
level = cur->bc_nlevels - 1;
- if (level == 1)
+ if (level == 1 && !(cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS))
+ goto out0;
+
+ /* If we're already a leaf, jump out. */
+ if (level == 0)
goto out0;
/*
@@ -3757,40 +3955,20 @@ xfs_btree_kill_iroot(
ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
#endif
- index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
- if (index) {
- xfs_iroot_realloc(cur->bc_ino.ip, index,
- cur->bc_ino.whichfork);
- block = ifp->if_broot;
- }
-
- be16_add_cpu(&block->bb_numrecs, index);
- ASSERT(block->bb_numrecs == cblock->bb_numrecs);
-
- kp = xfs_btree_key_addr(cur, 1, block);
- ckp = xfs_btree_key_addr(cur, 1, cblock);
- xfs_btree_copy_keys(cur, kp, ckp, numrecs);
-
- pp = xfs_btree_ptr_addr(cur, 1, block);
- cpp = xfs_btree_ptr_addr(cur, 1, cblock);
-
- for (i = 0; i < numrecs; i++) {
- error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
+ if (level > 1) {
+ error = xfs_btree_demote_node_child(cur, cblock, level,
+ numrecs);
if (error)
return error;
- }
-
- xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+ } else
+ xfs_btree_demote_leaf_child(cur, cblock, numrecs);
error = xfs_btree_free_block(cur, cbp);
if (error)
return error;
- cur->bc_levels[level - 1].bp = NULL;
- be16_add_cpu(&block->bb_level, -1);
xfs_trans_log_inode(cur->bc_tp, ip,
XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
- cur->bc_nlevels--;
out0:
return 0;
}
@@ -3944,10 +4122,10 @@ xfs_btree_delrec(
/*
* We're at the root level. First, shrink the root block in-memory.
* Try to get rid of the next level down. If we can't then there's
- * nothing left to do.
+ * nothing left to do. numrecs was decremented above.
*/
if (xfs_btree_at_iroot(cur, level)) {
- xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork);
+ cur->bc_ops->broot_realloc(cur, numrecs);
error = xfs_btree_kill_iroot(cur);
if (error)
@@ -4745,7 +4923,7 @@ xfs_btree_agblock_v5hdr_verify(
return __this_address;
if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
return __this_address;
- if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag))
return __this_address;
return NULL;
}
@@ -5156,7 +5334,7 @@ xfs_btree_count_blocks_helper(
int level,
void *data)
{
- xfs_extlen_t *blocks = data;
+ xfs_filblks_t *blocks = data;
(*blocks)++;
return 0;
@@ -5166,7 +5344,7 @@ xfs_btree_count_blocks_helper(
int
xfs_btree_count_blocks(
struct xfs_btree_cur *cur,
- xfs_extlen_t *blocks)
+ xfs_filblks_t *blocks)
{
*blocks = 0;
return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
@@ -5355,6 +5533,12 @@ xfs_btree_init_cur_caches(void)
error = xfs_refcountbt_init_cur_cache();
if (error)
goto err;
+ error = xfs_rtrmapbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_rtrefcountbt_init_cur_cache();
+ if (error)
+ goto err;
return 0;
err:
@@ -5371,6 +5555,8 @@ xfs_btree_destroy_cur_caches(void)
xfs_bmbt_destroy_cur_cache();
xfs_rmapbt_destroy_cur_cache();
xfs_refcountbt_destroy_cur_cache();
+ xfs_rtrmapbt_destroy_cur_cache();
+ xfs_rtrefcountbt_destroy_cur_cache();
}
/* Move the btree cursor before the first record. */
@@ -5399,3 +5585,67 @@ xfs_btree_goto_left_edge(
return 0;
}
+
+/* Allocate a block for an inode-rooted metadata btree. */
+int
+xfs_btree_alloc_metafile_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ struct xfs_alloc_arg args = {
+ .mp = cur->bc_mp,
+ .tp = cur->bc_tp,
+ .resv = XFS_AG_RESV_METAFILE,
+ .minlen = 1,
+ .maxlen = 1,
+ .prod = 1,
+ };
+ struct xfs_inode *ip = cur->bc_ino.ip;
+ int error;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+
+ xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, cur->bc_ino.whichfork);
+ error = xfs_alloc_vextent_start_ag(&args,
+ XFS_INO_TO_FSB(cur->bc_mp, ip->i_ino));
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK) {
+ *stat = 0;
+ return 0;
+ }
+ ASSERT(args.len == 1);
+
+ xfs_metafile_resv_alloc_space(ip, &args);
+
+ new->l = cpu_to_be64(args.fsbno);
+ *stat = 1;
+ return 0;
+}
+
+/* Free a block from an inode-rooted metadata btree. */
+int
+xfs_btree_free_metafile_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip = cur->bc_ino.ip;
+ struct xfs_trans *tp = cur->bc_tp;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ int error;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
+ error = xfs_free_extent_later(tp, fsbno, 1, &oinfo, XFS_AG_RESV_METAFILE,
+ 0);
+ if (error)
+ return error;
+
+ xfs_metafile_resv_free_space(ip, tp, 1);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 10b7ddc3b2b3..355b304696e6 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -135,7 +135,7 @@ struct xfs_btree_ops {
/* offset of btree stats array */
unsigned int statoff;
- /* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */
+ /* sick mask for health reporting (not for bmap btrees) */
unsigned int sick_mask;
/* cursor operations */
@@ -213,11 +213,27 @@ struct xfs_btree_ops {
const union xfs_btree_key *key1,
const union xfs_btree_key *key2,
const union xfs_btree_key *mask);
+
+ /*
+ * Reallocate the space for if_broot to fit the number of records.
+ * Move the records and pointers in if_broot to fit the new size. When
+ * shrinking this will eliminate holes between the records and pointers
+ * created by the caller. When growing this will create holes to be
+ * filled in by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root. If the if_broot is currently NULL, then if
+ * we are adding records, one will be allocated. The caller must also
+ * not request that the number of records go below zero, although it
+ * can go to zero.
+ */
+ struct xfs_btree_block *(*broot_realloc)(struct xfs_btree_cur *cur,
+ unsigned int new_numrecs);
};
/* btree geometry flags */
#define XFS_BTGEO_OVERLAPPING (1U << 0) /* overlapping intervals */
-
+#define XFS_BTGEO_IROOT_RECORDS (1U << 1) /* iroot can store records */
union xfs_btree_irec {
struct xfs_alloc_rec_incore a;
@@ -254,6 +270,7 @@ struct xfs_btree_cur
union xfs_btree_irec bc_rec; /* current insert/search record value */
uint8_t bc_nlevels; /* number of levels in the tree */
uint8_t bc_maxlevels; /* maximum levels for this btree type */
+ struct xfs_group *bc_group;
/* per-type information */
union {
@@ -264,13 +281,11 @@ struct xfs_btree_cur
struct xbtree_ifakeroot *ifake; /* for staging cursor */
} bc_ino;
struct {
- struct xfs_perag *pag;
struct xfs_buf *agbp;
struct xbtree_afakeroot *afake; /* for staging cursor */
} bc_ag;
struct {
struct xfbtree *xfbtree;
- struct xfs_perag *pag;
} bc_mem;
};
@@ -282,7 +297,7 @@ struct xfs_btree_cur
struct {
unsigned int nr_ops; /* # record updates */
unsigned int shape_changes; /* # of extent splits */
- } bc_refc; /* refcountbt */
+ } bc_refc; /* refcountbt/rtrefcountbt */
};
/* Must be at the end of the struct! */
@@ -485,7 +500,7 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data);
-int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
+int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_filblks_t *blocks);
union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n,
struct xfs_btree_block *block);
@@ -688,4 +703,10 @@ xfs_btree_at_iroot(
level == cur->bc_nlevels - 1;
}
+int xfs_btree_alloc_metafile_block(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start, union xfs_btree_ptr *newp,
+ int *stat);
+int xfs_btree_free_metafile_block(struct xfs_btree_cur *cur,
+ struct xfs_buf *bp);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c
index 036061fe32cc..f2f7b4305413 100644
--- a/fs/xfs/libxfs/xfs_btree_mem.c
+++ b/fs/xfs/libxfs/xfs_btree_mem.c
@@ -18,6 +18,7 @@
#include "xfs_ag.h"
#include "xfs_buf_item.h"
#include "xfs_trace.h"
+#include "xfs_rtgroup.h"
/* Set the root of an in-memory btree. */
void
@@ -57,10 +58,8 @@ xfbtree_dup_cursor(
ncur->bc_flags = cur->bc_flags;
ncur->bc_nlevels = cur->bc_nlevels;
ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree;
-
- if (cur->bc_mem.pag)
- ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag);
-
+ if (cur->bc_group)
+ ncur->bc_group = xfs_group_hold(cur->bc_group);
return ncur;
}
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index 694929703152..5ed84f9cc877 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -134,6 +134,7 @@ xfs_btree_stage_ifakeroot(
cur->bc_ino.ifake = ifake;
cur->bc_nlevels = ifake->if_levels;
cur->bc_ino.forksize = ifake->if_fork_size;
+ cur->bc_ino.whichfork = XFS_STAGING_FORK;
cur->bc_flags |= XFS_BTREE_STAGING;
}
@@ -573,6 +574,7 @@ xfs_btree_bload_compute_geometry(
struct xfs_btree_bload *bbl,
uint64_t nr_records)
{
+ const struct xfs_btree_ops *ops = cur->bc_ops;
uint64_t nr_blocks = 0;
uint64_t nr_this_level;
@@ -599,7 +601,7 @@ xfs_btree_bload_compute_geometry(
xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
&avg_per_block, &level_blocks, &dontcare64);
- if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
+ if (ops->type == XFS_BTREE_TYPE_INODE) {
/*
* If all the items we want to store at this level
* would fit in the inode root block, then we have our
@@ -607,7 +609,9 @@ xfs_btree_bload_compute_geometry(
*
* Note that bmap btrees forbid records in the root.
*/
- if (level != 0 && nr_this_level <= avg_per_block) {
+ if ((level != 0 ||
+ (ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) &&
+ nr_this_level <= avg_per_block) {
nr_blocks++;
break;
}
@@ -658,7 +662,7 @@ xfs_btree_bload_compute_geometry(
return -EOVERFLOW;
bbl->btree_height = cur->bc_nlevels;
- if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
+ if (ops->type == XFS_BTREE_TYPE_INODE)
bbl->nr_blocks = nr_blocks - 1;
else
bbl->nr_blocks = nr_blocks;
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 2cd212ad2c1d..5b377cbbb1f7 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -846,6 +846,12 @@ xfs_defer_add(
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ if (!ops->finish_item) {
+ ASSERT(ops->finish_item != NULL);
+ xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+ return NULL;
+ }
+
dfp = xfs_defer_find_last(tp, ops);
if (!dfp || !xfs_defer_can_append(dfp, ops))
dfp = xfs_defer_alloc(&tp->t_dfops, ops);
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 8b338031e487..9effd95ddcd4 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -68,9 +68,12 @@ struct xfs_defer_op_type {
extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type;
extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_rtextent_free_defer_type;
extern const struct xfs_defer_op_type xfs_attr_defer_type;
extern const struct xfs_defer_op_type xfs_exchmaps_defer_type;
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 202468223bf9..1775abcfa04d 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -197,7 +197,7 @@ xfs_da_unmount(
/*
* Return 1 if directory contains only "." and "..".
*/
-int
+static bool
xfs_dir_isempty(
xfs_inode_t *dp)
{
@@ -205,9 +205,9 @@ xfs_dir_isempty(
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
if (dp->i_disk_size == 0) /* might happen during shutdown. */
- return 1;
+ return true;
if (dp->i_disk_size > xfs_inode_data_fork_size(dp))
- return 0;
+ return false;
sfp = dp->i_df.if_data;
return !sfp->count;
}
@@ -379,12 +379,11 @@ xfs_dir_cilookup_result(
!(args->op_flags & XFS_DA_OP_CILOOKUP))
return -EEXIST;
- args->value = kmalloc(len,
+ args->value = kmemdup(name, len,
GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL);
if (!args->value)
return -ENOMEM;
- memcpy(args->value, name, len);
args->valuelen = len;
return -EEXIST;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 576068ed81fa..a6594a5a941d 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -58,7 +58,6 @@ extern void xfs_dir_startup(void);
extern int xfs_da_mount(struct xfs_mount *mp);
extern void xfs_da_unmount(struct xfs_mount *mp);
-extern int xfs_dir_isempty(struct xfs_inode *dp);
extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_inode *pdp);
extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 15a362e2f5ea..dceef2abd4e2 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -16,6 +16,9 @@
#include "xfs_trans.h"
#include "xfs_qm.h"
#include "xfs_error.h"
+#include "xfs_health.h"
+#include "xfs_metadir.h"
+#include "xfs_metafile.h"
int
xfs_calc_dquots_per_chunk(
@@ -323,3 +326,190 @@ xfs_dquot_to_disk_ts(
return cpu_to_be32(t);
}
+
+inline unsigned int
+xfs_dqinode_sick_mask(xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return XFS_SICK_FS_UQUOTA;
+ case XFS_DQTYPE_GROUP:
+ return XFS_SICK_FS_GQUOTA;
+ case XFS_DQTYPE_PROJ:
+ return XFS_SICK_FS_PQUOTA;
+ }
+
+ ASSERT(0);
+ return 0;
+}
+
+/*
+ * Load the inode for a given type of quota, assuming that the sb fields have
+ * been sorted out. This is not true when switching quota types on a V4
+ * filesystem, so do not use this function for that. If metadir is enabled,
+ * @dp must be the /quota metadir.
+ *
+ * Returns -ENOENT if the quota inode field is NULLFSINO; 0 and an inode on
+ * success; or a negative errno.
+ */
+int
+xfs_dqinode_load(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dqtype_t type,
+ struct xfs_inode **ipp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *ip;
+ enum xfs_metafile_type metafile_type = xfs_dqinode_metafile_type(type);
+ int error;
+
+ if (!xfs_has_metadir(mp)) {
+ xfs_ino_t ino;
+
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ ino = mp->m_sb.sb_uquotino;
+ break;
+ case XFS_DQTYPE_GROUP:
+ ino = mp->m_sb.sb_gquotino;
+ break;
+ case XFS_DQTYPE_PROJ:
+ ino = mp->m_sb.sb_pquotino;
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ /* Should have set 0 to NULLFSINO when loading superblock */
+ if (ino == NULLFSINO)
+ return -ENOENT;
+
+ error = xfs_trans_metafile_iget(tp, ino, metafile_type, &ip);
+ } else {
+ error = xfs_metadir_load(tp, dp, xfs_dqinode_path(type),
+ metafile_type, &ip);
+ if (error == -ENOENT)
+ return error;
+ }
+ if (error) {
+ if (xfs_metadata_is_sick(error))
+ xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type));
+ return error;
+ }
+
+ if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE)) {
+ xfs_irele(ip);
+ xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type));
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_IS_CORRUPT(mp, ip->i_projid != 0)) {
+ xfs_irele(ip);
+ xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type));
+ return -EFSCORRUPTED;
+ }
+
+ *ipp = ip;
+ return 0;
+}
+
+/* Create a metadata directory quota inode. */
+int
+xfs_dqinode_metadir_create(
+ struct xfs_inode *dp,
+ xfs_dqtype_t type,
+ struct xfs_inode **ipp)
+{
+ struct xfs_metadir_update upd = {
+ .dp = dp,
+ .metafile_type = xfs_dqinode_metafile_type(type),
+ .path = xfs_dqinode_path(type),
+ };
+ int error;
+
+ error = xfs_metadir_start_create(&upd);
+ if (error)
+ return error;
+
+ error = xfs_metadir_create(&upd, S_IFREG);
+ if (error)
+ return error;
+
+ xfs_trans_log_inode(upd.tp, upd.ip, XFS_ILOG_CORE);
+
+ error = xfs_metadir_commit(&upd);
+ if (error)
+ return error;
+
+ xfs_finish_inode_setup(upd.ip);
+ *ipp = upd.ip;
+ return 0;
+}
+
+#ifndef __KERNEL__
+/* Link a metadata directory quota inode. */
+int
+xfs_dqinode_metadir_link(
+ struct xfs_inode *dp,
+ xfs_dqtype_t type,
+ struct xfs_inode *ip)
+{
+ struct xfs_metadir_update upd = {
+ .dp = dp,
+ .metafile_type = xfs_dqinode_metafile_type(type),
+ .path = xfs_dqinode_path(type),
+ .ip = ip,
+ };
+ int error;
+
+ error = xfs_metadir_start_link(&upd);
+ if (error)
+ return error;
+
+ error = xfs_metadir_link(&upd);
+ if (error)
+ return error;
+
+ xfs_trans_log_inode(upd.tp, upd.ip, XFS_ILOG_CORE);
+
+ return xfs_metadir_commit(&upd);
+}
+#endif /* __KERNEL__ */
+
+/* Create the parent directory for all quota inodes and load it. */
+int
+xfs_dqinode_mkdir_parent(
+ struct xfs_mount *mp,
+ struct xfs_inode **dpp)
+{
+ if (!mp->m_metadirip) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ return xfs_metadir_mkdir(mp->m_metadirip, "quota", dpp);
+}
+
+/*
+ * Load the parent directory of all quota inodes. Pass the inode to the caller
+ * because quota functions (e.g. QUOTARM) can be called on the quota files even
+ * if quotas are not enabled.
+ */
+int
+xfs_dqinode_load_parent(
+ struct xfs_trans *tp,
+ struct xfs_inode **dpp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (!mp->m_metadirip) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ return xfs_metadir_load(tp, mp->m_metadirip, "quota", XFS_METAFILE_DIR,
+ dpp);
+}
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 7002d7676a78..a53c5d40e084 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -64,7 +64,8 @@
#define XFS_ERRTAG_WB_DELAY_MS 42
#define XFS_ERRTAG_WRITE_DELAY_MS 43
#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44
-#define XFS_ERRTAG_MAX 45
+#define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45
+#define XFS_ERRTAG_MAX 46
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -113,5 +114,6 @@
#define XFS_RANDOM_WB_DELAY_MS 3000
#define XFS_RANDOM_WRITE_DELAY_MS 3000
#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1
+#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c
index 2021396651de..3f1d6a98c118 100644
--- a/fs/xfs/libxfs/xfs_exchmaps.c
+++ b/fs/xfs/libxfs/xfs_exchmaps.c
@@ -662,7 +662,9 @@ xfs_exchmaps_rmapbt_blocks(
if (!xfs_has_rmapbt(mp))
return 0;
if (XFS_IS_REALTIME_INODE(req->ip1))
- return 0;
+ return howmany_64(req->nr_exchanges,
+ XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) *
+ XFS_RTRMAPADD_SPACE_RES(mp);
return howmany_64(req->nr_exchanges,
XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) *
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index e1bfee0c3b1a..9566a7623365 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -174,6 +174,15 @@ typedef struct xfs_sb {
xfs_lsn_t sb_lsn; /* last write sequence */
uuid_t sb_meta_uuid; /* metadata file system unique id */
+ xfs_ino_t sb_metadirino; /* metadata directory tree root */
+
+ xfs_rgnumber_t sb_rgcount; /* number of realtime groups */
+ xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */
+ uint8_t sb_rgblklog; /* rt group number shift */
+ uint8_t sb_pad[7]; /* zeroes */
+ xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */
+ xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */
+
/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -259,7 +268,20 @@ struct xfs_dsb {
__be64 sb_lsn; /* last write sequence */
uuid_t sb_meta_uuid; /* metadata file system unique id */
- /* must be padded to 64 bit alignment */
+ __be64 sb_metadirino; /* metadata directory tree root */
+ __be32 sb_rgcount; /* # of realtime groups */
+ __be32 sb_rgextents; /* size of rtgroup in rtx */
+ __u8 sb_rgblklog; /* rt group number shift */
+ __u8 sb_pad[7]; /* zeroes */
+ __be64 sb_rtstart; /* start of internal RT section (FSB) */
+ __be64 sb_rtreserved; /* reserved (zoned) RT blocks */
+
+ /*
+ * The size of this structure must be padded to 64 bit alignment.
+ *
+ * NOTE: Don't forget to update secondary_sb_whack in xfs_repair when
+ * adding new fields here.
+ */
};
#define XFS_SB_CRC_OFF offsetof(struct xfs_dsb, sb_crc)
@@ -278,7 +300,7 @@ struct xfs_dsb {
#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
-static inline bool xfs_sb_is_v5(struct xfs_sb *sbp)
+static inline bool xfs_sb_is_v5(const struct xfs_sb *sbp)
{
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
@@ -287,12 +309,12 @@ static inline bool xfs_sb_is_v5(struct xfs_sb *sbp)
* Detect a mismatched features2 field. Older kernels read/wrote
* this into the wrong slot, so to be safe we keep them in sync.
*/
-static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
+static inline bool xfs_sb_has_mismatched_features2(const struct xfs_sb *sbp)
{
return sbp->sb_bad_features2 != sbp->sb_features2;
}
-static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
+static inline bool xfs_sb_version_hasmorebits(const struct xfs_sb *sbp)
{
return xfs_sb_is_v5(sbp) ||
(sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
@@ -342,8 +364,8 @@ static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp)
#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL
static inline bool
xfs_sb_has_compat_feature(
- struct xfs_sb *sbp,
- uint32_t feature)
+ const struct xfs_sb *sbp,
+ uint32_t feature)
{
return (sbp->sb_features_compat & feature) != 0;
}
@@ -360,8 +382,8 @@ xfs_sb_has_compat_feature(
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
static inline bool
xfs_sb_has_ro_compat_feature(
- struct xfs_sb *sbp,
- uint32_t feature)
+ const struct xfs_sb *sbp,
+ uint32_t feature)
{
return (sbp->sb_features_ro_compat & feature) != 0;
}
@@ -374,6 +396,10 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */
#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
+#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */
+#define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */
+#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */
+
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE | \
XFS_SB_FEAT_INCOMPAT_SPINODES | \
@@ -382,13 +408,16 @@ xfs_sb_has_ro_compat_feature(
XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR | \
XFS_SB_FEAT_INCOMPAT_NREXT64 | \
XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
- XFS_SB_FEAT_INCOMPAT_PARENT)
+ XFS_SB_FEAT_INCOMPAT_PARENT | \
+ XFS_SB_FEAT_INCOMPAT_METADIR | \
+ XFS_SB_FEAT_INCOMPAT_ZONED | \
+ XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
xfs_sb_has_incompat_feature(
- struct xfs_sb *sbp,
- uint32_t feature)
+ const struct xfs_sb *sbp,
+ uint32_t feature)
{
return (sbp->sb_features_incompat & feature) != 0;
}
@@ -399,8 +428,8 @@ xfs_sb_has_incompat_feature(
#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
static inline bool
xfs_sb_has_incompat_log_feature(
- struct xfs_sb *sbp,
- uint32_t feature)
+ const struct xfs_sb *sbp,
+ uint32_t feature)
{
return (sbp->sb_features_log_incompat & feature) != 0;
}
@@ -420,7 +449,7 @@ xfs_sb_add_incompat_log_features(
sbp->sb_features_log_incompat |= features;
}
-static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp)
+static inline bool xfs_sb_version_haslogxattrs(const struct xfs_sb *sbp)
{
return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat &
XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
@@ -694,20 +723,57 @@ struct xfs_agfl {
/*
* Realtime bitmap information is accessed by the word, which is currently
- * stored in host-endian format.
+ * stored in host-endian format. Starting with the realtime groups feature,
+ * the words are stored in be32 ondisk.
*/
union xfs_rtword_raw {
__u32 old;
+ __be32 rtg;
};
/*
* Realtime summary counts are accessed by the word, which is currently
- * stored in host-endian format.
+ * stored in host-endian format. Starting with the realtime groups feature,
+ * the words are stored in be32 ondisk.
*/
union xfs_suminfo_raw {
__u32 old;
+ __be32 rtg;
+};
+
+/*
+ * Realtime allocation groups break the rt section into multiple pieces that
+ * could be locked independently. Realtime block group numbers are 32-bit
+ * quantities. Block numbers within a group are also 32-bit quantities, but
+ * the upper bit must never be set. rtgroup 0 might have a superblock in it,
+ * so the minimum size of an rtgroup is 2 rtx.
+ */
+#define XFS_MAX_RGBLOCKS ((xfs_rgblock_t)(1U << 31) - 1)
+#define XFS_MIN_RGEXTENTS ((xfs_rtxlen_t)2)
+#define XFS_MAX_RGNUMBER ((xfs_rgnumber_t)(-1U))
+
+#define XFS_RTSB_MAGIC 0x46726F67 /* 'Frog' */
+
+/*
+ * Realtime superblock - on disk version. Must be padded to 64 bit alignment.
+ * The first block of the realtime volume contains this superblock.
+ */
+struct xfs_rtsb {
+ __be32 rsb_magicnum; /* magic number == XFS_RTSB_MAGIC */
+ __le32 rsb_crc; /* superblock crc */
+
+ __be32 rsb_pad; /* zero */
+ unsigned char rsb_fname[XFSLABEL_MAX]; /* file system name */
+
+ uuid_t rsb_uuid; /* user-visible file system unique id */
+ uuid_t rsb_meta_uuid; /* metadata file system unique id */
+
+ /* must be padded to 64 bit alignment */
};
+#define XFS_RTSB_CRC_OFF offsetof(struct xfs_rtsb, rsb_crc)
+#define XFS_RTSB_DADDR ((xfs_daddr_t)0) /* daddr in rt section */
+
/*
* XFS Timestamps
* ==============
@@ -790,6 +856,31 @@ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds)
return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET;
}
+enum xfs_metafile_type {
+ XFS_METAFILE_UNKNOWN, /* unknown */
+ XFS_METAFILE_DIR, /* metadir directory */
+ XFS_METAFILE_USRQUOTA, /* user quota */
+ XFS_METAFILE_GRPQUOTA, /* group quota */
+ XFS_METAFILE_PRJQUOTA, /* project quota */
+ XFS_METAFILE_RTBITMAP, /* rt bitmap */
+ XFS_METAFILE_RTSUMMARY, /* rt summary */
+ XFS_METAFILE_RTRMAP, /* rt rmap */
+ XFS_METAFILE_RTREFCOUNT, /* rt refcount */
+
+ XFS_METAFILE_MAX
+} __packed;
+
+#define XFS_METAFILE_TYPE_STR \
+ { XFS_METAFILE_UNKNOWN, "unknown" }, \
+ { XFS_METAFILE_DIR, "dir" }, \
+ { XFS_METAFILE_USRQUOTA, "usrquota" }, \
+ { XFS_METAFILE_GRPQUOTA, "grpquota" }, \
+ { XFS_METAFILE_PRJQUOTA, "prjquota" }, \
+ { XFS_METAFILE_RTBITMAP, "rtbitmap" }, \
+ { XFS_METAFILE_RTSUMMARY, "rtsummary" }, \
+ { XFS_METAFILE_RTRMAP, "rtrmap" }, \
+ { XFS_METAFILE_RTREFCOUNT, "rtrefcount" }
+
/*
* On-disk inode structure.
*
@@ -812,7 +903,7 @@ struct xfs_dinode {
__be16 di_mode; /* mode and type of file */
__u8 di_version; /* inode version */
__u8 di_format; /* format of di_c data */
- __be16 di_onlink; /* old number of links to file */
+ __be16 di_metatype; /* XFS_METAFILE_*; was di_onlink */
__be32 di_uid; /* owner's user id */
__be32 di_gid; /* owner's group id */
__be32 di_nlink; /* number of links to file */
@@ -868,7 +959,12 @@ struct xfs_dinode {
__be64 di_changecount; /* number of attribute changes */
__be64 di_lsn; /* flush sequence */
__be64 di_flags2; /* more random flags */
- __be32 di_cowextsize; /* basic cow extent size for file */
+ union {
+ /* basic cow extent size for (regular) file */
+ __be32 di_cowextsize;
+ /* used blocks in RTG for (zoned) rtrmap inode */
+ __be32 di_used_blocks;
+ };
__u8 di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
@@ -917,7 +1013,8 @@ enum xfs_dinode_fmt {
XFS_DINODE_FMT_LOCAL, /* bulk data */
XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
- XFS_DINODE_FMT_UUID /* added long ago, but never used */
+ XFS_DINODE_FMT_UUID, /* added long ago, but never used */
+ XFS_DINODE_FMT_META_BTREE, /* metadata btree */
};
#define XFS_INODE_FORMAT_STR \
@@ -925,7 +1022,8 @@ enum xfs_dinode_fmt {
{ XFS_DINODE_FMT_LOCAL, "local" }, \
{ XFS_DINODE_FMT_EXTENTS, "extent" }, \
{ XFS_DINODE_FMT_BTREE, "btree" }, \
- { XFS_DINODE_FMT_UUID, "uuid" }
+ { XFS_DINODE_FMT_UUID, "uuid" }, \
+ { XFS_DINODE_FMT_META_BTREE, "meta_btree" }
/*
* Max values for extnum and aextnum.
@@ -1088,21 +1186,60 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
* Values for di_flags2 These start by being exposed to userspace in the upper
* 16 bits of the XFS_XFLAG_s range.
*/
-#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
-#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
-#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
-#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
-#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
+/* use DAX for this inode */
+#define XFS_DIFLAG2_DAX_BIT 0
+
+/* file's blocks may be shared */
+#define XFS_DIFLAG2_REFLINK_BIT 1
+
+/* copy on write extent size hint */
+#define XFS_DIFLAG2_COWEXTSIZE_BIT 2
-#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
-#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
-#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
-#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
-#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
+/* big timestamps */
+#define XFS_DIFLAG2_BIGTIME_BIT 3
+
+/* large extent counters */
+#define XFS_DIFLAG2_NREXT64_BIT 4
+
+/*
+ * The inode contains filesystem metadata and can be found through the metadata
+ * directory tree. Metadata inodes must satisfy the following constraints:
+ *
+ * - V5 filesystem (and ftype) are enabled;
+ * - The only valid modes are regular files and directories;
+ * - The access bits must be zero;
+ * - DMAPI event and state masks are zero;
+ * - The user and group IDs must be zero;
+ * - The project ID can be used as a u32 annotation;
+ * - The immutable, sync, noatime, nodump, nodefrag flags must be set.
+ * - The dax flag must not be set.
+ * - Directories must have nosymlinks set.
+ *
+ * These requirements are chosen defensively to minimize the ability of
+ * userspace to read or modify the contents, should a metadata file ever
+ * escape to userspace.
+ *
+ * There are further constraints on the directory tree itself:
+ *
+ * - Metadata inodes must never be resolvable through the root directory;
+ * - They must never be accessed by userspace;
+ * - Metadata directory entries must have correct ftype.
+ *
+ * Superblock-rooted metadata files must have the METADATA iflag set even
+ * though they do not have a parent directory.
+ */
+#define XFS_DIFLAG2_METADATA_BIT 5
+
+#define XFS_DIFLAG2_DAX (1ULL << XFS_DIFLAG2_DAX_BIT)
+#define XFS_DIFLAG2_REFLINK (1ULL << XFS_DIFLAG2_REFLINK_BIT)
+#define XFS_DIFLAG2_COWEXTSIZE (1ULL << XFS_DIFLAG2_COWEXTSIZE_BIT)
+#define XFS_DIFLAG2_BIGTIME (1ULL << XFS_DIFLAG2_BIGTIME_BIT)
+#define XFS_DIFLAG2_NREXT64 (1ULL << XFS_DIFLAG2_NREXT64_BIT)
+#define XFS_DIFLAG2_METADATA (1ULL << XFS_DIFLAG2_METADATA_BIT)
#define XFS_DIFLAG2_ANY \
(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
- XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
+ XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_METADATA)
static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
{
@@ -1117,6 +1254,12 @@ static inline bool xfs_dinode_has_large_extent_counts(
(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64));
}
+static inline bool xfs_dinode_is_metadir(const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA));
+}
+
/*
* Inode number format:
* low inopblog bits - offset in block
@@ -1165,6 +1308,24 @@ static inline bool xfs_dinode_has_large_extent_counts(
#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
/*
+ * RT bit manipulation macros.
+ */
+#define XFS_RTBITMAP_MAGIC 0x424D505A /* BMPZ */
+#define XFS_RTSUMMARY_MAGIC 0x53554D59 /* SUMY */
+
+struct xfs_rtbuf_blkinfo {
+ __be32 rt_magic; /* validity check on block */
+ __be32 rt_crc; /* CRC of block */
+ __be64 rt_owner; /* inode that owns the block */
+ __be64 rt_blkno; /* first block of the buffer */
+ __be64 rt_lsn; /* sequence number of last write */
+ uuid_t rt_uuid; /* filesystem we belong to */
+};
+
+#define XFS_RTBUF_CRC_OFF \
+ offsetof(struct xfs_rtbuf_blkinfo, rt_crc)
+
+/*
* Dquot and dquot block format definitions
*/
#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
@@ -1583,6 +1744,24 @@ typedef __be32 xfs_rmap_ptr_t;
XFS_IBT_BLOCK(mp) + 1)
/*
+ * Realtime Reverse mapping btree format definitions
+ *
+ * This is a btree for reverse mapping records for realtime volumes
+ */
+#define XFS_RTRMAP_CRC_MAGIC 0x4d415052 /* 'MAPR' */
+
+/*
+ * rtrmap root header, on-disk form only.
+ */
+struct xfs_rtrmap_root {
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+};
+
+/* inode-based btree pointer type */
+typedef __be64 xfs_rtrmap_ptr_t;
+
+/*
* Reference Count Btree format definitions
*
*/
@@ -1625,12 +1804,29 @@ struct xfs_refcount_key {
__be32 rc_startblock; /* starting block number */
};
-#define MAXREFCOUNT ((xfs_nlink_t)~0U)
-#define MAXREFCEXTLEN ((xfs_extlen_t)~0U)
+#define XFS_REFC_REFCOUNT_MAX ((xfs_nlink_t)~0U)
+#define XFS_REFC_LEN_MAX ((xfs_extlen_t)~0U)
/* btree pointer type */
typedef __be32 xfs_refcount_ptr_t;
+/*
+ * Realtime Reference Count btree format definitions
+ *
+ * This is a btree for reference count records for realtime volumes
+ */
+#define XFS_RTREFC_CRC_MAGIC 0x52434e54 /* 'RCNT' */
+
+/*
+ * rt refcount root header, on-disk form only.
+ */
+struct xfs_rtrefcount_root {
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+};
+
+/* inode-rooted btree pointer type */
+typedef __be64 xfs_rtrefcount_ptr_t;
/*
* BMAP Btree format definitions
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 860284064c5a..12463ba766da 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -187,7 +187,11 @@ struct xfs_fsop_geom {
__u32 logsunit; /* log stripe unit, bytes */
uint32_t sick; /* o: unhealthy fs & rt metadata */
uint32_t checked; /* o: checked fs & rt metadata */
- __u64 reserved[17]; /* reserved space */
+ __u32 rgextents; /* rt extents in a realtime group */
+ __u32 rgcount; /* number of realtime groups */
+ __u64 rtstart; /* start of internal rt section */
+ __u64 rtreserved; /* RT (zoned) reserved blocks */
+ __u64 reserved[14]; /* reserved space */
};
#define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */
@@ -198,6 +202,8 @@ struct xfs_fsop_geom {
#define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */
#define XFS_FSOP_GEOM_SICK_QUOTACHECK (1 << 6) /* quota counts */
#define XFS_FSOP_GEOM_SICK_NLINKS (1 << 7) /* inode link counts */
+#define XFS_FSOP_GEOM_SICK_METADIR (1 << 8) /* metadata directory */
+#define XFS_FSOP_GEOM_SICK_METAPATH (1 << 9) /* metadir tree path */
/* Output for XFS_FS_COUNTS */
typedef struct xfs_fsop_counts {
@@ -242,6 +248,8 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */
#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */
#define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */
+#define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */
+#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */
/*
* Minimum and maximum sizes need for growth checks.
@@ -489,9 +497,17 @@ struct xfs_bulk_ireq {
*/
#define XFS_BULK_IREQ_NREXT64 (1U << 2)
+/*
+ * Allow bulkstat to return information about metadata directories. This
+ * enables xfs_scrub to find them for scanning, as they are otherwise ordinary
+ * directories.
+ */
+#define XFS_BULK_IREQ_METADIR (1U << 3)
+
#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
XFS_BULK_IREQ_SPECIAL | \
- XFS_BULK_IREQ_NREXT64)
+ XFS_BULK_IREQ_NREXT64 | \
+ XFS_BULK_IREQ_METADIR)
/* Operate on the root directory inode. */
#define XFS_BULK_IREQ_SPECIAL_ROOT (1)
@@ -722,9 +738,13 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */
#define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */
#define XFS_SCRUB_TYPE_DIRTREE 28 /* directory tree structure */
+#define XFS_SCRUB_TYPE_METAPATH 29 /* metadata directory tree paths */
+#define XFS_SCRUB_TYPE_RGSUPER 30 /* realtime superblock */
+#define XFS_SCRUB_TYPE_RTRMAPBT 31 /* rtgroup reverse mapping btree */
+#define XFS_SCRUB_TYPE_RTREFCBT 32 /* realtime reference count btree */
/* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR 29
+#define XFS_SCRUB_TYPE_NR 33
/*
* This special type code only applies to the vectored scrub implementation.
@@ -803,6 +823,24 @@ struct xfs_scrub_vec_head {
#define XFS_SCRUB_VEC_FLAGS_ALL (0)
/*
+ * i: sm_ino values for XFS_SCRUB_TYPE_METAPATH to select a metadata file for
+ * path checking.
+ */
+#define XFS_SCRUB_METAPATH_PROBE (0) /* do we have a metapath scrubber? */
+#define XFS_SCRUB_METAPATH_RTDIR (1) /* rtrgroups metadir */
+#define XFS_SCRUB_METAPATH_RTBITMAP (2) /* per-rtg bitmap */
+#define XFS_SCRUB_METAPATH_RTSUMMARY (3) /* per-rtg summary */
+#define XFS_SCRUB_METAPATH_QUOTADIR (4) /* quota metadir */
+#define XFS_SCRUB_METAPATH_USRQUOTA (5) /* user quota */
+#define XFS_SCRUB_METAPATH_GRPQUOTA (6) /* group quota */
+#define XFS_SCRUB_METAPATH_PRJQUOTA (7) /* project quota */
+#define XFS_SCRUB_METAPATH_RTRMAPBT (8) /* realtime reverse mapping */
+#define XFS_SCRUB_METAPATH_RTREFCOUNTBT (9) /* realtime refcount */
+
+/* Number of metapath sm_ino values */
+#define XFS_SCRUB_METAPATH_NR (10)
+
+/*
* ioctl limits
*/
#ifdef XATTR_LIST_MAX
@@ -949,6 +987,23 @@ struct xfs_getparents_by_handle {
};
/*
+ * Output for XFS_IOC_RTGROUP_GEOMETRY
+ */
+struct xfs_rtgroup_geometry {
+ __u32 rg_number; /* i/o: rtgroup number */
+ __u32 rg_length; /* o: length in blocks */
+ __u32 rg_sick; /* o: sick things in ag */
+ __u32 rg_checked; /* o: checked metadata in ag */
+ __u32 rg_flags; /* i/o: flags for this ag */
+ __u32 rg_reserved[27]; /* o: zero */
+};
+#define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */
+#define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */
+#define XFS_RTGROUP_GEOM_SICK_SUMMARY (1U << 2) /* rtsummary */
+#define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */
+#define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */
+
+/*
* ioctl commands that are used by Linux filesystems
*/
#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
@@ -986,6 +1041,7 @@ struct xfs_getparents_by_handle {
#define XFS_IOC_GETPARENTS _IOWR('X', 62, struct xfs_getparents)
#define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle)
#define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head)
+#define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry)
/*
* ioctl commands that replace IRIX syssgi()'s
@@ -1026,6 +1082,15 @@ struct xfs_getparents_by_handle {
#define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
+/*
+ * Devices supported by a single XFS file system. Reported in fsmaps fmr_device
+ * when using internal RT devices.
+ */
+enum xfs_device {
+ XFS_DEV_DATA = 1,
+ XFS_DEV_LOG = 2,
+ XFS_DEV_RT = 3,
+};
#ifndef HAVE_BBMACROS
/*
diff --git a/fs/xfs/libxfs/xfs_group.c b/fs/xfs/libxfs/xfs_group.c
new file mode 100644
index 000000000000..e9d76bcdc820
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_group.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Red Hat, Inc.
+ */
+
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_extent_busy.h"
+#include "xfs_group.h"
+
+/*
+ * Groups can have passive and active references.
+ *
+ * For passive references the code freeing a group is responsible for cleaning
+ * up objects that hold the passive references (e.g. cached buffers).
+ * Routines manipulating passive references are xfs_group_get, xfs_group_hold
+ * and xfs_group_put.
+ *
+ * Active references are for short term access to the group for walking trees or
+ * accessing state. If a group is being shrunk or offlined, the lookup will fail
+ * to find that group and return NULL instead.
+ * Routines manipulating active references are xfs_group_grab and
+ * xfs_group_rele.
+ */
+
+struct xfs_group *
+xfs_group_get(
+ struct xfs_mount *mp,
+ uint32_t index,
+ enum xfs_group_type type)
+{
+ struct xfs_group *xg;
+
+ rcu_read_lock();
+ xg = xa_load(&mp->m_groups[type].xa, index);
+ if (xg) {
+ trace_xfs_group_get(xg, _RET_IP_);
+ ASSERT(atomic_read(&xg->xg_ref) >= 0);
+ atomic_inc(&xg->xg_ref);
+ }
+ rcu_read_unlock();
+ return xg;
+}
+
+struct xfs_group *
+xfs_group_hold(
+ struct xfs_group *xg)
+{
+ ASSERT(atomic_read(&xg->xg_ref) > 0 ||
+ atomic_read(&xg->xg_active_ref) > 0);
+
+ trace_xfs_group_hold(xg, _RET_IP_);
+ atomic_inc(&xg->xg_ref);
+ return xg;
+}
+
+void
+xfs_group_put(
+ struct xfs_group *xg)
+{
+ trace_xfs_group_put(xg, _RET_IP_);
+
+ ASSERT(atomic_read(&xg->xg_ref) > 0);
+ atomic_dec(&xg->xg_ref);
+}
+
+struct xfs_group *
+xfs_group_grab(
+ struct xfs_mount *mp,
+ uint32_t index,
+ enum xfs_group_type type)
+{
+ struct xfs_group *xg;
+
+ rcu_read_lock();
+ xg = xa_load(&mp->m_groups[type].xa, index);
+ if (xg) {
+ trace_xfs_group_grab(xg, _RET_IP_);
+ if (!atomic_inc_not_zero(&xg->xg_active_ref))
+ xg = NULL;
+ }
+ rcu_read_unlock();
+ return xg;
+}
+
+/*
+ * Iterate to the next group. To start the iteration at @start_index, a %NULL
+ * @xg is passed, else the previous group returned from this function. The
+ * caller should break out of the loop when this returns %NULL. If the caller
+ * wants to break out of a loop that did not finish it needs to release the
+ * active reference to @xg using xfs_group_rele() itself.
+ */
+struct xfs_group *
+xfs_group_next_range(
+ struct xfs_mount *mp,
+ struct xfs_group *xg,
+ uint32_t start_index,
+ uint32_t end_index,
+ enum xfs_group_type type)
+{
+ uint32_t index = start_index;
+
+ if (xg) {
+ index = xg->xg_gno + 1;
+ xfs_group_rele(xg);
+ }
+ if (index > end_index)
+ return NULL;
+ return xfs_group_grab(mp, index, type);
+}
+
+/*
+ * Find the next group after @xg, or the first group if @xg is NULL.
+ */
+struct xfs_group *
+xfs_group_grab_next_mark(
+ struct xfs_mount *mp,
+ struct xfs_group *xg,
+ xa_mark_t mark,
+ enum xfs_group_type type)
+{
+ unsigned long index = 0;
+
+ if (xg) {
+ index = xg->xg_gno + 1;
+ xfs_group_rele(xg);
+ }
+
+ rcu_read_lock();
+ xg = xa_find(&mp->m_groups[type].xa, &index, ULONG_MAX, mark);
+ if (xg) {
+ trace_xfs_group_grab_next_tag(xg, _RET_IP_);
+ if (!atomic_inc_not_zero(&xg->xg_active_ref))
+ xg = NULL;
+ }
+ rcu_read_unlock();
+ return xg;
+}
+
+void
+xfs_group_rele(
+ struct xfs_group *xg)
+{
+ trace_xfs_group_rele(xg, _RET_IP_);
+ atomic_dec(&xg->xg_active_ref);
+}
+
+void
+xfs_group_free(
+ struct xfs_mount *mp,
+ uint32_t index,
+ enum xfs_group_type type,
+ void (*uninit)(struct xfs_group *xg))
+{
+ struct xfs_group *xg = xa_erase(&mp->m_groups[type].xa, index);
+
+ XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_ref) != 0);
+
+ xfs_defer_drain_free(&xg->xg_intents_drain);
+#ifdef __KERNEL__
+ kfree(xg->xg_busy_extents);
+#endif
+
+ if (uninit)
+ uninit(xg);
+
+ /* drop the mount's active reference */
+ xfs_group_rele(xg);
+ XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) != 0);
+ kfree_rcu_mightsleep(xg);
+}
+
+int
+xfs_group_insert(
+ struct xfs_mount *mp,
+ struct xfs_group *xg,
+ uint32_t index,
+ enum xfs_group_type type)
+{
+ int error;
+
+ xg->xg_mount = mp;
+ xg->xg_gno = index;
+ xg->xg_type = type;
+
+#ifdef __KERNEL__
+ xg->xg_busy_extents = xfs_extent_busy_alloc();
+ if (!xg->xg_busy_extents)
+ return -ENOMEM;
+ spin_lock_init(&xg->xg_state_lock);
+ xfs_hooks_init(&xg->xg_rmap_update_hooks);
+#endif
+ xfs_defer_drain_init(&xg->xg_intents_drain);
+
+ /* Active ref owned by mount indicates group is online. */
+ atomic_set(&xg->xg_active_ref, 1);
+
+ error = xa_insert(&mp->m_groups[type].xa, index, xg, GFP_KERNEL);
+ if (error) {
+ WARN_ON_ONCE(error == -EBUSY);
+ goto out_drain;
+ }
+
+ return 0;
+out_drain:
+ xfs_defer_drain_free(&xg->xg_intents_drain);
+#ifdef __KERNEL__
+ kfree(xg->xg_busy_extents);
+#endif
+ return error;
+}
+
+struct xfs_group *
+xfs_group_get_by_fsb(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno,
+ enum xfs_group_type type)
+{
+ return xfs_group_get(mp, xfs_fsb_to_gno(mp, fsbno, type), type);
+}
diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h
new file mode 100644
index 000000000000..4423932a2313
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_group.h
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018 Red Hat, Inc.
+ */
+#ifndef __LIBXFS_GROUP_H
+#define __LIBXFS_GROUP_H 1
+
+struct xfs_group {
+ struct xfs_mount *xg_mount;
+ uint32_t xg_gno;
+ enum xfs_group_type xg_type;
+ atomic_t xg_ref; /* passive reference count */
+ atomic_t xg_active_ref; /* active reference count */
+
+ /* Precalculated geometry info */
+ uint32_t xg_block_count; /* max usable gbno */
+ uint32_t xg_min_gbno; /* min usable gbno */
+
+#ifdef __KERNEL__
+ /* -- kernel only structures below this line -- */
+
+ union {
+ /*
+ * For perags and non-zoned RT groups:
+ * Track freed but not yet committed extents.
+ */
+ struct xfs_extent_busy_tree *xg_busy_extents;
+
+ /*
+ * For zoned RT groups:
+ * List of groups that need a zone reset.
+ *
+ * The zonegc code forces a log flush of the rtrmap inode before
+ * resetting the write pointer, so there is no need for
+ * individual busy extent tracking.
+ */
+ struct xfs_group *xg_next_reset;
+ };
+
+ /*
+ * Bitsets of per-ag metadata that have been checked and/or are sick.
+ * Callers should hold xg_state_lock before accessing this field.
+ */
+ uint16_t xg_checked;
+ uint16_t xg_sick;
+ spinlock_t xg_state_lock;
+
+ /*
+ * We use xfs_drain to track the number of deferred log intent items
+ * that have been queued (but not yet processed) so that waiters (e.g.
+ * scrub) will not lock resources when other threads are in the middle
+ * of processing a chain of intent items only to find momentary
+ * inconsistencies.
+ */
+ struct xfs_defer_drain xg_intents_drain;
+
+ /*
+ * Hook to feed rmapbt updates to an active online repair.
+ */
+ struct xfs_hooks xg_rmap_update_hooks;
+#endif /* __KERNEL__ */
+};
+
+struct xfs_group *xfs_group_get(struct xfs_mount *mp, uint32_t index,
+ enum xfs_group_type type);
+struct xfs_group *xfs_group_get_by_fsb(struct xfs_mount *mp,
+ xfs_fsblock_t fsbno, enum xfs_group_type type);
+struct xfs_group *xfs_group_hold(struct xfs_group *xg);
+void xfs_group_put(struct xfs_group *xg);
+
+struct xfs_group *xfs_group_grab(struct xfs_mount *mp, uint32_t index,
+ enum xfs_group_type type);
+struct xfs_group *xfs_group_next_range(struct xfs_mount *mp,
+ struct xfs_group *xg, uint32_t start_index, uint32_t end_index,
+ enum xfs_group_type type);
+struct xfs_group *xfs_group_grab_next_mark(struct xfs_mount *mp,
+ struct xfs_group *xg, xa_mark_t mark, enum xfs_group_type type);
+void xfs_group_rele(struct xfs_group *xg);
+
+void xfs_group_free(struct xfs_mount *mp, uint32_t index,
+ enum xfs_group_type type, void (*uninit)(struct xfs_group *xg));
+int xfs_group_insert(struct xfs_mount *mp, struct xfs_group *xg,
+ uint32_t index, enum xfs_group_type);
+
+#define xfs_group_set_mark(_xg, _mark) \
+ xa_set_mark(&(_xg)->xg_mount->m_groups[(_xg)->xg_type].xa, \
+ (_xg)->xg_gno, (_mark))
+#define xfs_group_clear_mark(_xg, _mark) \
+ xa_clear_mark(&(_xg)->xg_mount->m_groups[(_xg)->xg_type].xa, \
+ (_xg)->xg_gno, (_mark))
+#define xfs_group_marked(_mp, _type, _mark) \
+ xa_marked(&(_mp)->m_groups[(_type)].xa, (_mark))
+
+static inline xfs_agblock_t
+xfs_group_max_blocks(
+ struct xfs_group *xg)
+{
+ return xg->xg_mount->m_groups[xg->xg_type].blocks;
+}
+
+static inline xfs_fsblock_t
+xfs_group_start_fsb(
+ struct xfs_group *xg)
+{
+ return ((xfs_fsblock_t)xg->xg_gno) <<
+ xg->xg_mount->m_groups[xg->xg_type].blklog;
+}
+
+static inline xfs_fsblock_t
+xfs_gbno_to_fsb(
+ struct xfs_group *xg,
+ xfs_agblock_t gbno)
+{
+ return xfs_group_start_fsb(xg) | gbno;
+}
+
+static inline xfs_daddr_t
+xfs_gbno_to_daddr(
+ struct xfs_group *xg,
+ xfs_agblock_t gbno)
+{
+ struct xfs_mount *mp = xg->xg_mount;
+ struct xfs_groups *g = &mp->m_groups[xg->xg_type];
+ xfs_fsblock_t fsbno;
+
+ if (g->has_daddr_gaps)
+ fsbno = xfs_gbno_to_fsb(xg, gbno);
+ else
+ fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno;
+
+ return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno);
+}
+
+static inline uint32_t
+xfs_fsb_to_gno(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno,
+ enum xfs_group_type type)
+{
+ if (!mp->m_groups[type].blklog)
+ return 0;
+ return fsbno >> mp->m_groups[type].blklog;
+}
+
+static inline xfs_agblock_t
+xfs_fsb_to_gbno(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno,
+ enum xfs_group_type type)
+{
+ return fsbno & mp->m_groups[type].blkmask;
+}
+
+static inline bool
+xfs_verify_gbno(
+ struct xfs_group *xg,
+ uint32_t gbno)
+{
+ if (gbno >= xg->xg_block_count)
+ return false;
+ if (gbno < xg->xg_min_gbno)
+ return false;
+ return true;
+}
+
+static inline bool
+xfs_verify_gbext(
+ struct xfs_group *xg,
+ uint32_t gbno,
+ uint32_t glen)
+{
+ uint32_t end;
+
+ if (!xfs_verify_gbno(xg, gbno))
+ return false;
+ if (glen == 0 || check_add_overflow(gbno, glen - 1, &end))
+ return false;
+ if (!xfs_verify_gbno(xg, end))
+ return false;
+ return true;
+}
+
+#endif /* __LIBXFS_GROUP_H */
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index b0edb4288e59..b31000f7190c 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -6,6 +6,8 @@
#ifndef __XFS_HEALTH_H__
#define __XFS_HEALTH_H__
+struct xfs_group;
+
/*
* In-Core Filesystem Health Assessments
* =====================================
@@ -52,6 +54,7 @@ struct xfs_inode;
struct xfs_fsop_geom;
struct xfs_btree_cur;
struct xfs_da_args;
+struct xfs_rtgroup;
/* Observable health issues for metadata spanning the entire filesystem. */
#define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */
@@ -60,10 +63,15 @@ struct xfs_da_args;
#define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */
#define XFS_SICK_FS_QUOTACHECK (1 << 4) /* quota counts */
#define XFS_SICK_FS_NLINKS (1 << 5) /* inode link counts */
+#define XFS_SICK_FS_METADIR (1 << 6) /* metadata directory tree */
+#define XFS_SICK_FS_METAPATH (1 << 7) /* metadata directory tree path */
-/* Observable health issues for realtime volume metadata. */
-#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */
-#define XFS_SICK_RT_SUMMARY (1 << 1) /* realtime summary */
+/* Observable health issues for realtime group metadata. */
+#define XFS_SICK_RG_SUPER (1 << 0) /* rt group superblock */
+#define XFS_SICK_RG_BITMAP (1 << 1) /* rt group bitmap */
+#define XFS_SICK_RG_SUMMARY (1 << 2) /* rt groups summary */
+#define XFS_SICK_RG_RMAPBT (1 << 3) /* reverse mappings */
+#define XFS_SICK_RG_REFCNTBT (1 << 4) /* reference counts */
/* Observable health issues for AG metadata. */
#define XFS_SICK_AG_SB (1 << 0) /* superblock */
@@ -103,10 +111,15 @@ struct xfs_da_args;
XFS_SICK_FS_GQUOTA | \
XFS_SICK_FS_PQUOTA | \
XFS_SICK_FS_QUOTACHECK | \
- XFS_SICK_FS_NLINKS)
+ XFS_SICK_FS_NLINKS | \
+ XFS_SICK_FS_METADIR | \
+ XFS_SICK_FS_METAPATH)
-#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \
- XFS_SICK_RT_SUMMARY)
+#define XFS_SICK_RG_PRIMARY (XFS_SICK_RG_SUPER | \
+ XFS_SICK_RG_BITMAP | \
+ XFS_SICK_RG_SUMMARY | \
+ XFS_SICK_RG_RMAPBT | \
+ XFS_SICK_RG_REFCNTBT)
#define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \
XFS_SICK_AG_AGF | \
@@ -136,26 +149,26 @@ struct xfs_da_args;
/* Secondary state related to (but not primary evidence of) health problems. */
#define XFS_SICK_FS_SECONDARY (0)
-#define XFS_SICK_RT_SECONDARY (0)
+#define XFS_SICK_RG_SECONDARY (0)
#define XFS_SICK_AG_SECONDARY (0)
#define XFS_SICK_INO_SECONDARY (XFS_SICK_INO_FORGET)
/* Evidence of health problems elsewhere. */
#define XFS_SICK_FS_INDIRECT (0)
-#define XFS_SICK_RT_INDIRECT (0)
+#define XFS_SICK_RG_INDIRECT (0)
#define XFS_SICK_AG_INDIRECT (XFS_SICK_AG_INODES)
#define XFS_SICK_INO_INDIRECT (0)
/* All health masks. */
-#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \
+#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \
XFS_SICK_FS_SECONDARY | \
XFS_SICK_FS_INDIRECT)
-#define XFS_SICK_RT_ALL (XFS_SICK_RT_PRIMARY | \
- XFS_SICK_RT_SECONDARY | \
- XFS_SICK_RT_INDIRECT)
+#define XFS_SICK_RG_ALL (XFS_SICK_RG_PRIMARY | \
+ XFS_SICK_RG_SECONDARY | \
+ XFS_SICK_RG_INDIRECT)
-#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \
+#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \
XFS_SICK_AG_SECONDARY | \
XFS_SICK_AG_INDIRECT)
@@ -189,18 +202,17 @@ void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask);
void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
unsigned int *checked);
-void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask);
-void xfs_rt_mark_corrupt(struct xfs_mount *mp, unsigned int mask);
-void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
-void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
- unsigned int *checked);
+void xfs_rgno_mark_sick(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+ unsigned int mask);
void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno,
unsigned int mask);
-void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
-void xfs_ag_mark_corrupt(struct xfs_perag *pag, unsigned int mask);
-void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask);
-void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick,
+void xfs_group_mark_sick(struct xfs_group *xg, unsigned int mask);
+#define xfs_ag_mark_sick(pag, mask) \
+ xfs_group_mark_sick(pag_group(pag), (mask))
+void xfs_group_mark_corrupt(struct xfs_group *xg, unsigned int mask);
+void xfs_group_mark_healthy(struct xfs_group *xg, unsigned int mask);
+void xfs_group_measure_sickness(struct xfs_group *xg, unsigned int *sick,
unsigned int *checked);
void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask);
@@ -227,22 +239,25 @@ xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask)
}
static inline bool
-xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask)
+xfs_group_has_sickness(
+ struct xfs_group *xg,
+ unsigned int mask)
{
- unsigned int sick, checked;
+ unsigned int sick, checked;
- xfs_rt_measure_sickness(mp, &sick, &checked);
+ xfs_group_measure_sickness(xg, &sick, &checked);
return sick & mask;
}
-static inline bool
-xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask)
-{
- unsigned int sick, checked;
+#define xfs_ag_has_sickness(pag, mask) \
+ xfs_group_has_sickness(pag_group(pag), (mask))
+#define xfs_ag_is_healthy(pag) \
+ (!xfs_ag_has_sickness((pag), UINT_MAX))
- xfs_ag_measure_sickness(pag, &sick, &checked);
- return sick & mask;
-}
+#define xfs_rtgroup_has_sickness(rtg, mask) \
+ xfs_group_has_sickness(rtg_group(rtg), (mask))
+#define xfs_rtgroup_is_healthy(rtg) \
+ (!xfs_rtgroup_has_sickness((rtg), UINT_MAX))
static inline bool
xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask)
@@ -260,18 +275,6 @@ xfs_fs_is_healthy(struct xfs_mount *mp)
}
static inline bool
-xfs_rt_is_healthy(struct xfs_mount *mp)
-{
- return !xfs_rt_has_sickness(mp, -1U);
-}
-
-static inline bool
-xfs_ag_is_healthy(struct xfs_perag *pag)
-{
- return !xfs_ag_has_sickness(pag, -1U);
-}
-
-static inline bool
xfs_inode_is_healthy(struct xfs_inode *ip)
{
return !xfs_inode_has_sickness(ip, -1U);
@@ -279,6 +282,8 @@ xfs_inode_is_healthy(struct xfs_inode *ip)
void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo);
void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
+void xfs_rtgroup_geom_health(struct xfs_rtgroup *rtg,
+ struct xfs_rtgroup_geometry *rgeo);
void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
#define xfs_metadata_is_sick(error) \
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 271855227514..0c47b5c6ca7d 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -142,7 +142,7 @@ xfs_inobt_complain_bad_rec(
xfs_warn(mp,
"%sbt record corruption in AG %d detected at %pS!",
- cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
+ cur->bc_ops->name, cur->bc_group->xg_gno, fa);
xfs_warn(mp,
"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
irec->ir_startino, irec->ir_count, irec->ir_freecount,
@@ -170,7 +170,7 @@ xfs_inobt_get_rec(
return error;
xfs_inobt_btrec_to_irec(mp, rec, irec);
- fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec);
+ fa = xfs_inobt_check_irec(to_perag(cur->bc_group), irec);
if (fa)
return xfs_inobt_complain_bad_rec(cur, fa, irec);
@@ -275,8 +275,10 @@ xfs_check_agi_freecount(
}
} while (i == 1);
- if (!xfs_is_shutdown(cur->bc_mp))
- ASSERT(freecount == cur->bc_ag.pag->pagi_freecount);
+ if (!xfs_is_shutdown(cur->bc_mp)) {
+ ASSERT(freecount ==
+ to_perag(cur->bc_group)->pagi_freecount);
+ }
}
return 0;
}
@@ -362,7 +364,7 @@ xfs_ialloc_inode_init(
(j * M_IGEO(mp)->blocks_per_cluster));
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
- XBF_UNMAPPED, &fbuf);
+ 0, &fbuf);
if (error)
return error;
@@ -551,7 +553,7 @@ xfs_inobt_insert_sprec(
struct xfs_buf *agbp,
struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_btree_cur *cur;
int error;
int i;
@@ -606,15 +608,12 @@ xfs_inobt_insert_sprec(
goto error;
}
- trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
- rec.ir_holemask, nrec->ir_startino,
- nrec->ir_holemask);
+ trace_xfs_irec_merge_pre(pag, &rec, nrec);
/* merge to nrec to output the updated record */
__xfs_inobt_rec_merge(nrec, &rec);
- trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
- nrec->ir_holemask);
+ trace_xfs_irec_merge_post(pag, nrec);
error = xfs_inobt_rec_check_count(mp, nrec);
if (error)
@@ -648,7 +647,7 @@ xfs_finobt_insert_sprec(
struct xfs_buf *agbp,
struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. */
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_btree_cur *cur;
int error;
int i;
@@ -768,8 +767,7 @@ xfs_ialloc_ag_alloc(
/* Allow space for the inode btree to split. */
args.minleft = igeo->inobt_maxlevels;
error = xfs_alloc_vextent_exact_bno(&args,
- XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
- args.agbno));
+ xfs_agbno_to_fsb(pag, args.agbno));
if (error)
return error;
@@ -811,8 +809,8 @@ xfs_ialloc_ag_alloc(
*/
args.minleft = igeo->inobt_maxlevels;
error = xfs_alloc_vextent_near_bno(&args,
- XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
- be32_to_cpu(agi->agi_root)));
+ xfs_agbno_to_fsb(pag,
+ be32_to_cpu(agi->agi_root)));
if (error)
return error;
}
@@ -824,8 +822,8 @@ xfs_ialloc_ag_alloc(
if (isaligned && args.fsbno == NULLFSBLOCK) {
args.alignment = igeo->cluster_align;
error = xfs_alloc_vextent_near_bno(&args,
- XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
- be32_to_cpu(agi->agi_root)));
+ xfs_agbno_to_fsb(pag,
+ be32_to_cpu(agi->agi_root)));
if (error)
return error;
}
@@ -855,13 +853,14 @@ sparse_alloc:
* the end of the AG.
*/
args.min_agbno = args.mp->m_sb.sb_inoalignmt;
- args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+ args.max_agbno = round_down(xfs_ag_block_count(args.mp,
+ pag_agno(pag)),
args.mp->m_sb.sb_inoalignmt) -
igeo->ialloc_blks;
error = xfs_alloc_vextent_near_bno(&args,
- XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
- be32_to_cpu(agi->agi_root)));
+ xfs_agbno_to_fsb(pag,
+ be32_to_cpu(agi->agi_root)));
if (error)
return error;
@@ -884,7 +883,7 @@ sparse_alloc:
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno,
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag_agno(pag),
args.agbno, args.len, get_random_u32());
if (error)
@@ -915,8 +914,7 @@ sparse_alloc:
if (error == -EFSCORRUPTED) {
xfs_alert(args.mp,
"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
- XFS_AGINO_TO_INO(args.mp, pag->pag_agno,
- rec.ir_startino),
+ xfs_agino_to_ino(pag, rec.ir_startino),
rec.ir_holemask, rec.ir_count);
xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
}
@@ -1076,7 +1074,7 @@ xfs_dialloc_check_ino(
if (error)
return -EAGAIN;
- error = xfs_imap_to_bp(pag->pag_mount, tp, &imap, &bp);
+ error = xfs_imap_to_bp(pag_mount(pag), tp, &imap, &bp);
if (error)
return -EAGAIN;
@@ -1127,7 +1125,7 @@ xfs_dialloc_ag_inobt(
/*
* If in the same AG as the parent, try to get near the parent.
*/
- if (pagno == pag->pag_agno) {
+ if (pagno == pag_agno(pag)) {
int doneleft; /* done, to the left */
int doneright; /* done, to the right */
@@ -1335,7 +1333,7 @@ alloc_inode:
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
- ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
+ ino = xfs_agino_to_ino(pag, rec.ir_startino + offset);
if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
error = xfs_dialloc_check_ino(pag, tp, ino);
@@ -1604,7 +1602,7 @@ xfs_dialloc_ag(
* parent. If so, find the closest available inode to the parent. If
* not, consider the agi hint or find the first free inode in the AG.
*/
- if (pag->pag_agno == pagno)
+ if (pag_agno(pag) == pagno)
error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
else
error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
@@ -1616,7 +1614,7 @@ xfs_dialloc_ag(
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
- ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
+ ino = xfs_agino_to_ino(pag, rec.ir_startino + offset);
if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
error = xfs_dialloc_check_ino(pag, tp, ino);
@@ -1845,6 +1843,40 @@ out_release:
}
/*
+ * Pick an AG for the new inode.
+ *
+ * Directories, symlinks, and regular files frequently allocate at least one
+ * block, so factor that potential expansion when we examine whether an AG has
+ * enough space for file creation. Try to keep metadata files all in the same
+ * AG.
+ */
+static inline xfs_agnumber_t
+xfs_dialloc_pick_ag(
+ struct xfs_mount *mp,
+ struct xfs_inode *dp,
+ umode_t mode)
+{
+ xfs_agnumber_t start_agno;
+
+ if (!dp)
+ return 0;
+ if (xfs_is_metadir_inode(dp)) {
+ if (mp->m_sb.sb_logstart)
+ return XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
+ return 0;
+ }
+
+ if (S_ISDIR(mode))
+ return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi;
+
+ start_agno = XFS_INO_TO_AGNO(mp, dp->i_ino);
+ if (start_agno >= mp->m_maxagi)
+ start_agno = 0;
+
+ return start_agno;
+}
+
+/*
* Allocate an on-disk inode.
*
* Mode is used to tell whether the new inode is a directory and hence where to
@@ -1859,31 +1891,19 @@ xfs_dialloc(
xfs_ino_t *new_ino)
{
struct xfs_mount *mp = (*tpp)->t_mountp;
+ struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_ino_t ino = NULLFSINO;
xfs_ino_t parent = args->pip ? args->pip->i_ino : 0;
- umode_t mode = args->mode & S_IFMT;
xfs_agnumber_t agno;
- int error = 0;
xfs_agnumber_t start_agno;
- struct xfs_perag *pag;
- struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ umode_t mode = args->mode & S_IFMT;
bool ok_alloc = true;
bool low_space = false;
int flags;
- xfs_ino_t ino = NULLFSINO;
+ int error = 0;
- /*
- * Directories, symlinks, and regular files frequently allocate at least
- * one block, so factor that potential expansion when we examine whether
- * an AG has enough space for file creation.
- */
- if (S_ISDIR(mode))
- start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) %
- mp->m_maxagi;
- else {
- start_agno = XFS_INO_TO_AGNO(mp, parent);
- if (start_agno >= mp->m_maxagi)
- start_agno = 0;
- }
+ start_agno = xfs_dialloc_pick_ag(mp, args->pip, mode);
/*
* If we have already hit the ceiling of inode blocks then clear
@@ -1907,7 +1927,7 @@ xfs_dialloc(
* that we can immediately allocate, but then we allow allocation on the
* second pass if we fail to find an AG with free inodes in it.
*/
- if (percpu_counter_read_positive(&mp->m_fdblocks) <
+ if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) <
mp->m_low_space[XFS_LOWSP_1_PCNT]) {
ok_alloc = false;
low_space = true;
@@ -1974,7 +1994,7 @@ retry:
static int
xfs_difree_inode_chunk(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
struct xfs_inobt_rec_incore *rec)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -1988,8 +2008,7 @@ xfs_difree_inode_chunk(
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
- return xfs_free_extent_later(tp,
- XFS_AGB_TO_FSB(mp, agno, sagbno),
+ return xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, sagbno),
M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
XFS_AG_RESV_NONE, 0);
}
@@ -2035,9 +2054,9 @@ xfs_difree_inode_chunk(
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
- error = xfs_free_extent_later(tp,
- XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
- &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0);
+ error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, agbno),
+ contigblk, &XFS_RMAP_OINFO_INODES,
+ XFS_AG_RESV_NONE, 0);
if (error)
return error;
@@ -2059,7 +2078,7 @@ xfs_difree_inobt(
struct xfs_icluster *xic,
struct xfs_inobt_rec_incore *orec)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_agi *agi = agbp->b_addr;
struct xfs_btree_cur *cur;
struct xfs_inobt_rec_incore rec;
@@ -2124,8 +2143,7 @@ xfs_difree_inobt(
if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
xic->deleted = true;
- xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
- rec.ir_startino);
+ xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino);
xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
/*
@@ -2148,7 +2166,7 @@ xfs_difree_inobt(
goto error0;
}
- error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
+ error = xfs_difree_inode_chunk(tp, pag, &rec);
if (error)
goto error0;
} else {
@@ -2194,7 +2212,7 @@ xfs_difree_finobt(
xfs_agino_t agino,
struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_btree_cur *cur;
struct xfs_inobt_rec_incore rec;
int offset = agino - ibtrec->ir_startino;
@@ -2317,24 +2335,24 @@ xfs_difree(
/*
* Break up inode number into its components.
*/
- if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) {
- xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).",
- __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno);
+ if (pag_agno(pag) != XFS_INO_TO_AGNO(mp, inode)) {
+ xfs_warn(mp, "%s: agno != pag_agno(pag) (%d != %d).",
+ __func__, XFS_INO_TO_AGNO(mp, inode), pag_agno(pag));
ASSERT(0);
return -EINVAL;
}
agino = XFS_INO_TO_AGINO(mp, inode);
- if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
- xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+ if (inode != xfs_agino_to_ino(pag, agino)) {
+ xfs_warn(mp, "%s: inode != xfs_agino_to_ino() (%llu != %llu).",
__func__, (unsigned long long)inode,
- (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
+ (unsigned long long)xfs_agino_to_ino(pag, agino));
ASSERT(0);
return -EINVAL;
}
agbno = XFS_AGINO_TO_AGBNO(mp, agino);
- if (agbno >= mp->m_sb.sb_agblocks) {
- xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
- __func__, agbno, mp->m_sb.sb_agblocks);
+ if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) {
+ xfs_warn(mp, "%s: agbno >= xfs_ag_block_count (%d >= %d).",
+ __func__, agbno, xfs_ag_block_count(mp, pag_agno(pag)));
ASSERT(0);
return -EINVAL;
}
@@ -2380,7 +2398,7 @@ xfs_imap_lookup(
xfs_agblock_t *offset_agbno,
int flags)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_inobt_rec_incore rec;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
@@ -2391,7 +2409,7 @@ xfs_imap_lookup(
if (error) {
xfs_alert(mp,
"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
- __func__, error, pag->pag_agno);
+ __func__, error, pag_agno(pag));
return error;
}
@@ -2441,7 +2459,7 @@ xfs_imap(
struct xfs_imap *imap, /* location map structure */
uint flags) /* flags for inode btree lookup */
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
xfs_agblock_t agbno; /* block number of inode in the alloc group */
xfs_agino_t agino; /* inode number within alloc group */
xfs_agblock_t chunk_agbno; /* first block in inode chunk */
@@ -2457,8 +2475,8 @@ xfs_imap(
*/
agino = XFS_INO_TO_AGINO(mp, ino);
agbno = XFS_AGINO_TO_AGBNO(mp, agino);
- if (agbno >= mp->m_sb.sb_agblocks ||
- ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
+ if (agbno >= xfs_ag_block_count(mp, pag_agno(pag)) ||
+ ino != xfs_agino_to_ino(pag, agino)) {
error = -EINVAL;
#ifdef DEBUG
/*
@@ -2467,17 +2485,18 @@ xfs_imap(
*/
if (flags & XFS_IGET_UNTRUSTED)
return error;
- if (agbno >= mp->m_sb.sb_agblocks) {
+ if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) {
xfs_alert(mp,
"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
__func__, (unsigned long long)agbno,
- (unsigned long)mp->m_sb.sb_agblocks);
+ (unsigned long)xfs_ag_block_count(mp,
+ pag_agno(pag)));
}
- if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
+ if (ino != xfs_agino_to_ino(pag, agino)) {
xfs_alert(mp,
- "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
+ "%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)",
__func__, ino,
- XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
+ xfs_agino_to_ino(pag, agino));
}
xfs_stack_trace();
#endif /* DEBUG */
@@ -2507,7 +2526,7 @@ xfs_imap(
offset = XFS_INO_TO_OFFSET(mp, ino);
ASSERT(offset < mp->m_sb.sb_inopblock);
- imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno);
+ imap->im_blkno = xfs_agbno_to_daddr(pag, agbno);
imap->im_len = XFS_FSB_TO_BB(mp, 1);
imap->im_boffset = (unsigned short)(offset <<
mp->m_sb.sb_inodelog);
@@ -2537,7 +2556,7 @@ out_map:
offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
XFS_INO_TO_OFFSET(mp, ino);
- imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno);
+ imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno);
imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
@@ -2733,13 +2752,13 @@ xfs_read_agi(
xfs_buf_flags_t flags,
struct xfs_buf **agibpp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
int error;
- trace_xfs_read_agi(pag->pag_mount, pag->pag_agno);
+ trace_xfs_read_agi(pag);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
+ XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
@@ -2767,7 +2786,7 @@ xfs_ialloc_read_agi(
struct xfs_agi *agi;
int error;
- trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno);
+ trace_xfs_ialloc_read_agi(pag);
error = xfs_read_agi(pag, tp,
(flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
@@ -2787,7 +2806,7 @@ xfs_ialloc_read_agi(
* we are in the middle of a forced shutdown.
*/
ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
- xfs_is_shutdown(pag->pag_mount));
+ xfs_is_shutdown(pag_mount(pag)));
if (agibpp)
*agibpp = agibp;
else
@@ -2887,7 +2906,7 @@ xfs_ialloc_count_inodes_rec(
xfs_failaddr_t fa;
xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
- fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec);
+ fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec);
if (fa)
return xfs_inobt_complain_bad_rec(cur, fa, &irec);
@@ -3126,13 +3145,13 @@ xfs_ialloc_check_shrink(
int has;
int error;
- if (!xfs_has_sparseinodes(pag->pag_mount))
+ if (!xfs_has_sparseinodes(pag_mount(pag)))
return 0;
cur = xfs_inobt_init_cursor(pag, tp, agibp);
/* Look up the inobt record that would correspond to the new EOFS. */
- agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length);
+ agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
if (error || !has)
goto out;
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 401b42d52af6..6f270d8f4270 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -37,7 +37,7 @@ STATIC struct xfs_btree_cur *
xfs_inobt_dup_cursor(
struct xfs_btree_cur *cur)
{
- return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
+ return xfs_inobt_init_cursor(to_perag(cur->bc_group), cur->bc_tp,
cur->bc_ag.agbp);
}
@@ -45,7 +45,7 @@ STATIC struct xfs_btree_cur *
xfs_finobt_dup_cursor(
struct xfs_btree_cur *cur)
{
- return xfs_finobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
+ return xfs_finobt_init_cursor(to_perag(cur->bc_group), cur->bc_tp,
cur->bc_ag.agbp);
}
@@ -112,7 +112,7 @@ __xfs_inobt_alloc_block(
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
- args.pag = cur->bc_ag.pag;
+ args.pag = to_perag(cur->bc_group);
args.oinfo = XFS_RMAP_OINFO_INOBT;
args.minlen = 1;
args.maxlen = 1;
@@ -120,7 +120,7 @@ __xfs_inobt_alloc_block(
args.resv = resv;
error = xfs_alloc_vextent_near_bno(&args,
- XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, sbno));
+ xfs_agbno_to_fsb(args.pag, sbno));
if (error)
return error;
@@ -248,7 +248,7 @@ xfs_inobt_init_ptr_from_cur(
{
struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));
+ ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agi->agi_seqno));
ptr->s = agi->agi_root;
}
@@ -260,7 +260,8 @@ xfs_finobt_init_ptr_from_cur(
{
struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));
+ ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agi->agi_seqno));
+
ptr->s = agi->agi_free_root;
}
@@ -478,12 +479,12 @@ xfs_inobt_init_cursor(
struct xfs_trans *tp,
struct xfs_buf *agbp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_btree_cur *cur;
cur = xfs_btree_alloc_cursor(mp, tp, &xfs_inobt_ops,
M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
- cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
cur->bc_ag.agbp = agbp;
if (agbp) {
struct xfs_agi *agi = agbp->b_addr;
@@ -504,12 +505,12 @@ xfs_finobt_init_cursor(
struct xfs_trans *tp,
struct xfs_buf *agbp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_btree_cur *cur;
cur = xfs_btree_alloc_cursor(mp, tp, &xfs_finobt_ops,
M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
- cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
cur->bc_ag.agbp = agbp;
if (agbp) {
struct xfs_agi *agi = agbp->b_addr;
@@ -715,8 +716,8 @@ static xfs_extlen_t
xfs_inobt_max_size(
struct xfs_perag *pag)
{
- struct xfs_mount *mp = pag->pag_mount;
- xfs_agblock_t agblocks = pag->block_count;
+ struct xfs_mount *mp = pag_mount(pag);
+ xfs_agblock_t agblocks = pag_group(pag)->xg_block_count;
/* Bail out if we're uninitialized, which can happen in mkfs. */
if (M_IGEO(mp)->inobt_mxr[0] == 0)
@@ -727,7 +728,7 @@ xfs_inobt_max_size(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (xfs_ag_contains_log(mp, pag->pag_agno))
+ if (xfs_ag_contains_log(mp, pag_agno(pag)))
agblocks -= mp->m_sb.sb_logblocks;
return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr,
@@ -743,6 +744,7 @@ xfs_finobt_count_blocks(
{
struct xfs_buf *agbp = NULL;
struct xfs_btree_cur *cur;
+ xfs_filblks_t blocks;
int error;
error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
@@ -750,9 +752,10 @@ xfs_finobt_count_blocks(
return error;
cur = xfs_finobt_init_cursor(pag, tp, agbp);
- error = xfs_btree_count_blocks(cur, tree_blocks);
+ error = xfs_btree_count_blocks(cur, &blocks);
xfs_btree_del_cursor(cur, error);
xfs_trans_brelse(tp, agbp);
+ *tree_blocks = blocks;
return error;
}
@@ -791,10 +794,10 @@ xfs_finobt_calc_reserves(
xfs_extlen_t tree_len = 0;
int error;
- if (!xfs_has_finobt(pag->pag_mount))
+ if (!xfs_has_finobt(pag_mount(pag)))
return 0;
- if (xfs_has_inobtcounts(pag->pag_mount))
+ if (xfs_has_inobtcounts(pag_mount(pag)))
error = xfs_finobt_read_blocks(pag, tp, &tree_len);
else
error = xfs_finobt_count_blocks(pag, tp, &tree_len);
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 79babeac9d75..aa13fc00afd7 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -19,6 +19,7 @@
#include "xfs_ialloc.h"
#include "xfs_dir2.h"
#include "xfs_health.h"
+#include "xfs_metafile.h"
#include <linux/iversion.h>
@@ -136,7 +137,7 @@ xfs_imap_to_bp(
int error;
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
- imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops);
+ imap->im_len, 0, bpp, &xfs_inode_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno),
XFS_SICK_AG_INODES);
@@ -209,12 +210,15 @@ xfs_inode_from_disk(
* They will also be unconditionally written back to disk as v2 inodes.
*/
if (unlikely(from->di_version == 1)) {
- set_nlink(inode, be16_to_cpu(from->di_onlink));
+ /* di_metatype used to be di_onlink */
+ set_nlink(inode, be16_to_cpu(from->di_metatype));
ip->i_projid = 0;
} else {
set_nlink(inode, be32_to_cpu(from->di_nlink));
ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 |
be16_to_cpu(from->di_projid_lo);
+ if (xfs_dinode_is_metadir(from))
+ ip->i_metatype = be16_to_cpu(from->di_metatype);
}
i_uid_write(inode, be32_to_cpu(from->di_uid));
@@ -248,7 +252,10 @@ xfs_inode_from_disk(
be64_to_cpu(from->di_changecount));
ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
ip->i_diflags2 = be64_to_cpu(from->di_flags2);
+ /* also covers the di_used_blocks union arm: */
ip->i_cowextsize = be32_to_cpu(from->di_cowextsize);
+ BUILD_BUG_ON(sizeof(from->di_cowextsize) !=
+ sizeof(from->di_used_blocks));
}
error = xfs_iformat_data_fork(ip, from);
@@ -315,7 +322,10 @@ xfs_inode_to_disk(
struct inode *inode = VFS_I(ip);
to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
- to->di_onlink = 0;
+ if (xfs_is_metadir_inode(ip))
+ to->di_metatype = cpu_to_be16(ip->i_metatype);
+ else
+ to->di_metatype = 0;
to->di_format = xfs_ifork_format(&ip->i_df);
to->di_uid = cpu_to_be32(i_uid_read(inode));
@@ -342,6 +352,7 @@ xfs_inode_to_disk(
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime);
to->di_flags2 = cpu_to_be64(ip->i_diflags2);
+ /* also covers the di_used_blocks union arm: */
to->di_cowextsize = cpu_to_be32(ip->i_cowextsize);
to->di_ino = cpu_to_be64(ip->i_ino);
to->di_lsn = cpu_to_be64(lsn);
@@ -434,6 +445,30 @@ xfs_dinode_verify_fork(
if (di_nextents > max_extents)
return __this_address;
break;
+ case XFS_DINODE_FMT_META_BTREE:
+ if (!xfs_has_metadir(mp))
+ return __this_address;
+ if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA)))
+ return __this_address;
+ switch (be16_to_cpu(dip->di_metatype)) {
+ case XFS_METAFILE_RTRMAP:
+ /*
+ * growfs must create the rtrmap inodes before adding a
+ * realtime volume to the filesystem, so we cannot use
+ * the rtrmapbt predicate here.
+ */
+ if (!xfs_has_rmapbt(mp))
+ return __this_address;
+ break;
+ case XFS_METAFILE_RTREFCOUNT:
+ /* same comment about growfs and rmap inodes applies */
+ if (!xfs_has_reflink(mp))
+ return __this_address;
+ break;
+ default:
+ return __this_address;
+ }
+ break;
default:
return __this_address;
}
@@ -453,6 +488,10 @@ xfs_dinode_verify_forkoff(
if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3))
return __this_address;
break;
+ case XFS_DINODE_FMT_META_BTREE:
+ if (!xfs_has_metadir(mp) || !xfs_has_parent(mp))
+ return __this_address;
+ fallthrough;
case XFS_DINODE_FMT_LOCAL: /* fall through ... */
case XFS_DINODE_FMT_EXTENTS: /* fall through ... */
case XFS_DINODE_FMT_BTREE:
@@ -483,6 +522,69 @@ xfs_dinode_verify_nrext64(
return NULL;
}
+/*
+ * Validate all the picky requirements we have for a file that claims to be
+ * filesystem metadata.
+ */
+xfs_failaddr_t
+xfs_dinode_verify_metadir(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip,
+ uint16_t mode,
+ uint16_t flags,
+ uint64_t flags2)
+{
+ if (!xfs_has_metadir(mp))
+ return __this_address;
+
+ /* V5 filesystem only */
+ if (dip->di_version < 3)
+ return __this_address;
+
+ if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX)
+ return __this_address;
+
+ /* V3 inode fields that are always zero */
+ if ((flags2 & XFS_DIFLAG2_NREXT64) && dip->di_nrext64_pad)
+ return __this_address;
+ if (!(flags2 & XFS_DIFLAG2_NREXT64) && dip->di_flushiter)
+ return __this_address;
+
+ /* Metadata files can only be directories or regular files */
+ if (!S_ISDIR(mode) && !S_ISREG(mode))
+ return __this_address;
+
+ /* They must have zero access permissions */
+ if (mode & 0777)
+ return __this_address;
+
+ /* DMAPI event and state masks are zero */
+ if (dip->di_dmevmask || dip->di_dmstate)
+ return __this_address;
+
+ /*
+ * User and group IDs must be zero. The project ID is used for
+ * grouping inodes. Metadata inodes are never accounted to quotas.
+ */
+ if (dip->di_uid || dip->di_gid)
+ return __this_address;
+
+ /* Mandatory inode flags must be set */
+ if (S_ISDIR(mode)) {
+ if ((flags & XFS_METADIR_DIFLAGS) != XFS_METADIR_DIFLAGS)
+ return __this_address;
+ } else {
+ if ((flags & XFS_METAFILE_DIFLAGS) != XFS_METAFILE_DIFLAGS)
+ return __this_address;
+ }
+
+ /* dax flags2 must not be set */
+ if (flags2 & XFS_DIFLAG2_DAX)
+ return __this_address;
+
+ return NULL;
+}
+
xfs_failaddr_t
xfs_dinode_verify(
struct xfs_mount *mp,
@@ -523,8 +625,11 @@ xfs_dinode_verify(
* di_nlink==0 on a V1 inode. V2/3 inodes would get written out with
* di_onlink==0, so we can check that.
*/
- if (dip->di_version >= 2) {
- if (dip->di_onlink)
+ if (dip->di_version == 2) {
+ if (dip->di_metatype)
+ return __this_address;
+ } else if (dip->di_version >= 3) {
+ if (!xfs_dinode_is_metadir(dip) && dip->di_metatype)
return __this_address;
}
@@ -546,7 +651,8 @@ xfs_dinode_verify(
if (dip->di_nlink)
return __this_address;
} else {
- if (dip->di_onlink)
+ /* di_metatype used to be di_onlink */
+ if (dip->di_metatype)
return __this_address;
}
}
@@ -563,9 +669,6 @@ xfs_dinode_verify(
if (mode && nextents + naextents > nblocks)
return __this_address;
- if (nextents + naextents == 0 && nblocks != 0)
- return __this_address;
-
if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
return __this_address;
@@ -649,20 +752,40 @@ xfs_dinode_verify(
return __this_address;
/* don't let reflink and realtime mix */
- if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME))
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME) &&
+ !xfs_has_rtreflink(mp))
return __this_address;
- /* COW extent size hint validation */
- fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
- mode, flags, flags2);
- if (fa)
- return fa;
+ if (xfs_has_zoned(mp) &&
+ dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) {
+ if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents)
+ return __this_address;
+ } else {
+ /* COW extent size hint validation */
+ fa = xfs_inode_validate_cowextsize(mp,
+ be32_to_cpu(dip->di_cowextsize),
+ mode, flags, flags2);
+ if (fa)
+ return fa;
+ }
/* bigtime iflag can only happen on bigtime filesystems */
if (xfs_dinode_has_bigtime(dip) &&
!xfs_has_bigtime(mp))
return __this_address;
+ if (flags2 & XFS_DIFLAG2_METADATA) {
+ fa = xfs_dinode_verify_metadir(mp, dip, mode, flags, flags2);
+ if (fa)
+ return fa;
+ }
+
+ /* metadata inodes containing btrees always have zero extent count */
+ if (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK) != XFS_DINODE_FMT_META_BTREE) {
+ if (nextents + naextents == 0 && nblocks != 0)
+ return __this_address;
+ }
+
return NULL;
}
@@ -798,11 +921,29 @@ xfs_inode_validate_cowextsize(
bool rt_flag;
bool hint_flag;
uint32_t cowextsize_bytes;
+ uint32_t blocksize_bytes;
rt_flag = (flags & XFS_DIFLAG_REALTIME);
hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize);
+ /*
+ * Similar to extent size hints, a directory can be configured to
+ * propagate realtime status and a CoW extent size hint to newly
+ * created files even if there is no realtime device, and the hints on
+ * disk can become misaligned if the sysadmin changes the rt extent
+ * size while adding the realtime device.
+ *
+ * Therefore, we can only enforce the rextsize alignment check against
+ * regular realtime files, and rely on callers to decide when alignment
+ * checks are appropriate, and fix things up as needed.
+ */
+
+ if (rt_flag)
+ blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+ else
+ blocksize_bytes = mp->m_sb.sb_blocksize;
+
if (hint_flag && !xfs_has_reflink(mp))
return __this_address;
@@ -816,16 +957,13 @@ xfs_inode_validate_cowextsize(
if (mode && !hint_flag && cowextsize != 0)
return __this_address;
- if (hint_flag && rt_flag)
- return __this_address;
-
- if (cowextsize_bytes % mp->m_sb.sb_blocksize)
+ if (cowextsize_bytes % blocksize_bytes)
return __this_address;
if (cowextsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
- if (cowextsize > mp->m_sb.sb_agblocks / 2)
+ if (!rt_flag && cowextsize > mp->m_sb.sb_agblocks / 2)
return __this_address;
return NULL;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 585ed5a110af..8d43d2641c73 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -28,6 +28,9 @@ int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
struct xfs_dinode *dip);
+xfs_failaddr_t xfs_dinode_verify_metadir(struct xfs_mount *mp,
+ struct xfs_dinode *dip, uint16_t mode, uint16_t flags,
+ uint64_t flags2);
xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp,
uint32_t extsize, uint16_t mode, uint16_t flags);
xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 1158ca48626b..4f99b90add55 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -27,6 +27,8 @@
#include "xfs_errortag.h"
#include "xfs_health.h"
#include "xfs_symlink_remote.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
struct kmem_cache *xfs_ifork_cache;
@@ -178,7 +180,7 @@ xfs_iformat_btree(
struct xfs_mount *mp = ip->i_mount;
xfs_bmdr_block_t *dfp;
struct xfs_ifork *ifp;
- /* REFERENCED */
+ struct xfs_btree_block *broot;
int nrecs;
int size;
int level;
@@ -211,16 +213,13 @@ xfs_iformat_btree(
return -EFSCORRUPTED;
}
- ifp->if_broot_bytes = size;
- ifp->if_broot = kmalloc(size,
- GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
- ASSERT(ifp->if_broot != NULL);
+ broot = xfs_broot_alloc(ifp, size);
/*
* Copy and convert from the on-disk structure
* to the in-memory structure.
*/
xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
- ifp->if_broot, size);
+ broot, size);
ifp->if_bytes = 0;
ifp->if_data = NULL;
@@ -270,6 +269,16 @@ xfs_iformat_data_fork(
return xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
case XFS_DINODE_FMT_BTREE:
return xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+ case XFS_DINODE_FMT_META_BTREE:
+ switch (ip->i_metatype) {
+ case XFS_METAFILE_RTRMAP:
+ return xfs_iformat_rtrmap(ip, dip);
+ case XFS_METAFILE_RTREFCOUNT:
+ return xfs_iformat_rtrefcount(ip, dip);
+ default:
+ break;
+ }
+ fallthrough;
default:
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
dip, sizeof(*dip), __this_address);
@@ -363,135 +372,68 @@ xfs_iformat_attr_fork(
}
/*
- * Reallocate the space for if_broot based on the number of records
- * being added or deleted as indicated in rec_diff. Move the records
- * and pointers in if_broot to fit the new size. When shrinking this
- * will eliminate holes between the records and pointers created by
- * the caller. When growing this will create holes to be filled in
- * by the caller.
- *
- * The caller must not request to add more records than would fit in
- * the on-disk inode root. If the if_broot is currently NULL, then
- * if we are adding records, one will be allocated. The caller must also
- * not request that the number of records go below zero, although
- * it can go to zero.
- *
- * ip -- the inode whose if_broot area is changing
- * ext_diff -- the change in the number of records, positive or negative,
- * requested for the if_broot array.
+ * Allocate the if_broot component of an inode fork so that it is @new_size
+ * bytes in size, using __GFP_NOLOCKDEP like all the other code that
+ * initializes a broot during inode load. Returns if_broot.
*/
-void
-xfs_iroot_realloc(
- xfs_inode_t *ip,
- int rec_diff,
- int whichfork)
+struct xfs_btree_block *
+xfs_broot_alloc(
+ struct xfs_ifork *ifp,
+ size_t new_size)
{
- struct xfs_mount *mp = ip->i_mount;
- int cur_max;
- struct xfs_ifork *ifp;
- struct xfs_btree_block *new_broot;
- int new_max;
- size_t new_size;
- char *np;
- char *op;
+ ASSERT(ifp->if_broot == NULL);
- /*
- * Handle the degenerate case quietly.
- */
- if (rec_diff == 0) {
- return;
- }
+ ifp->if_broot = kmalloc(new_size,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+ ifp->if_broot_bytes = new_size;
+ return ifp->if_broot;
+}
- ifp = xfs_ifork_ptr(ip, whichfork);
- if (rec_diff > 0) {
- /*
- * If there wasn't any memory allocated before, just
- * allocate it now and get out.
- */
- if (ifp->if_broot_bytes == 0) {
- new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
- ifp->if_broot = kmalloc(new_size,
- GFP_KERNEL | __GFP_NOFAIL);
- ifp->if_broot_bytes = (int)new_size;
- return;
- }
+/*
+ * Reallocate the if_broot component of an inode fork so that it is @new_size
+ * bytes in size. Returns if_broot.
+ */
+struct xfs_btree_block *
+xfs_broot_realloc(
+ struct xfs_ifork *ifp,
+ size_t new_size)
+{
+ /* No size change? No action needed. */
+ if (new_size == ifp->if_broot_bytes)
+ return ifp->if_broot;
- /*
- * If there is already an existing if_broot, then we need
- * to realloc() it and shift the pointers to their new
- * location. The records don't change location because
- * they are kept butted up against the btree block header.
- */
- cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false);
- new_max = cur_max + rec_diff;
- new_size = xfs_bmap_broot_space_calc(mp, new_max);
- ifp->if_broot = krealloc(ifp->if_broot, new_size,
- GFP_KERNEL | __GFP_NOFAIL);
- op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
- ifp->if_broot_bytes);
- np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
- (int)new_size);
- ifp->if_broot_bytes = (int)new_size;
- ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
- xfs_inode_fork_size(ip, whichfork));
- memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
- return;
+ /* New size is zero, free it. */
+ if (new_size == 0) {
+ ifp->if_broot_bytes = 0;
+ kfree(ifp->if_broot);
+ ifp->if_broot = NULL;
+ return NULL;
}
/*
- * rec_diff is less than 0. In this case, we are shrinking the
- * if_broot buffer. It must already exist. If we go to zero
- * records, just get rid of the root and clear the status bit.
+ * Shrinking the iroot means we allocate a new smaller object and copy
+ * it. We don't trust krealloc not to nop on realloc-down.
*/
- ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
- cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false);
- new_max = cur_max + rec_diff;
- ASSERT(new_max >= 0);
- if (new_max > 0)
- new_size = xfs_bmap_broot_space_calc(mp, new_max);
- else
- new_size = 0;
- if (new_size > 0) {
- new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
- /*
- * First copy over the btree block header.
- */
- memcpy(new_broot, ifp->if_broot,
- xfs_bmbt_block_len(ip->i_mount));
- } else {
- new_broot = NULL;
+ if (ifp->if_broot_bytes > 0 && ifp->if_broot_bytes > new_size) {
+ struct xfs_btree_block *old_broot = ifp->if_broot;
+
+ ifp->if_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
+ ifp->if_broot_bytes = new_size;
+ memcpy(ifp->if_broot, old_broot, new_size);
+ kfree(old_broot);
+ return ifp->if_broot;
}
/*
- * Only copy the keys and pointers if there are any.
+ * Growing the iroot means we can krealloc. This may get us the same
+ * object.
*/
- if (new_max > 0) {
- /*
- * First copy the keys.
- */
- op = (char *)xfs_bmbt_key_addr(mp, ifp->if_broot, 1);
- np = (char *)xfs_bmbt_key_addr(mp, new_broot, 1);
- memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t));
-
- /*
- * Then copy the pointers.
- */
- op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
- ifp->if_broot_bytes);
- np = (char *)xfs_bmap_broot_ptr_addr(mp, new_broot, 1,
- (int)new_size);
- memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
- }
- kfree(ifp->if_broot);
- ifp->if_broot = new_broot;
- ifp->if_broot_bytes = (int)new_size;
- if (ifp->if_broot)
- ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
- xfs_inode_fork_size(ip, whichfork));
- return;
+ ifp->if_broot = krealloc(ifp->if_broot, new_size,
+ GFP_KERNEL | __GFP_NOFAIL);
+ ifp->if_broot_bytes = new_size;
+ return ifp->if_broot;
}
-
/*
* This is called when the amount of space needed for if_data
* is increased or decreased. The change in size is indicated by
@@ -671,6 +613,25 @@ xfs_iflush_fork(
}
break;
+ case XFS_DINODE_FMT_META_BTREE:
+ ASSERT(whichfork == XFS_DATA_FORK);
+
+ if (!(iip->ili_fields & brootflag[whichfork]))
+ break;
+
+ switch (ip->i_metatype) {
+ case XFS_METAFILE_RTRMAP:
+ xfs_iflush_rtrmap(ip, dip);
+ break;
+ case XFS_METAFILE_RTREFCOUNT:
+ xfs_iflush_rtrefcount(ip, dip);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ break;
+
default:
ASSERT(0);
break;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 2373d12fd474..69ed0919d60b 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -170,7 +170,11 @@ void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
void xfs_idestroy_fork(struct xfs_ifork *ifp);
void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
int whichfork);
-void xfs_iroot_realloc(struct xfs_inode *, int, int);
+struct xfs_btree_block *xfs_broot_alloc(struct xfs_ifork *ifp,
+ size_t new_size);
+struct xfs_btree_block *xfs_broot_realloc(struct xfs_ifork *ifp,
+ size_t new_size);
+
int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
int);
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index cc38e1c3c3e1..48fe49a5f050 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -224,6 +224,8 @@ xfs_inode_inherit_flags2(
}
if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+ if (xfs_is_metadir_inode(pip))
+ ip->i_diflags2 |= XFS_DIFLAG2_METADATA;
/* Don't let invalid cowextsize hints propagate. */
failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
@@ -320,6 +322,7 @@ xfs_inode_init(
if (xfs_has_v3inodes(mp)) {
inode_set_iversion(inode, 1);
+ /* also covers the di_used_blocks union arm: */
ip->i_cowextsize = 0;
times |= XFS_ICHGTIME_CREATE;
}
@@ -442,8 +445,8 @@ xfs_iunlink_update_bucket(
ASSERT(xfs_verify_agino_or_null(pag, new_agino));
old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
- old_value, new_agino);
+ trace_xfs_iunlink_update_bucket(pag, bucket_index, old_value,
+ new_agino);
/*
* We should never find the head of the list already set to the value
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 3e6682ed656b..0d637c276db0 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -248,6 +248,12 @@ typedef struct xfs_trans_header {
#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */
#define XFS_LI_XMI 0x1248 /* mapping exchange intent */
#define XFS_LI_XMD 0x1249 /* mapping exchange done */
+#define XFS_LI_EFI_RT 0x124a /* realtime extent free intent */
+#define XFS_LI_EFD_RT 0x124b /* realtime extent free done */
+#define XFS_LI_RUI_RT 0x124c /* realtime rmap update intent */
+#define XFS_LI_RUD_RT 0x124d /* realtime rmap update done */
+#define XFS_LI_CUI_RT 0x124e /* realtime refcount update intent */
+#define XFS_LI_CUD_RT 0x124f /* realtime refcount update done */
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -267,7 +273,13 @@ typedef struct xfs_trans_header {
{ XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \
{ XFS_LI_ATTRD, "XFS_LI_ATTRD" }, \
{ XFS_LI_XMI, "XFS_LI_XMI" }, \
- { XFS_LI_XMD, "XFS_LI_XMD" }
+ { XFS_LI_XMD, "XFS_LI_XMD" }, \
+ { XFS_LI_EFI_RT, "XFS_LI_EFI_RT" }, \
+ { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" }, \
+ { XFS_LI_RUI_RT, "XFS_LI_RUI_RT" }, \
+ { XFS_LI_RUD_RT, "XFS_LI_RUD_RT" }, \
+ { XFS_LI_CUI_RT, "XFS_LI_CUI_RT" }, \
+ { XFS_LI_CUD_RT, "XFS_LI_CUD_RT" }
/*
* Inode Log Item Format definitions.
@@ -347,12 +359,6 @@ struct xfs_inode_log_format_32 {
*/
#define XFS_ILOG_IVERSION 0x8000
-#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
- XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
- XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
- XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \
- XFS_ILOG_AOWNER)
-
#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT)
@@ -404,7 +410,7 @@ struct xfs_log_dinode {
uint16_t di_mode; /* mode and type of file */
int8_t di_version; /* inode version */
int8_t di_format; /* format of di_c data */
- uint8_t di_pad3[2]; /* unused in v2/3 inodes */
+ uint16_t di_metatype; /* metadata type, if DIFLAG2_METADATA */
uint32_t di_uid; /* owner's user id */
uint32_t di_gid; /* owner's group id */
uint32_t di_nlink; /* number of links to file */
@@ -469,7 +475,12 @@ struct xfs_log_dinode {
xfs_lsn_t di_lsn;
uint64_t di_flags2; /* more random flags */
- uint32_t di_cowextsize; /* basic cow extent size for file */
+ union {
+ /* basic cow extent size for (regular) file */
+ uint32_t di_cowextsize;
+ /* used blocks in RTG for (zoned) rtrmap inode */
+ uint32_t di_used_blocks;
+ };
uint8_t di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 521d327e4c89..66c7916fb5cd 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -77,6 +77,12 @@ extern const struct xlog_recover_item_ops xlog_attri_item_ops;
extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
extern const struct xlog_recover_item_ops xlog_xmi_item_ops;
extern const struct xlog_recover_item_ops xlog_xmd_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtefi_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtefd_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtrui_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtrud_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtcui_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtcud_item_ops;
/*
* Macros, structures, prototypes for internal log manager use.
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index d3bd6a86c8fe..34bba96d30ca 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -91,6 +91,7 @@ xfs_log_calc_trans_resv_for_minlogblocks(
*/
if (xfs_want_minlogsize_fixes(&mp->m_sb)) {
xfs_trans_resv_calc(mp, resv);
+ resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend;
return;
}
@@ -107,6 +108,9 @@ xfs_log_calc_trans_resv_for_minlogblocks(
xfs_trans_resv_calc(mp, resv);
+ /* Copy the dynamic transaction reservation types from the running fs */
+ resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend;
+
if (xfs_has_reflink(mp)) {
/*
* In the early days of reflink, typical log operation counts
diff --git a/fs/xfs/libxfs/xfs_metadir.c b/fs/xfs/libxfs/xfs_metadir.c
new file mode 100644
index 000000000000..178e89711cb7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_metadir.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_metafile.h"
+#include "xfs_metadir.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_ialloc.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_parent.h"
+#include "xfs_health.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_alloc.h"
+
+/*
+ * Metadata Directory Tree
+ * =======================
+ *
+ * These functions provide an abstraction layer for looking up, creating, and
+ * deleting metadata inodes that live within a special metadata directory tree.
+ *
+ * This code does not manage the five existing metadata inodes: real time
+ * bitmap & summary; and the user, group, and quotas. All other metadata
+ * inodes must use only the xfs_meta{dir,file}_* functions.
+ *
+ * Callers wishing to create or hardlink a metadata inode must create an
+ * xfs_metadir_update structure, call the appropriate xfs_metadir* function,
+ * and then call xfs_metadir_commit or xfs_metadir_cancel to commit or cancel
+ * the update. Files in the metadata directory tree currently cannot be
+ * unlinked.
+ *
+ * When the metadir feature is enabled, all metadata inodes must have the
+ * "metadata" inode flag set to prevent them from being exposed to the outside
+ * world.
+ *
+ * Callers must take the ILOCK of any inode in the metadata directory tree to
+ * synchronize access to that inode. It is never necessary to take the IOLOCK
+ * or the MMAPLOCK since metadata inodes must not be exposed to user space.
+ */
+
+static inline void
+xfs_metadir_set_xname(
+ struct xfs_name *xname,
+ const char *path,
+ unsigned char ftype)
+{
+ xname->name = (const unsigned char *)path;
+ xname->len = strlen(path);
+ xname->type = ftype;
+}
+
+/*
+ * Given a parent directory @dp and a metadata inode path component @xname,
+ * Look up the inode number in the directory, returning it in @ino.
+ * @xname.type must match the directory entry's ftype.
+ *
+ * Caller must hold ILOCK_EXCL.
+ */
+static inline int
+xfs_metadir_lookup(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ struct xfs_name *xname,
+ xfs_ino_t *ino)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_args args = {
+ .trans = tp,
+ .dp = dp,
+ .geo = mp->m_dir_geo,
+ .name = xname->name,
+ .namelen = xname->len,
+ .hashval = xfs_dir2_hashname(mp, xname),
+ .whichfork = XFS_DATA_FORK,
+ .op_flags = XFS_DA_OP_OKNOENT,
+ .owner = dp->i_ino,
+ };
+ int error;
+
+ if (!S_ISDIR(VFS_I(dp)->i_mode)) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_dir_lookup_args(&args);
+ if (error)
+ return error;
+
+ if (!xfs_verify_ino(mp, args.inumber)) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+ if (xname->type != XFS_DIR3_FT_UNKNOWN && xname->type != args.filetype) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ trace_xfs_metadir_lookup(dp, xname, args.inumber);
+ *ino = args.inumber;
+ return 0;
+}
+
+/*
+ * Look up and read a metadata inode from the metadata directory. If the path
+ * component doesn't exist, return -ENOENT.
+ */
+int
+xfs_metadir_load(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ const char *path,
+ enum xfs_metafile_type metafile_type,
+ struct xfs_inode **ipp)
+{
+ struct xfs_name xname;
+ xfs_ino_t ino;
+ int error;
+
+ xfs_metadir_set_xname(&xname, path, XFS_DIR3_FT_UNKNOWN);
+
+ xfs_ilock(dp, XFS_ILOCK_EXCL);
+ error = xfs_metadir_lookup(tp, dp, &xname, &ino);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+ return xfs_trans_metafile_iget(tp, ino, metafile_type, ipp);
+}
+
+/*
+ * Unlock and release resources after committing (or cancelling) a metadata
+ * directory tree operation. The caller retains its reference to @upd->ip
+ * and must release it explicitly.
+ */
+static inline void
+xfs_metadir_teardown(
+ struct xfs_metadir_update *upd,
+ int error)
+{
+ trace_xfs_metadir_teardown(upd, error);
+
+ if (upd->ppargs) {
+ xfs_parent_finish(upd->dp->i_mount, upd->ppargs);
+ upd->ppargs = NULL;
+ }
+
+ if (upd->ip) {
+ if (upd->ip_locked)
+ xfs_iunlock(upd->ip, XFS_ILOCK_EXCL);
+ upd->ip_locked = false;
+ }
+
+ if (upd->dp_locked)
+ xfs_iunlock(upd->dp, XFS_ILOCK_EXCL);
+ upd->dp_locked = false;
+}
+
+/*
+ * Begin the process of creating a metadata file by allocating transactions
+ * and taking whatever resources we're going to need.
+ */
+int
+xfs_metadir_start_create(
+ struct xfs_metadir_update *upd)
+{
+ struct xfs_mount *mp = upd->dp->i_mount;
+ int error;
+
+ ASSERT(upd->dp != NULL);
+ ASSERT(upd->ip == NULL);
+ ASSERT(xfs_has_metadir(mp));
+ ASSERT(upd->metafile_type != XFS_METAFILE_UNKNOWN);
+
+ error = xfs_parent_start(mp, &upd->ppargs);
+ if (error)
+ return error;
+
+ /*
+ * If we ever need the ability to create rt metadata files on a
+ * pre-metadir filesystem, we'll need to dqattach the parent here.
+ * Currently we assume that mkfs will create the files and quotacheck
+ * will account for them.
+ */
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
+ xfs_create_space_res(mp, MAXNAMELEN), 0, 0, &upd->tp);
+ if (error)
+ goto out_teardown;
+
+ /*
+ * Lock the parent directory if there is one. We can't ijoin it to
+ * the transaction until after the child file has been created.
+ */
+ xfs_ilock(upd->dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ upd->dp_locked = true;
+
+ trace_xfs_metadir_start_create(upd);
+ return 0;
+out_teardown:
+ xfs_metadir_teardown(upd, error);
+ return error;
+}
+
+/*
+ * Create a metadata inode with the given @mode, and insert it into the
+ * metadata directory tree at the given @upd->path. The path up to the final
+ * component must already exist. The final path component must not exist.
+ *
+ * The new metadata inode will be attached to the update structure @upd->ip,
+ * with the ILOCK held until the caller releases it.
+ *
+ * NOTE: This function may return a new inode to the caller even if it returns
+ * a negative error code. If an inode is passed back, the caller must finish
+ * setting up the inode before releasing it.
+ */
+int
+xfs_metadir_create(
+ struct xfs_metadir_update *upd,
+ umode_t mode)
+{
+ struct xfs_icreate_args args = {
+ .pip = upd->dp,
+ .mode = mode,
+ };
+ struct xfs_name xname;
+ struct xfs_dir_update du = {
+ .dp = upd->dp,
+ .name = &xname,
+ .ppargs = upd->ppargs,
+ };
+ struct xfs_mount *mp = upd->dp->i_mount;
+ xfs_ino_t ino;
+ unsigned int resblks;
+ int error;
+
+ xfs_assert_ilocked(upd->dp, XFS_ILOCK_EXCL);
+
+ /* Check that the name does not already exist in the directory. */
+ xfs_metadir_set_xname(&xname, upd->path, XFS_DIR3_FT_UNKNOWN);
+ error = xfs_metadir_lookup(upd->tp, upd->dp, &xname, &ino);
+ switch (error) {
+ case -ENOENT:
+ break;
+ case 0:
+ error = -EEXIST;
+ fallthrough;
+ default:
+ return error;
+ }
+
+ /*
+ * A newly created regular or special file just has one directory
+ * entry pointing to them, but a directory also the "." entry
+ * pointing to itself.
+ */
+ error = xfs_dialloc(&upd->tp, &args, &ino);
+ if (error)
+ return error;
+ error = xfs_icreate(upd->tp, ino, &args, &upd->ip);
+ if (error)
+ return error;
+ du.ip = upd->ip;
+ xfs_metafile_set_iflag(upd->tp, upd->ip, upd->metafile_type);
+ upd->ip_locked = true;
+
+ /*
+ * Join the directory inode to the transaction. We do not do it
+ * earlier because xfs_dialloc rolls the transaction.
+ */
+ xfs_trans_ijoin(upd->tp, upd->dp, 0);
+
+ /* Create the entry. */
+ if (S_ISDIR(args.mode))
+ resblks = xfs_mkdir_space_res(mp, xname.len);
+ else
+ resblks = xfs_create_space_res(mp, xname.len);
+ xname.type = xfs_mode_to_ftype(args.mode);
+
+ trace_xfs_metadir_try_create(upd);
+
+ error = xfs_dir_create_child(upd->tp, resblks, &du);
+ if (error)
+ return error;
+
+ /* Metadir files are not accounted to quota. */
+
+ trace_xfs_metadir_create(upd);
+
+ return 0;
+}
+
+#ifndef __KERNEL__
+/*
+ * Begin the process of linking a metadata file by allocating transactions
+ * and locking whatever resources we're going to need.
+ */
+int
+xfs_metadir_start_link(
+ struct xfs_metadir_update *upd)
+{
+ struct xfs_mount *mp = upd->dp->i_mount;
+ unsigned int resblks;
+ int nospace_error = 0;
+ int error;
+
+ ASSERT(upd->dp != NULL);
+ ASSERT(upd->ip != NULL);
+ ASSERT(xfs_has_metadir(mp));
+
+ error = xfs_parent_start(mp, &upd->ppargs);
+ if (error)
+ return error;
+
+ resblks = xfs_link_space_res(mp, MAXNAMELEN);
+ error = xfs_trans_alloc_dir(upd->dp, &M_RES(mp)->tr_link, upd->ip,
+ &resblks, &upd->tp, &nospace_error);
+ if (error)
+ goto out_teardown;
+ if (!resblks) {
+ /* We don't allow reservationless updates. */
+ xfs_trans_cancel(upd->tp);
+ upd->tp = NULL;
+ xfs_iunlock(upd->dp, XFS_ILOCK_EXCL);
+ xfs_iunlock(upd->ip, XFS_ILOCK_EXCL);
+ error = nospace_error;
+ goto out_teardown;
+ }
+
+ upd->dp_locked = true;
+ upd->ip_locked = true;
+
+ trace_xfs_metadir_start_link(upd);
+ return 0;
+out_teardown:
+ xfs_metadir_teardown(upd, error);
+ return error;
+}
+
+/*
+ * Link the metadata directory given by @path to the inode @upd->ip.
+ * The path (up to the final component) must already exist, but the final
+ * component must not already exist.
+ */
+int
+xfs_metadir_link(
+ struct xfs_metadir_update *upd)
+{
+ struct xfs_name xname;
+ struct xfs_dir_update du = {
+ .dp = upd->dp,
+ .name = &xname,
+ .ip = upd->ip,
+ .ppargs = upd->ppargs,
+ };
+ struct xfs_mount *mp = upd->dp->i_mount;
+ xfs_ino_t ino;
+ unsigned int resblks;
+ int error;
+
+ xfs_assert_ilocked(upd->dp, XFS_ILOCK_EXCL);
+ xfs_assert_ilocked(upd->ip, XFS_ILOCK_EXCL);
+
+ /* Look up the name in the current directory. */
+ xfs_metadir_set_xname(&xname, upd->path,
+ xfs_mode_to_ftype(VFS_I(upd->ip)->i_mode));
+ error = xfs_metadir_lookup(upd->tp, upd->dp, &xname, &ino);
+ switch (error) {
+ case -ENOENT:
+ break;
+ case 0:
+ error = -EEXIST;
+ fallthrough;
+ default:
+ return error;
+ }
+
+ resblks = xfs_link_space_res(mp, xname.len);
+ error = xfs_dir_add_child(upd->tp, resblks, &du);
+ if (error)
+ return error;
+
+ trace_xfs_metadir_link(upd);
+
+ return 0;
+}
+#endif /* ! __KERNEL__ */
+
+/* Commit a metadir update and unlock/drop all resources. */
+int
+xfs_metadir_commit(
+ struct xfs_metadir_update *upd)
+{
+ int error;
+
+ trace_xfs_metadir_commit(upd);
+
+ error = xfs_trans_commit(upd->tp);
+ upd->tp = NULL;
+
+ xfs_metadir_teardown(upd, error);
+ return error;
+}
+
+/* Cancel a metadir update and unlock/drop all resources. */
+void
+xfs_metadir_cancel(
+ struct xfs_metadir_update *upd,
+ int error)
+{
+ trace_xfs_metadir_cancel(upd);
+
+ xfs_trans_cancel(upd->tp);
+ upd->tp = NULL;
+
+ xfs_metadir_teardown(upd, error);
+}
+
+/* Create a metadata for the last component of the path. */
+int
+xfs_metadir_mkdir(
+ struct xfs_inode *dp,
+ const char *path,
+ struct xfs_inode **ipp)
+{
+ struct xfs_metadir_update upd = {
+ .dp = dp,
+ .path = path,
+ .metafile_type = XFS_METAFILE_DIR,
+ };
+ int error;
+
+ if (xfs_is_shutdown(dp->i_mount))
+ return -EIO;
+
+ /* Allocate a transaction to create the last directory. */
+ error = xfs_metadir_start_create(&upd);
+ if (error)
+ return error;
+
+ /* Create the subdirectory and take our reference. */
+ error = xfs_metadir_create(&upd, S_IFDIR);
+ if (error)
+ goto out_cancel;
+
+ error = xfs_metadir_commit(&upd);
+ if (error)
+ goto out_irele;
+
+ xfs_finish_inode_setup(upd.ip);
+ *ipp = upd.ip;
+ return 0;
+
+out_cancel:
+ xfs_metadir_cancel(&upd, error);
+out_irele:
+ /* Have to finish setting up the inode to ensure it's deleted. */
+ if (upd.ip) {
+ xfs_finish_inode_setup(upd.ip);
+ xfs_irele(upd.ip);
+ }
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_metadir.h b/fs/xfs/libxfs/xfs_metadir.h
new file mode 100644
index 000000000000..bfecac7d3d14
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_metadir.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_METADIR_H__
+#define __XFS_METADIR_H__
+
+/* Cleanup widget for metadata inode creation and deletion. */
+struct xfs_metadir_update {
+ /* Parent directory */
+ struct xfs_inode *dp;
+
+ /* Path to metadata file */
+ const char *path;
+
+ /* Parent pointer update context */
+ struct xfs_parent_args *ppargs;
+
+ /* Child metadata file */
+ struct xfs_inode *ip;
+
+ struct xfs_trans *tp;
+
+ enum xfs_metafile_type metafile_type;
+
+ unsigned int dp_locked:1;
+ unsigned int ip_locked:1;
+};
+
+int xfs_metadir_load(struct xfs_trans *tp, struct xfs_inode *dp,
+ const char *path, enum xfs_metafile_type metafile_type,
+ struct xfs_inode **ipp);
+
+int xfs_metadir_start_create(struct xfs_metadir_update *upd);
+int xfs_metadir_create(struct xfs_metadir_update *upd, umode_t mode);
+
+int xfs_metadir_start_link(struct xfs_metadir_update *upd);
+int xfs_metadir_link(struct xfs_metadir_update *upd);
+
+int xfs_metadir_commit(struct xfs_metadir_update *upd);
+void xfs_metadir_cancel(struct xfs_metadir_update *upd, int error);
+
+int xfs_metadir_mkdir(struct xfs_inode *dp, const char *path,
+ struct xfs_inode **ipp);
+
+#endif /* __XFS_METADIR_H__ */
diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c
new file mode 100644
index 000000000000..225923e463c4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_metafile.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_metafile.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
+
+static const struct {
+ enum xfs_metafile_type mtype;
+ const char *name;
+} xfs_metafile_type_strs[] = { XFS_METAFILE_TYPE_STR };
+
+const char *
+xfs_metafile_type_str(enum xfs_metafile_type metatype)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(xfs_metafile_type_strs); i++) {
+ if (xfs_metafile_type_strs[i].mtype == metatype)
+ return xfs_metafile_type_strs[i].name;
+ }
+
+ return NULL;
+}
+
+/* Set up an inode to be recognized as a metadata directory inode. */
+void
+xfs_metafile_set_iflag(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ enum xfs_metafile_type metafile_type)
+{
+ VFS_I(ip)->i_mode &= ~0777;
+ VFS_I(ip)->i_uid = GLOBAL_ROOT_UID;
+ VFS_I(ip)->i_gid = GLOBAL_ROOT_GID;
+ if (S_ISDIR(VFS_I(ip)->i_mode))
+ ip->i_diflags |= XFS_METADIR_DIFLAGS;
+ else
+ ip->i_diflags |= XFS_METAFILE_DIFLAGS;
+ ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
+ ip->i_diflags2 |= XFS_DIFLAG2_METADATA;
+ ip->i_metatype = metafile_type;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/* Clear the metadata directory inode flag. */
+void
+xfs_metafile_clear_iflag(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ ASSERT(xfs_is_metadir_inode(ip));
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+
+ ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
+ * Is the metafile reservations at or beneath a certain threshold?
+ */
+static inline bool
+xfs_metafile_resv_can_cover(
+ struct xfs_mount *mp,
+ int64_t rhs)
+{
+ /*
+ * The amount of space that can be allocated to this metadata file is
+ * the remaining reservation for the particular metadata file + the
+ * global free block count. Take care of the first case to avoid
+ * touching the per-cpu counter.
+ */
+ if (mp->m_metafile_resv_avail >= rhs)
+ return true;
+
+ /*
+ * There aren't enough blocks left in the inode's reservation, but it
+ * isn't critical unless there also isn't enough free space.
+ */
+ return xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
+ rhs - mp->m_metafile_resv_avail, 2048) >= 0;
+}
+
+/*
+ * Is the metafile reservation critically low on blocks? For now we'll define
+ * that as the number of blocks we can get our hands on being less than 10% of
+ * what we reserved or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_metafile_resv_critical(
+ struct xfs_mount *mp)
+{
+ ASSERT(xfs_has_metadir(mp));
+
+ trace_xfs_metafile_resv_critical(mp, 0);
+
+ if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels))
+ return true;
+
+ if (!xfs_metafile_resv_can_cover(mp,
+ div_u64(mp->m_metafile_resv_target, 10)))
+ return true;
+
+ return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
+}
+
+/* Allocate a block from the metadata file's reservation. */
+void
+xfs_metafile_resv_alloc_space(
+ struct xfs_inode *ip,
+ struct xfs_alloc_arg *args)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int64_t len = args->len;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+ ASSERT(args->resv == XFS_AG_RESV_METAFILE);
+
+ trace_xfs_metafile_resv_alloc_space(mp, args->len);
+
+ /*
+ * Allocate the blocks from the metadata inode's block reservation
+ * and update the ondisk sb counter.
+ */
+ mutex_lock(&mp->m_metafile_resv_lock);
+ if (mp->m_metafile_resv_avail > 0) {
+ int64_t from_resv;
+
+ from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail);
+ mp->m_metafile_resv_avail -= from_resv;
+ xfs_mod_delalloc(ip, 0, -from_resv);
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS,
+ -from_resv);
+ len -= from_resv;
+ }
+
+ /*
+ * Any allocation in excess of the reservation requires in-core and
+ * on-disk fdblocks updates. If we can grab @len blocks from the
+ * in-core fdblocks then all we need to do is update the on-disk
+ * superblock; if not, then try to steal some from the transaction's
+ * block reservation. Overruns are only expected for rmap btrees.
+ */
+ if (len) {
+ unsigned int field;
+ int error;
+
+ error = xfs_dec_fdblocks(ip->i_mount, len, true);
+ if (error)
+ field = XFS_TRANS_SB_FDBLOCKS;
+ else
+ field = XFS_TRANS_SB_RES_FDBLOCKS;
+
+ xfs_trans_mod_sb(args->tp, field, -len);
+ }
+
+ mp->m_metafile_resv_used += args->len;
+ mutex_unlock(&mp->m_metafile_resv_lock);
+
+ ip->i_nblocks += args->len;
+ xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE);
+}
+
+/* Free a block to the metadata file's reservation. */
+void
+xfs_metafile_resv_free_space(
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ xfs_filblks_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int64_t to_resv;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+
+ trace_xfs_metafile_resv_free_space(mp, len);
+
+ ip->i_nblocks -= len;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ mutex_lock(&mp->m_metafile_resv_lock);
+ mp->m_metafile_resv_used -= len;
+
+ /*
+ * Add the freed blocks back into the inode's delalloc reservation
+ * until it reaches the maximum size. Update the ondisk fdblocks only.
+ */
+ to_resv = mp->m_metafile_resv_target -
+ (mp->m_metafile_resv_used + mp->m_metafile_resv_avail);
+ if (to_resv > 0) {
+ to_resv = min_t(int64_t, to_resv, len);
+ mp->m_metafile_resv_avail += to_resv;
+ xfs_mod_delalloc(ip, 0, to_resv);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv);
+ len -= to_resv;
+ }
+ mutex_unlock(&mp->m_metafile_resv_lock);
+
+ /*
+ * Everything else goes back to the filesystem, so update the in-core
+ * and on-disk counters.
+ */
+ if (len)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len);
+}
+
+static void
+__xfs_metafile_resv_free(
+ struct xfs_mount *mp)
+{
+ if (mp->m_metafile_resv_avail) {
+ xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail);
+ xfs_add_fdblocks(mp, mp->m_metafile_resv_avail);
+ }
+ mp->m_metafile_resv_avail = 0;
+ mp->m_metafile_resv_used = 0;
+ mp->m_metafile_resv_target = 0;
+}
+
+/* Release unused metafile space reservation. */
+void
+xfs_metafile_resv_free(
+ struct xfs_mount *mp)
+{
+ if (!xfs_has_metadir(mp))
+ return;
+
+ trace_xfs_metafile_resv_free(mp, 0);
+
+ mutex_lock(&mp->m_metafile_resv_lock);
+ __xfs_metafile_resv_free(mp);
+ mutex_unlock(&mp->m_metafile_resv_lock);
+}
+
+/* Set up a metafile space reservation. */
+int
+xfs_metafile_resv_init(
+ struct xfs_mount *mp)
+{
+ struct xfs_rtgroup *rtg = NULL;
+ xfs_filblks_t used = 0, target = 0;
+ xfs_filblks_t hidden_space;
+ xfs_rfsblock_t dblocks_avail = mp->m_sb.sb_dblocks / 4;
+ int error = 0;
+
+ if (!xfs_has_metadir(mp))
+ return 0;
+
+ /*
+ * Free any previous reservation to have a clean slate.
+ */
+ mutex_lock(&mp->m_metafile_resv_lock);
+ __xfs_metafile_resv_free(mp);
+
+ /*
+ * Currently the only btree metafiles that require reservations are the
+ * rtrmap and the rtrefcount. Anything new will have to be added here
+ * as well.
+ */
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ if (xfs_has_rtrmapbt(mp)) {
+ used += rtg_rmap(rtg)->i_nblocks;
+ target += xfs_rtrmapbt_calc_reserves(mp);
+ }
+ if (xfs_has_rtreflink(mp)) {
+ used += rtg_refcount(rtg)->i_nblocks;
+ target += xfs_rtrefcountbt_calc_reserves(mp);
+ }
+ }
+
+ if (!target)
+ goto out_unlock;
+
+ /*
+ * Space taken by the per-AG metadata btrees are accounted on-disk as
+ * used space. We therefore only hide the space that is reserved but
+ * not used by the trees.
+ */
+ if (used > target)
+ target = used;
+ else if (target > dblocks_avail)
+ target = dblocks_avail;
+ hidden_space = target - used;
+
+ error = xfs_dec_fdblocks(mp, hidden_space, true);
+ if (error) {
+ trace_xfs_metafile_resv_init_error(mp, 0);
+ goto out_unlock;
+ }
+
+ xfs_mod_sb_delalloc(mp, hidden_space);
+
+ mp->m_metafile_resv_target = target;
+ mp->m_metafile_resv_used = used;
+ mp->m_metafile_resv_avail = hidden_space;
+
+ trace_xfs_metafile_resv_init(mp, target);
+
+out_unlock:
+ mutex_unlock(&mp->m_metafile_resv_lock);
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h
new file mode 100644
index 000000000000..ae6f9e779b98
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_metafile.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_METAFILE_H__
+#define __XFS_METAFILE_H__
+
+const char *xfs_metafile_type_str(enum xfs_metafile_type metatype);
+
+/* All metadata files must have these flags set. */
+#define XFS_METAFILE_DIFLAGS (XFS_DIFLAG_IMMUTABLE | \
+ XFS_DIFLAG_SYNC | \
+ XFS_DIFLAG_NOATIME | \
+ XFS_DIFLAG_NODUMP | \
+ XFS_DIFLAG_NODEFRAG)
+
+/* All metadata directories must have these flags set. */
+#define XFS_METADIR_DIFLAGS (XFS_METAFILE_DIFLAGS | \
+ XFS_DIFLAG_NOSYMLINKS)
+
+void xfs_metafile_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip,
+ enum xfs_metafile_type metafile_type);
+void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
+
+/* Space reservations for metadata inodes. */
+struct xfs_alloc_arg;
+
+bool xfs_metafile_resv_critical(struct xfs_mount *mp);
+void xfs_metafile_resv_alloc_space(struct xfs_inode *ip,
+ struct xfs_alloc_arg *args);
+void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp,
+ xfs_filblks_t len);
+void xfs_metafile_resv_free(struct xfs_mount *mp);
+int xfs_metafile_resv_init(struct xfs_mount *mp);
+
+/* Code specific to kernel/userspace; must be provided externally. */
+
+int xfs_trans_metafile_iget(struct xfs_trans *tp, xfs_ino_t ino,
+ enum xfs_metafile_type metafile_type, struct xfs_inode **ipp);
+int xfs_metafile_iget(struct xfs_mount *mp, xfs_ino_t ino,
+ enum xfs_metafile_type metafile_type, struct xfs_inode **ipp);
+
+#endif /* __XFS_METAFILE_H__ */
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 23c133fd36f5..5ed44fdf7491 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -19,40 +19,46 @@
static_assert((value) == (expected), \
"XFS: value of " #value " is wrong, expected " #expected)
+#define XFS_CHECK_SB_OFFSET(field, offset) \
+ XFS_CHECK_OFFSET(struct xfs_dsb, field, offset); \
+ XFS_CHECK_OFFSET(struct xfs_sb, field, offset);
+
static inline void __init
xfs_check_ondisk_structs(void)
{
- /* ag/file structures */
+ /* file structures */
XFS_CHECK_STRUCT_SIZE(struct xfs_acl, 4);
XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12);
- XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224);
- XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36);
- XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
- XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48);
- XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64);
- XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136);
- XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264);
XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56);
+ XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8);
+
+ /* space btrees */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_alloc_rec, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48);
XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4);
XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key, 4);
XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24);
- XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8);
- XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4);
- XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_bmdr_key_t, 8);
/* dir/attr trees */
XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80);
@@ -67,33 +73,38 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64);
XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64);
XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_entry_t, 8);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_hdr_t, 32);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_entry, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_hdr, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_map, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_name_local, 4);
/* realtime structures */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rtsb, 56);
XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4);
XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo, 48);
+ XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rtrefcount_root, 4);
/*
- * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
- * 4 bytes anyway so it's not obviously a problem. Hence for the moment
- * we don't check this structure. This can be re-instated when the attr
- * definitions are updated to use c99 VLA definitions.
+ * m68k has problems with struct xfs_attr_leaf_name_remote, but we pad
+ * it to 4 bytes anyway so it's not obviously a problem. Hence for the
+ * moment we don't check this structure. This can be re-instated when
+ * the attr definitions are updated to use c99 VLA definitions.
*
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_name_remote, 12);
*/
- XFS_CHECK_OFFSET(struct xfs_dsb, sb_crc, 224);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, valuelen, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, namelen, 2);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, nameval, 3);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, valueblk, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, valuelen, 4);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, namelen, 8);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, name, 9);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leafblock, 32);
XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr, 4);
XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize, 0);
XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count, 2);
@@ -101,27 +112,41 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen, 1);
XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags, 2);
XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval, 3);
- XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
- XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
- XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16);
- XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0);
- XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3);
- XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0);
- XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1);
- XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da_blkinfo, 12);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da_intnode, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_entry, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_hdr, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_hdr, 16);
+ XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, freetag, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, length, 2);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free_hdr, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_entry, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_hdr, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_tail, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_entry, 3);
+ XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, namelen, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, offset, 1);
+ XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, name, 3);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_hdr, 10);
XFS_CHECK_STRUCT_SIZE(struct xfs_parent_rec, 12);
+ /* ondisk dir/attr structures from xfs/122 */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_entry, 3);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_hdr, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_unused, 6);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free_hdr, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_entry, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_hdr, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_tail, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_entry, 3);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_hdr, 10);
+
/* log structures */
XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88);
XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
@@ -157,6 +182,11 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16);
XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16);
+ /* ondisk log structures from xfs/122 */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_unmount_log_format, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_xmd_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_xmi_log_format, 88);
+
/* parent pointer ioctls */
XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_rec, 32);
XFS_CHECK_STRUCT_SIZE(struct xfs_getparents, 40);
@@ -201,6 +231,72 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4);
XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT,
16299260424LL);
+
+ /* superblock field checks we got from xfs/122 */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304);
+ XFS_CHECK_SB_OFFSET(sb_magicnum, 0);
+ XFS_CHECK_SB_OFFSET(sb_blocksize, 4);
+ XFS_CHECK_SB_OFFSET(sb_dblocks, 8);
+ XFS_CHECK_SB_OFFSET(sb_rblocks, 16);
+ XFS_CHECK_SB_OFFSET(sb_rextents, 24);
+ XFS_CHECK_SB_OFFSET(sb_uuid, 32);
+ XFS_CHECK_SB_OFFSET(sb_logstart, 48);
+ XFS_CHECK_SB_OFFSET(sb_rootino, 56);
+ XFS_CHECK_SB_OFFSET(sb_rbmino, 64);
+ XFS_CHECK_SB_OFFSET(sb_rsumino, 72);
+ XFS_CHECK_SB_OFFSET(sb_rextsize, 80);
+ XFS_CHECK_SB_OFFSET(sb_agblocks, 84);
+ XFS_CHECK_SB_OFFSET(sb_agcount, 88);
+ XFS_CHECK_SB_OFFSET(sb_rbmblocks, 92);
+ XFS_CHECK_SB_OFFSET(sb_logblocks, 96);
+ XFS_CHECK_SB_OFFSET(sb_versionnum, 100);
+ XFS_CHECK_SB_OFFSET(sb_sectsize, 102);
+ XFS_CHECK_SB_OFFSET(sb_inodesize, 104);
+ XFS_CHECK_SB_OFFSET(sb_inopblock, 106);
+ XFS_CHECK_SB_OFFSET(sb_blocklog, 120);
+ XFS_CHECK_SB_OFFSET(sb_fname[12], 120);
+ XFS_CHECK_SB_OFFSET(sb_sectlog, 121);
+ XFS_CHECK_SB_OFFSET(sb_inodelog, 122);
+ XFS_CHECK_SB_OFFSET(sb_inopblog, 123);
+ XFS_CHECK_SB_OFFSET(sb_agblklog, 124);
+ XFS_CHECK_SB_OFFSET(sb_rextslog, 125);
+ XFS_CHECK_SB_OFFSET(sb_inprogress, 126);
+ XFS_CHECK_SB_OFFSET(sb_imax_pct, 127);
+ XFS_CHECK_SB_OFFSET(sb_icount, 128);
+ XFS_CHECK_SB_OFFSET(sb_ifree, 136);
+ XFS_CHECK_SB_OFFSET(sb_fdblocks, 144);
+ XFS_CHECK_SB_OFFSET(sb_frextents, 152);
+ XFS_CHECK_SB_OFFSET(sb_uquotino, 160);
+ XFS_CHECK_SB_OFFSET(sb_gquotino, 168);
+ XFS_CHECK_SB_OFFSET(sb_qflags, 176);
+ XFS_CHECK_SB_OFFSET(sb_flags, 178);
+ XFS_CHECK_SB_OFFSET(sb_shared_vn, 179);
+ XFS_CHECK_SB_OFFSET(sb_inoalignmt, 180);
+ XFS_CHECK_SB_OFFSET(sb_unit, 184);
+ XFS_CHECK_SB_OFFSET(sb_width, 188);
+ XFS_CHECK_SB_OFFSET(sb_dirblklog, 192);
+ XFS_CHECK_SB_OFFSET(sb_logsectlog, 193);
+ XFS_CHECK_SB_OFFSET(sb_logsectsize, 194);
+ XFS_CHECK_SB_OFFSET(sb_logsunit, 196);
+ XFS_CHECK_SB_OFFSET(sb_features2, 200);
+ XFS_CHECK_SB_OFFSET(sb_bad_features2, 204);
+ XFS_CHECK_SB_OFFSET(sb_features_compat, 208);
+ XFS_CHECK_SB_OFFSET(sb_features_ro_compat, 212);
+ XFS_CHECK_SB_OFFSET(sb_features_incompat, 216);
+ XFS_CHECK_SB_OFFSET(sb_features_log_incompat, 220);
+ XFS_CHECK_SB_OFFSET(sb_crc, 224);
+ XFS_CHECK_SB_OFFSET(sb_spino_align, 228);
+ XFS_CHECK_SB_OFFSET(sb_pquotino, 232);
+ XFS_CHECK_SB_OFFSET(sb_lsn, 240);
+ XFS_CHECK_SB_OFFSET(sb_meta_uuid, 248);
+ XFS_CHECK_SB_OFFSET(sb_metadirino, 264);
+ XFS_CHECK_SB_OFFSET(sb_rgcount, 272);
+ XFS_CHECK_SB_OFFSET(sb_rgextents, 276);
+ XFS_CHECK_SB_OFFSET(sb_rgblklog, 280);
+ XFS_CHECK_SB_OFFSET(sb_pad, 281);
+ XFS_CHECK_SB_OFFSET(sb_rtstart, 288);
+ XFS_CHECK_SB_OFFSET(sb_rtreserved, 296);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index fb05f44f6c75..763d941a8420 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -143,4 +143,47 @@ time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq,
__be32 dtimer);
__be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer);
+static inline const char *
+xfs_dqinode_path(xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return "user";
+ case XFS_DQTYPE_GROUP:
+ return "group";
+ case XFS_DQTYPE_PROJ:
+ return "project";
+ }
+
+ ASSERT(0);
+ return NULL;
+}
+
+static inline enum xfs_metafile_type
+xfs_dqinode_metafile_type(xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return XFS_METAFILE_USRQUOTA;
+ case XFS_DQTYPE_GROUP:
+ return XFS_METAFILE_GRPQUOTA;
+ case XFS_DQTYPE_PROJ:
+ return XFS_METAFILE_PRJQUOTA;
+ }
+
+ ASSERT(0);
+ return XFS_METAFILE_UNKNOWN;
+}
+
+unsigned int xfs_dqinode_sick_mask(xfs_dqtype_t type);
+
+int xfs_dqinode_load(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dqtype_t type, struct xfs_inode **ipp);
+int xfs_dqinode_metadir_create(struct xfs_inode *dp, xfs_dqtype_t type,
+ struct xfs_inode **ipp);
+int xfs_dqinode_metadir_link(struct xfs_inode *dp, xfs_dqtype_t type,
+ struct xfs_inode *ip);
+int xfs_dqinode_mkdir_parent(struct xfs_mount *mp, struct xfs_inode **dpp);
+int xfs_dqinode_load_parent(struct xfs_trans *tp, struct xfs_inode **dpp);
+
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 198b84117df1..cebe83f7842a 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -25,6 +25,9 @@
#include "xfs_ag.h"
#include "xfs_health.h"
#include "xfs_refcount_item.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtrefcount_btree.h"
struct kmem_cache *xfs_refcount_intent_cache;
@@ -128,7 +131,7 @@ xfs_refcount_check_irec(
struct xfs_perag *pag,
const struct xfs_refcount_irec *irec)
{
- if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
+ if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX)
return __this_address;
if (!xfs_refcount_check_domain(irec))
@@ -138,12 +141,43 @@ xfs_refcount_check_irec(
if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount))
return __this_address;
- if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
+ if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX)
return __this_address;
return NULL;
}
+xfs_failaddr_t
+xfs_rtrefcount_check_irec(
+ struct xfs_rtgroup *rtg,
+ const struct xfs_refcount_irec *irec)
+{
+ if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX)
+ return __this_address;
+
+ if (!xfs_refcount_check_domain(irec))
+ return __this_address;
+
+ /* check for valid extent range, including overflow */
+ if (!xfs_verify_rgbext(rtg, irec->rc_startblock, irec->rc_blockcount))
+ return __this_address;
+
+ if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX)
+ return __this_address;
+
+ return NULL;
+}
+
+static inline xfs_failaddr_t
+xfs_refcount_check_btrec(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *irec)
+{
+ if (xfs_btree_is_rtrefcount(cur->bc_ops))
+ return xfs_rtrefcount_check_irec(to_rtg(cur->bc_group), irec);
+ return xfs_refcount_check_irec(to_perag(cur->bc_group), irec);
+}
+
static inline int
xfs_refcount_complain_bad_rec(
struct xfs_btree_cur *cur,
@@ -152,9 +186,15 @@ xfs_refcount_complain_bad_rec(
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_warn(mp,
+ if (xfs_btree_is_rtrefcount(cur->bc_ops)) {
+ xfs_warn(mp,
+ "RT Refcount BTree record corruption in rtgroup %u detected at %pS!",
+ cur->bc_group->xg_gno, fa);
+ } else {
+ xfs_warn(mp,
"Refcount BTree record corruption in AG %d detected at %pS!",
- cur->bc_ag.pag->pag_agno, fa);
+ cur->bc_group->xg_gno, fa);
+ }
xfs_warn(mp,
"Start block 0x%x, block count 0x%x, references 0x%x",
irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
@@ -180,7 +220,7 @@ xfs_refcount_get_rec(
return error;
xfs_refcount_btrec_to_irec(rec, irec);
- fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec);
+ fa = xfs_refcount_check_btrec(cur, irec);
if (fa)
return xfs_refcount_complain_bad_rec(cur, fa, irec);
@@ -853,9 +893,9 @@ xfs_refc_merge_refcount(
const struct xfs_refcount_irec *irec,
enum xfs_refc_adjust_op adjust)
{
- /* Once a record hits MAXREFCOUNT, it is pinned there forever */
- if (irec->rc_refcount == MAXREFCOUNT)
- return MAXREFCOUNT;
+ /* Once a record hits XFS_REFC_REFCOUNT_MAX, it is pinned forever */
+ if (irec->rc_refcount == XFS_REFC_REFCOUNT_MAX)
+ return XFS_REFC_REFCOUNT_MAX;
return irec->rc_refcount + adjust;
}
@@ -898,7 +938,7 @@ xfs_refc_want_merge_center(
* hence we need to catch u32 addition overflows here.
*/
ulen += cleft->rc_blockcount + right->rc_blockcount;
- if (ulen >= MAXREFCEXTLEN)
+ if (ulen >= XFS_REFC_LEN_MAX)
return false;
*ulenp = ulen;
@@ -933,7 +973,7 @@ xfs_refc_want_merge_left(
* hence we need to catch u32 addition overflows here.
*/
ulen += cleft->rc_blockcount;
- if (ulen >= MAXREFCEXTLEN)
+ if (ulen >= XFS_REFC_LEN_MAX)
return false;
return true;
@@ -967,7 +1007,7 @@ xfs_refc_want_merge_right(
* hence we need to catch u32 addition overflows here.
*/
ulen += cright->rc_blockcount;
- if (ulen >= MAXREFCEXTLEN)
+ if (ulen >= XFS_REFC_LEN_MAX)
return false;
return true;
@@ -1065,7 +1105,7 @@ xfs_refcount_still_have_space(
*/
overhead = xfs_allocfree_block_count(cur->bc_mp,
cur->bc_refc.shape_changes);
- overhead += cur->bc_mp->m_refc_maxlevels;
+ overhead += cur->bc_maxlevels;
overhead *= cur->bc_mp->m_sb.sb_blocksize;
/*
@@ -1085,6 +1125,22 @@ xfs_refcount_still_have_space(
cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}
+/* Schedule an extent free. */
+static int
+xrefc_free_extent(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *rec)
+{
+ unsigned int flags = 0;
+
+ if (xfs_btree_is_rtrefcount(cur->bc_ops))
+ flags |= XFS_FREE_EXTENT_REALTIME;
+
+ return xfs_free_extent_later(cur->bc_tp,
+ xfs_gbno_to_fsb(cur->bc_group, rec->rc_startblock),
+ rec->rc_blockcount, NULL, XFS_AG_RESV_NONE, flags);
+}
+
/*
* Adjust the refcounts of middle extents. At this point we should have
* split extents that crossed the adjustment range; merged with adjacent
@@ -1101,7 +1157,6 @@ xfs_refcount_adjust_extents(
struct xfs_refcount_irec ext, tmp;
int error;
int found_rec, found_tmp;
- xfs_fsblock_t fsbno;
/* Merging did all the work already. */
if (*aglen == 0)
@@ -1117,7 +1172,7 @@ xfs_refcount_adjust_extents(
if (error)
goto out_error;
if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) {
- ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+ ext.rc_startblock = xfs_group_max_blocks(cur->bc_group);
ext.rc_blockcount = 0;
ext.rc_refcount = 0;
ext.rc_domain = XFS_REFC_DOMAIN_SHARED;
@@ -1154,12 +1209,7 @@ xfs_refcount_adjust_extents(
goto out_error;
}
} else {
- fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
- cur->bc_ag.pag->pag_agno,
- tmp.rc_startblock);
- error = xfs_free_extent_later(cur->bc_tp, fsbno,
- tmp.rc_blockcount, NULL,
- XFS_AG_RESV_NONE, 0);
+ error = xrefc_free_extent(cur, &tmp);
if (error)
goto out_error;
}
@@ -1197,7 +1247,7 @@ xfs_refcount_adjust_extents(
* Adjust the reference count and either update the tree
* (incr) or free the blocks (decr).
*/
- if (ext.rc_refcount == MAXREFCOUNT)
+ if (ext.rc_refcount == XFS_REFC_REFCOUNT_MAX)
goto skip;
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur, &ext);
@@ -1217,12 +1267,7 @@ xfs_refcount_adjust_extents(
}
goto advloop;
} else {
- fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
- cur->bc_ag.pag->pag_agno,
- ext.rc_startblock);
- error = xfs_free_extent_later(cur->bc_tp, fsbno,
- ext.rc_blockcount, NULL,
- XFS_AG_RESV_NONE, 0);
+ error = xrefc_free_extent(cur, &ext);
if (error)
goto out_error;
}
@@ -1312,7 +1357,7 @@ xfs_refcount_continue_op(
xfs_agblock_t new_agbno)
{
struct xfs_mount *mp = cur->bc_mp;
- struct xfs_perag *pag = cur->bc_ag.pag;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno,
ri->ri_blockcount))) {
@@ -1320,10 +1365,10 @@ xfs_refcount_continue_op(
return -EFSCORRUPTED;
}
- ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
+ ri->ri_startblock = xfs_agbno_to_fsb(pag, new_agbno);
ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount));
- ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock));
+ ASSERT(pag_agno(pag) == XFS_FSB_TO_AGNO(mp, ri->ri_startblock));
return 0;
}
@@ -1360,7 +1405,7 @@ xfs_refcount_finish_one(
* If we haven't gotten a cursor or the cursor AG doesn't match
* the startblock, get one now.
*/
- if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
+ if (rcur != NULL && rcur->bc_group != ri->ri_group) {
nr_ops = rcur->bc_refc.nr_ops;
shape_changes = rcur->bc_refc.shape_changes;
xfs_btree_del_cursor(rcur, 0);
@@ -1368,13 +1413,14 @@ xfs_refcount_finish_one(
*pcur = NULL;
}
if (rcur == NULL) {
- error = xfs_alloc_read_agf(ri->ri_pag, tp,
+ struct xfs_perag *pag = to_perag(ri->ri_group);
+
+ error = xfs_alloc_read_agf(pag, tp,
XFS_ALLOC_FLAG_FREEING, &agbp);
if (error)
return error;
- *pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp,
- ri->ri_pag);
+ *pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
rcur->bc_refc.nr_ops = nr_ops;
rcur->bc_refc.shape_changes = shape_changes;
}
@@ -1418,12 +1464,122 @@ xfs_refcount_finish_one(
}
/*
+ * Set up a continuation a deferred rtrefcount operation by updating the
+ * intent. Checks to make sure we're not going to run off the end of the
+ * rtgroup.
+ */
+static inline int
+xfs_rtrefcount_continue_op(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_intent *ri,
+ xfs_agblock_t new_agbno)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rtgroup *rtg = to_rtg(ri->ri_group);
+
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_rgbext(rtg, new_agbno,
+ ri->ri_blockcount))) {
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+
+ ri->ri_startblock = xfs_rgbno_to_rtb(rtg, new_agbno);
+
+ ASSERT(xfs_verify_rtbext(mp, ri->ri_startblock, ri->ri_blockcount));
+ return 0;
+}
+
+/*
+ * Process one of the deferred realtime refcount operations. We pass back the
+ * btree cursor to maintain our lock on the btree between calls.
+ */
+int
+xfs_rtrefcount_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_refcount_intent *ri,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rtgroup *rtg = to_rtg(ri->ri_group);
+ struct xfs_btree_cur *rcur = *pcur;
+ int error = 0;
+ xfs_rgblock_t bno;
+ unsigned long nr_ops = 0;
+ int shape_changes = 0;
+
+ bno = xfs_rtb_to_rgbno(mp, ri->ri_startblock);
+
+ trace_xfs_refcount_deferred(mp, ri);
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
+ return -EIO;
+
+ /*
+ * If we haven't gotten a cursor or the cursor AG doesn't match
+ * the startblock, get one now.
+ */
+ if (rcur != NULL && rcur->bc_group != ri->ri_group) {
+ nr_ops = rcur->bc_refc.nr_ops;
+ shape_changes = rcur->bc_refc.shape_changes;
+ xfs_btree_del_cursor(rcur, 0);
+ rcur = NULL;
+ *pcur = NULL;
+ }
+ if (rcur == NULL) {
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_REFCOUNT);
+ *pcur = rcur = xfs_rtrefcountbt_init_cursor(tp, rtg);
+
+ rcur->bc_refc.nr_ops = nr_ops;
+ rcur->bc_refc.shape_changes = shape_changes;
+ }
+
+ switch (ri->ri_type) {
+ case XFS_REFCOUNT_INCREASE:
+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+ XFS_REFCOUNT_ADJUST_INCREASE);
+ if (error)
+ return error;
+ if (ri->ri_blockcount > 0)
+ error = xfs_rtrefcount_continue_op(rcur, ri, bno);
+ break;
+ case XFS_REFCOUNT_DECREASE:
+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+ XFS_REFCOUNT_ADJUST_DECREASE);
+ if (error)
+ return error;
+ if (ri->ri_blockcount > 0)
+ error = xfs_rtrefcount_continue_op(rcur, ri, bno);
+ break;
+ case XFS_REFCOUNT_ALLOC_COW:
+ error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount);
+ if (error)
+ return error;
+ ri->ri_blockcount = 0;
+ break;
+ case XFS_REFCOUNT_FREE_COW:
+ error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount);
+ if (error)
+ return error;
+ ri->ri_blockcount = 0;
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+ if (!error && ri->ri_blockcount > 0)
+ trace_xfs_refcount_finish_one_leftover(mp, ri);
+ return error;
+}
+
+/*
* Record a refcount intent for later processing.
*/
static void
__xfs_refcount_add(
struct xfs_trans *tp,
enum xfs_refcount_intent_type type,
+ bool isrt,
xfs_fsblock_t startblock,
xfs_extlen_t blockcount)
{
@@ -1435,6 +1591,7 @@ __xfs_refcount_add(
ri->ri_type = type;
ri->ri_startblock = startblock;
ri->ri_blockcount = blockcount;
+ ri->ri_realtime = isrt;
xfs_refcount_defer_add(tp, ri);
}
@@ -1445,12 +1602,13 @@ __xfs_refcount_add(
void
xfs_refcount_increase_extent(
struct xfs_trans *tp,
+ bool isrt,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_has_reflink(tp->t_mountp))
return;
- __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
+ __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, isrt, PREV->br_startblock,
PREV->br_blockcount);
}
@@ -1460,12 +1618,13 @@ xfs_refcount_increase_extent(
void
xfs_refcount_decrease_extent(
struct xfs_trans *tp,
+ bool isrt,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_has_reflink(tp->t_mountp))
return;
- __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
+ __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, isrt, PREV->br_startblock,
PREV->br_blockcount);
}
@@ -1667,7 +1826,7 @@ xfs_refcount_adjust_cow_extents(
goto out_error;
}
if (!found_rec) {
- ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+ ext.rc_startblock = xfs_group_max_blocks(cur->bc_group);
ext.rc_blockcount = 0;
ext.rc_refcount = 0;
ext.rc_domain = XFS_REFC_DOMAIN_COW;
@@ -1821,6 +1980,7 @@ __xfs_refcount_cow_free(
void
xfs_refcount_alloc_cow_extent(
struct xfs_trans *tp,
+ bool isrt,
xfs_fsblock_t fsb,
xfs_extlen_t len)
{
@@ -1829,17 +1989,17 @@ xfs_refcount_alloc_cow_extent(
if (!xfs_has_reflink(mp))
return;
- __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, isrt, fsb, len);
/* Add rmap entry */
- xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
- XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
+ xfs_rmap_alloc_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW);
}
/* Forget a CoW staging event in the refcount btree. */
void
xfs_refcount_free_cow_extent(
struct xfs_trans *tp,
+ bool isrt,
xfs_fsblock_t fsb,
xfs_extlen_t len)
{
@@ -1849,9 +2009,8 @@ xfs_refcount_free_cow_extent(
return;
/* Remove rmap entry */
- xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
- XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
- __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
+ xfs_rmap_free_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, isrt, fsb, len);
}
struct xfs_refcount_recovery {
@@ -1880,7 +2039,7 @@ xfs_refcount_recover_extent(
INIT_LIST_HEAD(&rr->rr_list);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
- if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL ||
+ if (xfs_refcount_check_btrec(cur, &rr->rr_rrec) != NULL ||
XFS_IS_CORRUPT(cur->bc_mp,
rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
xfs_btree_mark_sick(cur);
@@ -1895,12 +2054,13 @@ xfs_refcount_recover_extent(
/* Find and remove leftover CoW reservations. */
int
xfs_refcount_recover_cow_leftovers(
- struct xfs_mount *mp,
- struct xfs_perag *pag)
+ struct xfs_group *xg)
{
+ struct xfs_mount *mp = xg->xg_mount;
+ bool isrt = xg->xg_type == XG_TYPE_RTG;
struct xfs_trans *tp;
struct xfs_btree_cur *cur;
- struct xfs_buf *agbp;
+ struct xfs_buf *agbp = NULL;
struct xfs_refcount_recovery *rr, *n;
struct list_head debris;
union xfs_btree_irec low = {
@@ -1913,10 +2073,19 @@ xfs_refcount_recover_cow_leftovers(
xfs_fsblock_t fsb;
int error;
- /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */
+ /* reflink filesystems must not have groups larger than 2^31-1 blocks */
+ BUILD_BUG_ON(XFS_MAX_RGBLOCKS >= XFS_REFC_COWFLAG);
BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG);
- if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS)
- return -EOPNOTSUPP;
+
+ if (isrt) {
+ if (!xfs_has_rtgroups(mp))
+ return 0;
+ if (xfs_group_max_blocks(xg) >= XFS_MAX_RGBLOCKS)
+ return -EOPNOTSUPP;
+ } else {
+ if (xfs_group_max_blocks(xg) > XFS_MAX_CRC_AG_BLOCKS)
+ return -EOPNOTSUPP;
+ }
INIT_LIST_HEAD(&debris);
@@ -1934,16 +2103,24 @@ xfs_refcount_recover_cow_leftovers(
if (error)
return error;
- error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
- if (error)
- goto out_trans;
- cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+ if (isrt) {
+ xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT);
+ cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(xg));
+ } else {
+ error = xfs_alloc_read_agf(to_perag(xg), tp, 0, &agbp);
+ if (error)
+ goto out_trans;
+ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, to_perag(xg));
+ }
/* Find all the leftover CoW staging extents. */
error = xfs_btree_query_range(cur, &low, &high,
xfs_refcount_recover_extent, &debris);
xfs_btree_del_cursor(cur, error);
- xfs_trans_brelse(tp, agbp);
+ if (agbp)
+ xfs_trans_brelse(tp, agbp);
+ else
+ xfs_rtgroup_unlock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT);
xfs_trans_cancel(tp);
if (error)
goto out_free;
@@ -1956,15 +2133,15 @@ xfs_refcount_recover_cow_leftovers(
goto out_free;
/* Free the orphan record */
- fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
- rr->rr_rrec.rc_startblock);
- xfs_refcount_free_cow_extent(tp, fsb,
+ fsb = xfs_gbno_to_fsb(xg, rr->rr_rrec.rc_startblock);
+ xfs_refcount_free_cow_extent(tp, isrt, fsb,
rr->rr_rrec.rc_blockcount);
/* Free the block. */
error = xfs_free_extent_later(tp, fsb,
rr->rr_rrec.rc_blockcount, NULL,
- XFS_AG_RESV_NONE, 0);
+ XFS_AG_RESV_NONE,
+ isrt ? XFS_FREE_EXTENT_REALTIME : 0);
if (error)
goto out_trans;
@@ -2029,7 +2206,7 @@ xfs_refcount_query_range_helper(
xfs_failaddr_t fa;
xfs_refcount_btrec_to_irec(rec, &irec);
- fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec);
+ fa = xfs_refcount_check_btrec(cur, &irec);
if (fa)
return xfs_refcount_complain_bad_rec(cur, fa, &irec);
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 68acb0b1b4a8..f2e299a716a4 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -12,6 +12,7 @@ struct xfs_perag;
struct xfs_btree_cur;
struct xfs_bmbt_irec;
struct xfs_refcount_irec;
+struct xfs_rtgroup;
extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
@@ -56,10 +57,11 @@ enum xfs_refcount_intent_type {
struct xfs_refcount_intent {
struct list_head ri_list;
- struct xfs_perag *ri_pag;
+ struct xfs_group *ri_group;
enum xfs_refcount_intent_type ri_type;
xfs_extlen_t ri_blockcount;
xfs_fsblock_t ri_startblock;
+ bool ri_realtime;
};
/* Check that the refcount is appropriate for the record domain. */
@@ -74,24 +76,25 @@ xfs_refcount_check_domain(
return true;
}
-void xfs_refcount_increase_extent(struct xfs_trans *tp,
+void xfs_refcount_increase_extent(struct xfs_trans *tp, bool isrt,
struct xfs_bmbt_irec *irec);
-void xfs_refcount_decrease_extent(struct xfs_trans *tp,
+void xfs_refcount_decrease_extent(struct xfs_trans *tp, bool isrt,
struct xfs_bmbt_irec *irec);
-extern int xfs_refcount_finish_one(struct xfs_trans *tp,
+int xfs_refcount_finish_one(struct xfs_trans *tp,
+ struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
+int xfs_rtrefcount_finish_one(struct xfs_trans *tp,
struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
xfs_extlen_t *flen, bool find_end_of_shared);
-void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
- xfs_extlen_t len);
-void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
- xfs_extlen_t len);
-extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
- struct xfs_perag *pag);
+void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt,
+ xfs_fsblock_t fsb, xfs_extlen_t len);
+void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt,
+ xfs_fsblock_t fsb, xfs_extlen_t len);
+int xfs_refcount_recover_cow_leftovers(struct xfs_group *xg);
/*
* While we're adjusting the refcounts records of an extent, we have
@@ -120,6 +123,8 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec);
xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag,
const struct xfs_refcount_irec *irec);
+xfs_failaddr_t xfs_rtrefcount_check_irec(struct xfs_rtgroup *rtg,
+ const struct xfs_refcount_irec *irec);
extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 795928d1a66d..54505fee1852 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -30,7 +30,7 @@ xfs_refcountbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_ag.pag);
+ cur->bc_ag.agbp, to_perag(cur->bc_group));
}
STATIC void
@@ -68,21 +68,20 @@ xfs_refcountbt_alloc_block(
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
- args.pag = cur->bc_ag.pag;
+ args.pag = to_perag(cur->bc_group);
args.oinfo = XFS_RMAP_OINFO_REFC;
args.minlen = args.maxlen = args.prod = 1;
args.resv = XFS_AG_RESV_METADATA;
error = xfs_alloc_vextent_near_bno(&args,
- XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno,
- xfs_refc_block(args.mp)));
+ xfs_agbno_to_fsb(args.pag, xfs_refc_block(args.mp)));
if (error)
goto out_error;
if (args.fsbno == NULLFSBLOCK) {
*stat = 0;
return 0;
}
- ASSERT(args.agno == cur->bc_ag.pag->pag_agno);
+ ASSERT(args.agno == cur->bc_group->xg_gno);
ASSERT(args.len == 1);
new->s = cpu_to_be32(args.agbno);
@@ -170,7 +169,7 @@ xfs_refcountbt_init_ptr_from_cur(
{
struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_refcount_root;
}
@@ -362,11 +361,11 @@ xfs_refcountbt_init_cursor(
{
struct xfs_btree_cur *cur;
- ASSERT(pag->pag_agno < mp->m_sb.sb_agcount);
+ ASSERT(pag_agno(pag) < mp->m_sb.sb_agcount);
cur = xfs_btree_alloc_cursor(mp, tp, &xfs_refcountbt_ops,
mp->m_refc_maxlevels, xfs_refcountbt_cur_cache);
- cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
cur->bc_refc.nr_ops = 0;
cur->bc_refc.shape_changes = 0;
cur->bc_ag.agbp = agbp;
@@ -515,7 +514,7 @@ xfs_refcountbt_calc_reserves(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (xfs_ag_contains_log(mp, pag->pag_agno))
+ if (xfs_ag_contains_log(mp, pag_agno(pag)))
agblocks -= mp->m_sb.sb_logblocks;
*ask += xfs_refcountbt_max_size(mp, agblocks);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 6ef4687b3aba..3cdf50563fec 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -25,6 +25,8 @@
#include "xfs_ag.h"
#include "xfs_health.h"
#include "xfs_rmap_item.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
struct kmem_cache *xfs_rmap_intent_cache;
@@ -213,7 +215,7 @@ xfs_rmap_check_irec(
struct xfs_perag *pag,
const struct xfs_rmap_irec *irec)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
bool is_inode;
bool is_unwritten;
bool is_bmbt;
@@ -264,14 +266,78 @@ xfs_rmap_check_irec(
return NULL;
}
+static xfs_failaddr_t
+xfs_rtrmap_check_meta_irec(
+ struct xfs_rtgroup *rtg,
+ const struct xfs_rmap_irec *irec)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (irec->rm_offset != 0)
+ return __this_address;
+ if (irec->rm_flags & XFS_RMAP_UNWRITTEN)
+ return __this_address;
+
+ switch (irec->rm_owner) {
+ case XFS_RMAP_OWN_FS:
+ if (irec->rm_startblock != 0)
+ return __this_address;
+ if (irec->rm_blockcount != mp->m_sb.sb_rextsize)
+ return __this_address;
+ return NULL;
+ case XFS_RMAP_OWN_COW:
+ if (!xfs_has_rtreflink(mp))
+ return __this_address;
+ if (!xfs_verify_rgbext(rtg, irec->rm_startblock,
+ irec->rm_blockcount))
+ return __this_address;
+ return NULL;
+ default:
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_rtrmap_check_inode_irec(
+ struct xfs_rtgroup *rtg,
+ const struct xfs_rmap_irec *irec)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (!xfs_verify_ino(mp, irec->rm_owner))
+ return __this_address;
+ if (!xfs_verify_rgbext(rtg, irec->rm_startblock, irec->rm_blockcount))
+ return __this_address;
+ if (!xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount))
+ return __this_address;
+ return NULL;
+}
+
+xfs_failaddr_t
+xfs_rtrmap_check_irec(
+ struct xfs_rtgroup *rtg,
+ const struct xfs_rmap_irec *irec)
+{
+ if (irec->rm_blockcount == 0)
+ return __this_address;
+ if (irec->rm_flags & (XFS_RMAP_BMBT_BLOCK | XFS_RMAP_ATTR_FORK))
+ return __this_address;
+ if (XFS_RMAP_NON_INODE_OWNER(irec->rm_owner))
+ return xfs_rtrmap_check_meta_irec(rtg, irec);
+ return xfs_rtrmap_check_inode_irec(rtg, irec);
+}
+
static inline xfs_failaddr_t
xfs_rmap_check_btrec(
struct xfs_btree_cur *cur,
const struct xfs_rmap_irec *irec)
{
- if (xfs_btree_is_mem_rmap(cur->bc_ops))
- return xfs_rmap_check_irec(cur->bc_mem.pag, irec);
- return xfs_rmap_check_irec(cur->bc_ag.pag, irec);
+ if (xfs_btree_is_rtrmap(cur->bc_ops) ||
+ xfs_btree_is_mem_rtrmap(cur->bc_ops))
+ return xfs_rtrmap_check_irec(to_rtg(cur->bc_group), irec);
+ return xfs_rmap_check_irec(to_perag(cur->bc_group), irec);
}
static inline int
@@ -285,10 +351,14 @@ xfs_rmap_complain_bad_rec(
if (xfs_btree_is_mem_rmap(cur->bc_ops))
xfs_warn(mp,
"In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa);
+ else if (xfs_btree_is_rtrmap(cur->bc_ops))
+ xfs_warn(mp,
+ "RT Reverse Mapping BTree record corruption in rtgroup %u detected at %pS!",
+ cur->bc_group->xg_gno, fa);
else
xfs_warn(mp,
"Reverse Mapping BTree record corruption in AG %d detected at %pS!",
- cur->bc_ag.pag->pag_agno, fa);
+ cur->bc_group->xg_gno, fa);
xfs_warn(mp,
"Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
irec->rm_owner, irec->rm_flags, irec->rm_startblock,
@@ -527,7 +597,7 @@ xfs_rmap_free_check_owner(
struct xfs_btree_cur *cur,
uint64_t ltoff,
struct xfs_rmap_irec *rec,
- xfs_filblks_t len,
+ xfs_extlen_t len,
uint64_t owner,
uint64_t offset,
unsigned int flags)
@@ -835,7 +905,7 @@ xfs_rmap_hook_enable(void)
static inline void
xfs_rmap_update_hook(
struct xfs_trans *tp,
- struct xfs_perag *pag,
+ struct xfs_group *xg,
enum xfs_rmap_intent_type op,
xfs_agblock_t startblock,
xfs_extlen_t blockcount,
@@ -850,27 +920,27 @@ xfs_rmap_update_hook(
.oinfo = *oinfo, /* struct copy */
};
- if (pag)
- xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p);
+ if (xg)
+ xfs_hooks_call(&xg->xg_rmap_update_hooks, op, &p);
}
}
/* Call the specified function during a reverse mapping update. */
int
xfs_rmap_hook_add(
- struct xfs_perag *pag,
+ struct xfs_group *xg,
struct xfs_rmap_hook *hook)
{
- return xfs_hooks_add(&pag->pag_rmap_update_hooks, &hook->rmap_hook);
+ return xfs_hooks_add(&xg->xg_rmap_update_hooks, &hook->rmap_hook);
}
/* Stop calling the specified function during a reverse mapping update. */
void
xfs_rmap_hook_del(
- struct xfs_perag *pag,
+ struct xfs_group *xg,
struct xfs_rmap_hook *hook)
{
- xfs_hooks_del(&pag->pag_rmap_update_hooks, &hook->rmap_hook);
+ xfs_hooks_del(&xg->xg_rmap_update_hooks, &hook->rmap_hook);
}
/* Configure rmap update hook functions. */
@@ -905,7 +975,8 @@ xfs_rmap_free(
return 0;
cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
- xfs_rmap_update_hook(tp, pag, XFS_RMAP_UNMAP, bno, len, false, oinfo);
+ xfs_rmap_update_hook(tp, pag_group(pag), XFS_RMAP_UNMAP, bno, len,
+ false, oinfo);
error = xfs_rmap_unmap(cur, bno, len, false, oinfo);
xfs_btree_del_cursor(cur, error);
@@ -1149,7 +1220,8 @@ xfs_rmap_alloc(
return 0;
cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
- xfs_rmap_update_hook(tp, pag, XFS_RMAP_MAP, bno, len, false, oinfo);
+ xfs_rmap_update_hook(tp, pag_group(pag), XFS_RMAP_MAP, bno, len, false,
+ oinfo);
error = xfs_rmap_map(cur, bno, len, false, oinfo);
xfs_btree_del_cursor(cur, error);
@@ -2556,6 +2628,47 @@ __xfs_rmap_finish_intent(
}
}
+static int
+xfs_rmap_finish_init_cursor(
+ struct xfs_trans *tp,
+ struct xfs_rmap_intent *ri,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_perag *pag = to_perag(ri->ri_group);
+ struct xfs_buf *agbp = NULL;
+ int error;
+
+ /*
+ * Refresh the freelist before we start changing the rmapbt, because a
+ * shape change could cause us to allocate blocks.
+ */
+ error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
+ if (error) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
+ return error;
+ }
+ if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
+ return -EFSCORRUPTED;
+ }
+ *pcur = xfs_rmapbt_init_cursor(tp->t_mountp, tp, agbp, pag);
+ return 0;
+}
+
+static int
+xfs_rtrmap_finish_init_cursor(
+ struct xfs_trans *tp,
+ struct xfs_rmap_intent *ri,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_rtgroup *rtg = to_rtg(ri->ri_group);
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
+ *pcur = xfs_rtrmapbt_init_cursor(tp, rtg);
+ return 0;
+}
+
/*
* Process one of the deferred rmap operations. We pass back the
* btree cursor to maintain our lock on the rmapbt between calls.
@@ -2571,8 +2684,6 @@ xfs_rmap_finish_one(
{
struct xfs_owner_info oinfo;
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_btree_cur *rcur = *pcur;
- struct xfs_buf *agbp = NULL;
xfs_agblock_t bno;
bool unwritten;
int error = 0;
@@ -2586,41 +2697,31 @@ xfs_rmap_finish_one(
* If we haven't gotten a cursor or the cursor AG doesn't match
* the startblock, get one now.
*/
- if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
- xfs_btree_del_cursor(rcur, 0);
- rcur = NULL;
+ if (*pcur != NULL && (*pcur)->bc_group != ri->ri_group) {
+ xfs_btree_del_cursor(*pcur, 0);
*pcur = NULL;
}
- if (rcur == NULL) {
- /*
- * Refresh the freelist before we start changing the
- * rmapbt, because a shape change could cause us to
- * allocate blocks.
- */
- error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp);
- if (error) {
- xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL);
+ if (*pcur == NULL) {
+ if (ri->ri_group->xg_type == XG_TYPE_RTG)
+ error = xfs_rtrmap_finish_init_cursor(tp, ri, pcur);
+ else
+ error = xfs_rmap_finish_init_cursor(tp, ri, pcur);
+ if (error)
return error;
- }
- if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
- xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL);
- return -EFSCORRUPTED;
- }
-
- *pcur = rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
}
xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork,
ri->ri_bmap.br_startoff);
unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN;
- bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock);
- error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno,
+ bno = xfs_fsb_to_gbno(mp, ri->ri_bmap.br_startblock,
+ ri->ri_group->xg_type);
+ error = __xfs_rmap_finish_intent(*pcur, ri->ri_type, bno,
ri->ri_bmap.br_blockcount, &oinfo, unwritten);
if (error)
return error;
- xfs_rmap_update_hook(tp, ri->ri_pag, ri->ri_type, bno,
+ xfs_rmap_update_hook(tp, ri->ri_group, ri->ri_type, bno,
ri->ri_bmap.br_blockcount, unwritten, &oinfo);
return 0;
}
@@ -2645,6 +2746,7 @@ __xfs_rmap_add(
struct xfs_trans *tp,
enum xfs_rmap_intent_type type,
uint64_t owner,
+ bool isrt,
int whichfork,
struct xfs_bmbt_irec *bmap)
{
@@ -2656,6 +2758,7 @@ __xfs_rmap_add(
ri->ri_owner = owner;
ri->ri_whichfork = whichfork;
ri->ri_bmap = *bmap;
+ ri->ri_realtime = isrt;
xfs_rmap_defer_add(tp, ri);
}
@@ -2669,6 +2772,7 @@ xfs_rmap_map_extent(
struct xfs_bmbt_irec *PREV)
{
enum xfs_rmap_intent_type type = XFS_RMAP_MAP;
+ bool isrt = xfs_ifork_is_realtime(ip, whichfork);
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
return;
@@ -2676,7 +2780,7 @@ xfs_rmap_map_extent(
if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
type = XFS_RMAP_MAP_SHARED;
- __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+ __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV);
}
/* Unmap an extent out of a file. */
@@ -2688,6 +2792,7 @@ xfs_rmap_unmap_extent(
struct xfs_bmbt_irec *PREV)
{
enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP;
+ bool isrt = xfs_ifork_is_realtime(ip, whichfork);
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
return;
@@ -2695,7 +2800,7 @@ xfs_rmap_unmap_extent(
if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
type = XFS_RMAP_UNMAP_SHARED;
- __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+ __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV);
}
/*
@@ -2713,6 +2818,7 @@ xfs_rmap_convert_extent(
struct xfs_bmbt_irec *PREV)
{
enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT;
+ bool isrt = xfs_ifork_is_realtime(ip, whichfork);
if (!xfs_rmap_update_is_needed(mp, whichfork))
return;
@@ -2720,15 +2826,15 @@ xfs_rmap_convert_extent(
if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
type = XFS_RMAP_CONVERT_SHARED;
- __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
+ __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV);
}
/* Schedule the creation of an rmap for non-file data. */
void
xfs_rmap_alloc_extent(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
+ bool isrt,
+ xfs_fsblock_t fsbno,
xfs_extlen_t len,
uint64_t owner)
{
@@ -2737,20 +2843,20 @@ xfs_rmap_alloc_extent(
if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
return;
- bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
+ bmap.br_startblock = fsbno;
bmap.br_blockcount = len;
bmap.br_startoff = 0;
bmap.br_state = XFS_EXT_NORM;
- __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
+ __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, isrt, XFS_DATA_FORK, &bmap);
}
/* Schedule the deletion of an rmap for non-file data. */
void
xfs_rmap_free_extent(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
+ bool isrt,
+ xfs_fsblock_t fsbno,
xfs_extlen_t len,
uint64_t owner)
{
@@ -2759,12 +2865,12 @@ xfs_rmap_free_extent(
if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
return;
- bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
+ bmap.br_startblock = fsbno;
bmap.br_blockcount = len;
bmap.br_startoff = 0;
bmap.br_state = XFS_EXT_NORM;
- __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
+ __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, isrt, XFS_DATA_FORK, &bmap);
}
/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index b783dd4dd95d..5f39f6e53cd1 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -7,6 +7,7 @@
#define __XFS_RMAP_H__
struct xfs_perag;
+struct xfs_rtgroup;
static inline void
xfs_rmap_ino_bmbt_owner(
@@ -173,7 +174,8 @@ struct xfs_rmap_intent {
int ri_whichfork;
uint64_t ri_owner;
struct xfs_bmbt_irec ri_bmap;
- struct xfs_perag *ri_pag;
+ struct xfs_group *ri_group;
+ bool ri_realtime;
};
/* functions for updating the rmapbt based on bmbt map/unmap operations */
@@ -184,10 +186,10 @@ void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *imap);
-void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
- xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
-void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
- xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
+void xfs_rmap_alloc_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno,
+ xfs_extlen_t len, uint64_t owner);
+void xfs_rmap_free_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno,
+ xfs_extlen_t len, uint64_t owner);
int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
struct xfs_btree_cur **pcur);
@@ -206,6 +208,8 @@ xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_rmap_irec *irec);
xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag,
const struct xfs_rmap_irec *irec);
+xfs_failaddr_t xfs_rtrmap_check_irec(struct xfs_rtgroup *rtg,
+ const struct xfs_rmap_irec *irec);
int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, enum xbtree_recpacking *outcome);
@@ -264,8 +268,8 @@ struct xfs_rmap_hook {
void xfs_rmap_hook_disable(void);
void xfs_rmap_hook_enable(void);
-int xfs_rmap_hook_add(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
-void xfs_rmap_hook_del(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
+int xfs_rmap_hook_add(struct xfs_group *xg, struct xfs_rmap_hook *hook);
+void xfs_rmap_hook_del(struct xfs_group *xg, struct xfs_rmap_hook *hook);
void xfs_rmap_hook_setup(struct xfs_rmap_hook *hook, notifier_fn_t mod_fn);
#endif
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index ac2f1f499b76..2cab694ac58a 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -57,7 +57,7 @@ xfs_rmapbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_ag.pag);
+ cur->bc_ag.agbp, to_perag(cur->bc_group));
}
STATIC void
@@ -66,14 +66,15 @@ xfs_rmapbt_set_root(
const union xfs_btree_ptr *ptr,
int inc)
{
- struct xfs_buf *agbp = cur->bc_ag.agbp;
- struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
ASSERT(ptr->s != 0);
agf->agf_rmap_root = ptr->s;
be32_add_cpu(&agf->agf_rmap_level, inc);
- cur->bc_ag.pag->pagf_rmap_level += inc;
+ pag->pagf_rmap_level += inc;
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
@@ -87,7 +88,7 @@ xfs_rmapbt_alloc_block(
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
- struct xfs_perag *pag = cur->bc_ag.pag;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
struct xfs_alloc_arg args = { .len = 1 };
int error;
xfs_agblock_t bno;
@@ -102,7 +103,7 @@ xfs_rmapbt_alloc_block(
return 0;
}
- xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false);
+ xfs_extent_busy_reuse(pag_group(pag), bno, 1, false);
new->s = cpu_to_be32(bno);
be32_add_cpu(&agf->agf_rmap_blocks, 1);
@@ -125,7 +126,7 @@ xfs_rmapbt_free_block(
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
- struct xfs_perag *pag = cur->bc_ag.pag;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
xfs_agblock_t bno;
int error;
@@ -136,7 +137,7 @@ xfs_rmapbt_free_block(
if (error)
return error;
- xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1,
+ xfs_extent_busy_insert(cur->bc_tp, pag_group(pag), bno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
@@ -227,7 +228,7 @@ xfs_rmapbt_init_ptr_from_cur(
{
struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_rmap_root;
}
@@ -538,7 +539,7 @@ xfs_rmapbt_init_cursor(
cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
- cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
cur->bc_ag.agbp = agbp;
if (agbp) {
struct xfs_agf *agf = agbp->b_addr;
@@ -647,14 +648,13 @@ xfs_rmapbt_mem_cursor(
struct xfbtree *xfbt)
{
struct xfs_btree_cur *cur;
- struct xfs_mount *mp = pag->pag_mount;
- cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops,
+ cur = xfs_btree_alloc_cursor(pag_mount(pag), tp, &xfs_rmapbt_mem_ops,
xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
cur->bc_mem.xfbtree = xfbt;
cur->bc_nlevels = xfbt->nlevels;
- cur->bc_mem.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
return cur;
}
@@ -863,7 +863,7 @@ xfs_rmapbt_calc_reserves(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (xfs_ag_contains_log(mp, pag->pag_agno))
+ if (xfs_ag_contains_log(mp, pag_agno(pag)))
agblocks -= mp->m_sb.sb_logblocks;
/* Reserve 1% of the AG or enough for 1 block per record. */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 27a4472402ba..5057536e586c 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -20,28 +20,87 @@
#include "xfs_error.h"
#include "xfs_rtbitmap.h"
#include "xfs_health.h"
+#include "xfs_sb.h"
+#include "xfs_errortag.h"
+#include "xfs_log.h"
+#include "xfs_buf_item.h"
+#include "xfs_extent_busy.h"
/*
* Realtime allocator bitmap functions shared with userspace.
*/
-/*
- * Real time buffers need verifiers to avoid runtime warnings during IO.
- * We don't have anything to verify, however, so these are just dummy
- * operations.
- */
+static xfs_failaddr_t
+xfs_rtbuf_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+
+ if (!xfs_verify_magic(bp, hdr->rt_magic))
+ return __this_address;
+ if (!xfs_has_rtgroups(mp))
+ return __this_address;
+ if (!xfs_has_crc(mp))
+ return __this_address;
+ if (!uuid_equal(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (hdr->rt_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
+ return __this_address;
+ return NULL;
+}
+
static void
xfs_rtbuf_verify_read(
- struct xfs_buf *bp)
+ struct xfs_buf *bp)
{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ if (!xfs_has_rtgroups(mp))
+ return;
+
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr->rt_lsn))) {
+ fa = __this_address;
+ goto fail;
+ }
+
+ if (!xfs_buf_verify_cksum(bp, XFS_RTBUF_CRC_OFF)) {
+ fa = __this_address;
+ goto fail;
+ }
+
+ fa = xfs_rtbuf_verify(bp);
+ if (fa)
+ goto fail;
+
return;
+fail:
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
static void
xfs_rtbuf_verify_write(
struct xfs_buf *bp)
{
- return;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ xfs_failaddr_t fa;
+
+ if (!xfs_has_rtgroups(mp))
+ return;
+
+ fa = xfs_rtbuf_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (bip)
+ hdr->rt_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_RTBUF_CRC_OFF);
}
const struct xfs_buf_ops xfs_rtbuf_ops = {
@@ -50,6 +109,22 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
.verify_write = xfs_rtbuf_verify_write,
};
+const struct xfs_buf_ops xfs_rtbitmap_buf_ops = {
+ .name = "xfs_rtbitmap",
+ .magic = { 0, cpu_to_be32(XFS_RTBITMAP_MAGIC) },
+ .verify_read = xfs_rtbuf_verify_read,
+ .verify_write = xfs_rtbuf_verify_write,
+ .verify_struct = xfs_rtbuf_verify,
+};
+
+const struct xfs_buf_ops xfs_rtsummary_buf_ops = {
+ .name = "xfs_rtsummary",
+ .magic = { 0, cpu_to_be32(XFS_RTSUMMARY_MAGIC) },
+ .verify_read = xfs_rtbuf_verify_read,
+ .verify_write = xfs_rtbuf_verify_write,
+ .verify_struct = xfs_rtbuf_verify,
+};
+
/* Release cached rt bitmap and summary buffers. */
void
xfs_rtbuf_cache_relse(
@@ -75,28 +150,31 @@ static int
xfs_rtbuf_get(
struct xfs_rtalloc_args *args,
xfs_fileoff_t block, /* block number in bitmap or summary */
- int issum) /* is summary not bitmap */
+ enum xfs_rtg_inodes type)
{
+ struct xfs_inode *ip = args->rtg->rtg_inodes[type];
struct xfs_mount *mp = args->mp;
struct xfs_buf **cbpp; /* cached block buffer */
xfs_fileoff_t *coffp; /* cached block number */
struct xfs_buf *bp; /* block buffer, result */
- struct xfs_inode *ip; /* bitmap or summary inode */
struct xfs_bmbt_irec map;
- enum xfs_blft type;
+ enum xfs_blft buf_type;
int nmap = 1;
int error;
- if (issum) {
+ switch (type) {
+ case XFS_RTGI_SUMMARY:
cbpp = &args->sumbp;
coffp = &args->sumoff;
- ip = mp->m_rsumip;
- type = XFS_BLFT_RTSUMMARY_BUF;
- } else {
+ buf_type = XFS_BLFT_RTSUMMARY_BUF;
+ break;
+ case XFS_RTGI_BITMAP:
cbpp = &args->rbmbp;
coffp = &args->rbmoff;
- ip = mp->m_rbmip;
- type = XFS_BLFT_RTBITMAP_BUF;
+ buf_type = XFS_BLFT_RTBITMAP_BUF;
+ break;
+ default:
+ return -EINVAL;
}
/*
@@ -119,22 +197,32 @@ xfs_rtbuf_get(
return error;
if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) {
- xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
- XFS_SICK_RT_BITMAP);
+ xfs_rtginode_mark_sick(args->rtg, type);
return -EFSCORRUPTED;
}
ASSERT(map.br_startblock != NULLFSBLOCK);
error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
- mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
+ mp->m_bsize, 0, &bp,
+ xfs_rtblock_ops(mp, type));
if (xfs_metadata_is_sick(error))
- xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
- XFS_SICK_RT_BITMAP);
+ xfs_rtginode_mark_sick(args->rtg, type);
if (error)
return error;
- xfs_trans_buf_set_type(args->tp, bp, type);
+ if (xfs_has_rtgroups(mp)) {
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+
+ if (hdr->rt_owner != cpu_to_be64(ip->i_ino)) {
+ xfs_buf_mark_corrupt(bp);
+ xfs_trans_brelse(args->tp, bp);
+ xfs_rtginode_mark_sick(args->rtg, type);
+ return -EFSCORRUPTED;
+ }
+ }
+
+ xfs_trans_buf_set_type(args->tp, bp, buf_type);
*cbpp = bp;
*coffp = block;
return 0;
@@ -148,11 +236,11 @@ xfs_rtbitmap_read_buf(
struct xfs_mount *mp = args->mp;
if (XFS_IS_CORRUPT(mp, block >= mp->m_sb.sb_rbmblocks)) {
- xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP);
+ xfs_rtginode_mark_sick(args->rtg, XFS_RTGI_BITMAP);
return -EFSCORRUPTED;
}
- return xfs_rtbuf_get(args, block, 0);
+ return xfs_rtbuf_get(args, block, XFS_RTGI_BITMAP);
}
int
@@ -163,10 +251,10 @@ xfs_rtsummary_read_buf(
struct xfs_mount *mp = args->mp;
if (XFS_IS_CORRUPT(mp, block >= mp->m_rsumblocks)) {
- xfs_rt_mark_sick(args->mp, XFS_SICK_RT_SUMMARY);
+ xfs_rtginode_mark_sick(args->rtg, XFS_RTGI_SUMMARY);
return -EFSCORRUPTED;
}
- return xfs_rtbuf_get(args, block, 1);
+ return xfs_rtbuf_get(args, block, XFS_RTGI_SUMMARY);
}
/*
@@ -503,6 +591,7 @@ xfs_rtmodify_summary(
{
struct xfs_mount *mp = args->mp;
xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno);
+ uint8_t *rsum_cache = args->rtg->rtg_rsum_cache;
unsigned int infoword;
xfs_suminfo_t val;
int error;
@@ -514,11 +603,11 @@ xfs_rtmodify_summary(
infoword = xfs_rtsumoffs_to_infoword(mp, so);
val = xfs_suminfo_add(args, infoword, delta);
- if (mp->m_rsum_cache) {
- if (val == 0 && log + 1 == mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log;
- if (val != 0 && log >= mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log + 1;
+ if (rsum_cache) {
+ if (val == 0 && log + 1 == rsum_cache[bbno])
+ rsum_cache[bbno] = log;
+ if (val != 0 && log >= rsum_cache[bbno])
+ rsum_cache[bbno] = log + 1;
}
xfs_trans_log_rtsummary(args, infoword);
@@ -737,7 +826,7 @@ xfs_rtfree_range(
/*
* Find the next allocated block (end of allocated extent).
*/
- error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1,
+ error = xfs_rtfind_forw(args, end, args->rtg->rtg_extents - 1,
&postblock);
if (error)
return error;
@@ -961,19 +1050,25 @@ xfs_rtcheck_alloc_range(
int
xfs_rtfree_extent(
struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_rtgroup *rtg,
xfs_rtxnum_t start, /* starting rtext number to free */
xfs_rtxlen_t len) /* length of extent freed */
{
struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *rbmip = rtg_bitmap(rtg);
struct xfs_rtalloc_args args = {
.mp = mp,
.tp = tp,
+ .rtg = rtg,
};
int error;
struct timespec64 atime;
- ASSERT(mp->m_rbmip->i_itemp != NULL);
- xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL);
+ ASSERT(rbmip->i_itemp != NULL);
+ xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL);
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT))
+ return -EIO;
error = xfs_rtcheck_alloc_range(&args, start, len);
if (error)
@@ -990,19 +1085,21 @@ xfs_rtfree_extent(
* Mark more blocks free in the superblock.
*/
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+
/*
* If we've now freed all the blocks, reset the file sequence
- * number to 0.
+ * number to 0 for pre-RTG file systems.
*/
- if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+ if (!xfs_has_rtgroups(mp) &&
+ tp->t_frextents_delta + mp->m_sb.sb_frextents ==
mp->m_sb.sb_rextents) {
- if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
- mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
+ if (!(rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
+ rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
- atime = inode_get_atime(VFS_I(mp->m_rbmip));
+ atime = inode_get_atime(VFS_I(rbmip));
atime.tv_sec = 0;
- inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime);
- xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+ inode_set_atime_to_ts(VFS_I(rbmip), atime);
+ xfs_trans_log_inode(tp, rbmip, XFS_ILOG_CORE);
}
error = 0;
out:
@@ -1018,15 +1115,18 @@ out:
int
xfs_rtfree_blocks(
struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg,
xfs_fsblock_t rtbno,
xfs_filblks_t rtlen)
{
struct xfs_mount *mp = tp->t_mountp;
xfs_extlen_t mod;
+ int error;
+ ASSERT(!xfs_has_zoned(mp));
ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
- mod = xfs_rtb_to_rtxoff(mp, rtlen);
+ mod = xfs_blen_to_rtxoff(mp, rtlen);
if (mod) {
ASSERT(mod == 0);
return -EIO;
@@ -1038,21 +1138,31 @@ xfs_rtfree_blocks(
return -EIO;
}
- return xfs_rtfree_extent(tp, xfs_rtb_to_rtx(mp, rtbno),
- xfs_rtb_to_rtx(mp, rtlen));
+ error = xfs_rtfree_extent(tp, rtg, xfs_rtb_to_rtx(mp, rtbno),
+ xfs_extlen_to_rtxlen(mp, rtlen));
+ if (error)
+ return error;
+
+ if (xfs_has_rtgroups(mp))
+ xfs_extent_busy_insert(tp, rtg_group(rtg),
+ xfs_rtb_to_rgbno(mp, rtbno), rtlen, 0);
+
+ return 0;
}
/* Find all the free records within a given range. */
int
xfs_rtalloc_query_range(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
xfs_rtxnum_t start,
xfs_rtxnum_t end,
xfs_rtalloc_query_range_fn fn,
void *priv)
{
+ struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_rtalloc_args args = {
+ .rtg = rtg,
.mp = mp,
.tp = tp,
};
@@ -1060,10 +1170,13 @@ xfs_rtalloc_query_range(
if (start > end)
return -EINVAL;
- if (start == end || start >= mp->m_sb.sb_rextents)
+ if (start == end || start >= rtg->rtg_extents)
return 0;
- end = min(end, mp->m_sb.sb_rextents - 1);
+ end = min(end, rtg->rtg_extents - 1);
+
+ if (xfs_has_zoned(mp))
+ return -EINVAL;
/* Iterate the bitmap, looking for discrepancies. */
while (start <= end) {
@@ -1086,7 +1199,7 @@ xfs_rtalloc_query_range(
rec.ar_startext = start;
rec.ar_extcount = rtend - start + 1;
- error = fn(mp, tp, &rec, priv);
+ error = fn(rtg, tp, &rec, priv);
if (error)
break;
}
@@ -1101,26 +1214,27 @@ xfs_rtalloc_query_range(
/* Find all the free records. */
int
xfs_rtalloc_query_all(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv)
{
- return xfs_rtalloc_query_range(mp, tp, 0, mp->m_sb.sb_rextents - 1, fn,
+ return xfs_rtalloc_query_range(rtg, tp, 0, rtg->rtg_extents - 1, fn,
priv);
}
/* Is the given extent all free? */
int
xfs_rtalloc_extent_is_free(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
xfs_rtxnum_t start,
xfs_rtxlen_t len,
bool *is_free)
{
struct xfs_rtalloc_args args = {
- .mp = mp,
+ .mp = rtg_mount(rtg),
+ .rtg = rtg,
.tp = tp,
};
xfs_rtxnum_t end;
@@ -1136,88 +1250,78 @@ xfs_rtalloc_extent_is_free(
return 0;
}
+/* Compute the number of rt extents tracked by a single bitmap block. */
+xfs_rtxnum_t
+xfs_rtbitmap_rtx_per_rbmblock(
+ struct xfs_mount *mp)
+{
+ unsigned int rbmblock_bytes = mp->m_sb.sb_blocksize;
+
+ if (xfs_has_rtgroups(mp))
+ rbmblock_bytes -= sizeof(struct xfs_rtbuf_blkinfo);
+
+ return rbmblock_bytes * NBBY;
+}
+
/*
* Compute the number of rtbitmap blocks needed to track the given number of rt
* extents.
*/
xfs_filblks_t
-xfs_rtbitmap_blockcount(
+xfs_rtbitmap_blockcount_len(
struct xfs_mount *mp,
xfs_rtbxlen_t rtextents)
{
- return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize);
+ if (xfs_has_zoned(mp))
+ return 0;
+ return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
}
-/* Compute the number of rtsummary blocks needed to track the given rt space. */
-xfs_filblks_t
-xfs_rtsummary_blockcount(
- struct xfs_mount *mp,
- unsigned int rsumlevels,
- xfs_extlen_t rbmblocks)
+/* How many rt extents does each rtbitmap file track? */
+static inline xfs_rtbxlen_t
+xfs_rtbitmap_bitcount(
+ struct xfs_mount *mp)
{
- unsigned long long rsumwords;
+ if (!mp->m_sb.sb_rextents)
+ return 0;
- rsumwords = (unsigned long long)rsumlevels * rbmblocks;
- return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG);
-}
+ /* rtgroup size can be nonzero even if rextents is zero */
+ if (xfs_has_rtgroups(mp))
+ return mp->m_sb.sb_rgextents;
-/* Lock both realtime free space metadata inodes for a freespace update. */
-void
-xfs_rtbitmap_lock(
- struct xfs_mount *mp)
-{
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+ return mp->m_sb.sb_rextents;
}
/*
- * Join both realtime free space metadata inodes to the transaction. The
- * ILOCKs will be released on transaction commit.
+ * Compute the number of rtbitmap blocks used for a given file system.
*/
-void
-xfs_rtbitmap_trans_join(
- struct xfs_trans *tp)
-{
- xfs_trans_ijoin(tp, tp->t_mountp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, tp->t_mountp->m_rsumip, XFS_ILOCK_EXCL);
-}
-
-/* Unlock both realtime free space metadata inodes after a freespace update. */
-void
-xfs_rtbitmap_unlock(
+xfs_filblks_t
+xfs_rtbitmap_blockcount(
struct xfs_mount *mp)
{
- xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+ return xfs_rtbitmap_blockcount_len(mp, xfs_rtbitmap_bitcount(mp));
}
/*
- * Lock the realtime free space metadata inodes for a freespace scan. Callers
- * must walk metadata blocks in order of increasing file offset.
+ * Compute the geometry of the rtsummary file needed to track the given rt
+ * space.
*/
-void
-xfs_rtbitmap_lock_shared(
+xfs_filblks_t
+xfs_rtsummary_blockcount(
struct xfs_mount *mp,
- unsigned int rbmlock_flags)
+ unsigned int *rsumlevels)
{
- if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
-
- if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
-}
+ xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp);
+ unsigned long long rsumwords;
-/* Unlock the realtime free space metadata inodes after a freespace scan. */
-void
-xfs_rtbitmap_unlock_shared(
- struct xfs_mount *mp,
- unsigned int rbmlock_flags)
-{
- if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
- xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+ if (xfs_has_zoned(mp)) {
+ *rsumlevels = 0;
+ return 0;
+ }
- if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ *rsumlevels = xfs_compute_rextslog(rextents) + 1;
+ rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels);
+ return howmany_64(rsumwords, mp->m_blockwsize);
}
static int
@@ -1260,21 +1364,26 @@ out_trans_cancel:
/* Get a buffer for the block. */
static int
xfs_rtfile_initialize_block(
- struct xfs_inode *ip,
+ struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type,
xfs_fsblock_t fsbno,
void *data)
{
- struct xfs_mount *mp = ip->i_mount;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_inode *ip = rtg->rtg_inodes[type];
struct xfs_trans *tp;
struct xfs_buf *bp;
+ void *bufdata;
const size_t copylen = mp->m_blockwsize << XFS_WORDLOG;
enum xfs_blft buf_type;
int error;
- if (ip == mp->m_rsumip)
+ if (type == XFS_RTGI_BITMAP)
+ buf_type = XFS_BLFT_RTBITMAP_BUF;
+ else if (type == XFS_RTGI_SUMMARY)
buf_type = XFS_BLFT_RTSUMMARY_BUF;
else
- buf_type = XFS_BLFT_RTBITMAP_BUF;
+ return -EINVAL;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero, 0, 0, 0, &tp);
if (error)
@@ -1288,13 +1397,30 @@ xfs_rtfile_initialize_block(
xfs_trans_cancel(tp);
return error;
}
+ bufdata = bp->b_addr;
xfs_trans_buf_set_type(tp, bp, buf_type);
- bp->b_ops = &xfs_rtbuf_ops;
+ bp->b_ops = xfs_rtblock_ops(mp, type);
+
+ if (xfs_has_rtgroups(mp)) {
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+
+ if (type == XFS_RTGI_BITMAP)
+ hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC);
+ else
+ hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC);
+ hdr->rt_owner = cpu_to_be64(ip->i_ino);
+ hdr->rt_blkno = cpu_to_be64(XFS_FSB_TO_DADDR(mp, fsbno));
+ hdr->rt_lsn = 0;
+ uuid_copy(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid);
+
+ bufdata += sizeof(*hdr);
+ }
+
if (data)
- memcpy(bp->b_addr, data, copylen);
+ memcpy(bufdata, data, copylen);
else
- memset(bp->b_addr, 0, copylen);
+ memset(bufdata, 0, copylen);
xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
return xfs_trans_commit(tp);
}
@@ -1306,12 +1432,13 @@ xfs_rtfile_initialize_block(
*/
int
xfs_rtfile_initialize_blocks(
- struct xfs_inode *ip, /* inode (bitmap/summary) */
+ struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type,
xfs_fileoff_t offset_fsb, /* offset to start from */
xfs_fileoff_t end_fsb, /* offset to allocate to */
void *data) /* data to fill the blocks */
{
- struct xfs_mount *mp = ip->i_mount;
+ struct xfs_mount *mp = rtg_mount(rtg);
const size_t copylen = mp->m_blockwsize << XFS_WORDLOG;
while (offset_fsb < end_fsb) {
@@ -1319,8 +1446,8 @@ xfs_rtfile_initialize_blocks(
xfs_filblks_t i;
int error;
- error = xfs_rtfile_alloc_blocks(ip, offset_fsb,
- end_fsb - offset_fsb, &map);
+ error = xfs_rtfile_alloc_blocks(rtg->rtg_inodes[type],
+ offset_fsb, end_fsb - offset_fsb, &map);
if (error)
return error;
@@ -1330,7 +1457,7 @@ xfs_rtfile_initialize_blocks(
* Do this one block per transaction, to keep it simple.
*/
for (i = 0; i < map.br_blockcount; i++) {
- error = xfs_rtfile_initialize_block(ip,
+ error = xfs_rtfile_initialize_block(rtg, type,
map.br_startblock + i, data);
if (error)
return error;
@@ -1343,3 +1470,35 @@ xfs_rtfile_initialize_blocks(
return 0;
}
+
+int
+xfs_rtbitmap_create(
+ struct xfs_rtgroup *rtg,
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ bool init)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ ip->i_disk_size = mp->m_sb.sb_rbmblocks * mp->m_sb.sb_blocksize;
+ if (init && !xfs_has_rtgroups(mp)) {
+ ip->i_diflags |= XFS_DIFLAG_NEWRTBM;
+ inode_set_atime(VFS_I(ip), 0, 0);
+ }
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return 0;
+}
+
+int
+xfs_rtsummary_create(
+ struct xfs_rtgroup *rtg,
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ bool init)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ ip->i_disk_size = mp->m_rsumblocks * mp->m_sb.sb_blocksize;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 140513d1d6bc..22e5d9cd95f4 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -6,7 +6,10 @@
#ifndef __XFS_RTBITMAP_H__
#define __XFS_RTBITMAP_H__
+#include "xfs_rtgroup.h"
+
struct xfs_rtalloc_args {
+ struct xfs_rtgroup *rtg;
struct xfs_mount *mp;
struct xfs_trans *tp;
@@ -19,13 +22,37 @@ struct xfs_rtalloc_args {
static inline xfs_rtblock_t
xfs_rtx_to_rtb(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
xfs_rtxnum_t rtx)
{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_rtblock_t start = xfs_group_start_fsb(rtg_group(rtg));
+
+ if (mp->m_rtxblklog >= 0)
+ return start + (rtx << mp->m_rtxblklog);
+ return start + (rtx * mp->m_sb.sb_rextsize);
+}
+
+/* Convert an rgbno into an rt extent number. */
+static inline xfs_rtxnum_t
+xfs_rgbno_to_rtx(
+ struct xfs_mount *mp,
+ xfs_rgblock_t rgbno)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return rgbno >> mp->m_rtxblklog;
+ return rgbno / mp->m_sb.sb_rextsize;
+}
+
+static inline uint64_t
+xfs_rtbxlen_to_blen(
+ struct xfs_mount *mp,
+ xfs_rtbxlen_t rtbxlen)
+{
if (mp->m_rtxblklog >= 0)
- return rtx << mp->m_rtxblklog;
+ return rtbxlen << mp->m_rtxblklog;
- return rtx * mp->m_sb.sb_rextsize;
+ return rtbxlen * mp->m_sb.sb_rextsize;
}
static inline xfs_extlen_t
@@ -62,66 +89,90 @@ xfs_extlen_to_rtxlen(
return len / mp->m_sb.sb_rextsize;
}
+/* Convert an rt block count into an rt extent count. */
+static inline xfs_rtbxlen_t
+xfs_blen_to_rtbxlen(
+ struct xfs_mount *mp,
+ uint64_t blen)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return blen >> mp->m_rtxblklog;
+
+ return div_u64(blen, mp->m_sb.sb_rextsize);
+}
+
+/* Return the offset of a file block length within an rt extent. */
+static inline xfs_extlen_t
+xfs_blen_to_rtxoff(
+ struct xfs_mount *mp,
+ xfs_filblks_t blen)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return blen & mp->m_rtxblkmask;
+
+ return do_div(blen, mp->m_sb.sb_rextsize);
+}
+
+/* Round this block count up to the nearest rt extent size. */
+static inline xfs_filblks_t
+xfs_blen_roundup_rtx(
+ struct xfs_mount *mp,
+ xfs_filblks_t blen)
+{
+ return roundup_64(blen, mp->m_sb.sb_rextsize);
+}
+
/* Convert an rt block number into an rt extent number. */
static inline xfs_rtxnum_t
xfs_rtb_to_rtx(
struct xfs_mount *mp,
xfs_rtblock_t rtbno)
{
+ /* open-coded 64-bit masking operation */
+ rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask;
if (likely(mp->m_rtxblklog >= 0))
return rtbno >> mp->m_rtxblklog;
-
return div_u64(rtbno, mp->m_sb.sb_rextsize);
}
+/* Return the offset of a rtgroup block number within an rt extent. */
+static inline xfs_extlen_t
+xfs_rgbno_to_rtxoff(
+ struct xfs_mount *mp,
+ xfs_rgblock_t rgbno)
+{
+ return rgbno % mp->m_sb.sb_rextsize;
+}
+
/* Return the offset of an rt block number within an rt extent. */
static inline xfs_extlen_t
xfs_rtb_to_rtxoff(
struct xfs_mount *mp,
xfs_rtblock_t rtbno)
{
+ /* open-coded 64-bit masking operation */
+ rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask;
if (likely(mp->m_rtxblklog >= 0))
return rtbno & mp->m_rtxblkmask;
-
return do_div(rtbno, mp->m_sb.sb_rextsize);
}
-/*
- * Convert an rt block number into an rt extent number, rounding up to the next
- * rt extent if the rt block is not aligned to an rt extent boundary.
- */
-static inline xfs_rtxnum_t
-xfs_rtb_to_rtxup(
- struct xfs_mount *mp,
- xfs_rtblock_t rtbno)
-{
- if (likely(mp->m_rtxblklog >= 0)) {
- if (rtbno & mp->m_rtxblkmask)
- return (rtbno >> mp->m_rtxblklog) + 1;
- return rtbno >> mp->m_rtxblklog;
- }
-
- if (do_div(rtbno, mp->m_sb.sb_rextsize))
- rtbno++;
- return rtbno;
-}
-
-/* Round this rtblock up to the nearest rt extent size. */
+/* Round this file block offset up to the nearest rt extent size. */
static inline xfs_rtblock_t
-xfs_rtb_roundup_rtx(
+xfs_fileoff_roundup_rtx(
struct xfs_mount *mp,
- xfs_rtblock_t rtbno)
+ xfs_fileoff_t off)
{
- return roundup_64(rtbno, mp->m_sb.sb_rextsize);
+ return roundup_64(off, mp->m_sb.sb_rextsize);
}
-/* Round this rtblock down to the nearest rt extent size. */
+/* Round this file block offset down to the nearest rt extent size. */
static inline xfs_rtblock_t
-xfs_rtb_rounddown_rtx(
+xfs_fileoff_rounddown_rtx(
struct xfs_mount *mp,
- xfs_rtblock_t rtbno)
+ xfs_fileoff_t off)
{
- return rounddown_64(rtbno, mp->m_sb.sb_rextsize);
+ return rounddown_64(off, mp->m_sb.sb_rextsize);
}
/* Convert an rt extent number to a file block offset in the rt bitmap file. */
@@ -130,6 +181,9 @@ xfs_rtx_to_rbmblock(
struct xfs_mount *mp,
xfs_rtxnum_t rtx)
{
+ if (xfs_has_rtgroups(mp))
+ return div_u64(rtx, mp->m_rtx_per_rbmblock);
+
return rtx >> mp->m_blkbit_log;
}
@@ -139,6 +193,13 @@ xfs_rtx_to_rbmword(
struct xfs_mount *mp,
xfs_rtxnum_t rtx)
{
+ if (xfs_has_rtgroups(mp)) {
+ unsigned int mod;
+
+ div_u64_rem(rtx >> XFS_NBWORDLOG, mp->m_blockwsize, &mod);
+ return mod;
+ }
+
return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1);
}
@@ -148,6 +209,9 @@ xfs_rbmblock_to_rtx(
struct xfs_mount *mp,
xfs_fileoff_t rbmoff)
{
+ if (xfs_has_rtgroups(mp))
+ return rbmoff * mp->m_rtx_per_rbmblock;
+
return rbmoff << mp->m_blkbit_log;
}
@@ -157,7 +221,14 @@ xfs_rbmblock_wordptr(
struct xfs_rtalloc_args *args,
unsigned int index)
{
- union xfs_rtword_raw *words = args->rbmbp->b_addr;
+ struct xfs_mount *mp = args->mp;
+ union xfs_rtword_raw *words;
+ struct xfs_rtbuf_blkinfo *hdr = args->rbmbp->b_addr;
+
+ if (xfs_has_rtgroups(mp))
+ words = (union xfs_rtword_raw *)(hdr + 1);
+ else
+ words = args->rbmbp->b_addr;
return words + index;
}
@@ -170,6 +241,8 @@ xfs_rtbitmap_getword(
{
union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index);
+ if (xfs_has_rtgroups(args->mp))
+ return be32_to_cpu(word->rtg);
return word->old;
}
@@ -182,7 +255,10 @@ xfs_rtbitmap_setword(
{
union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index);
- word->old = value;
+ if (xfs_has_rtgroups(args->mp))
+ word->rtg = cpu_to_be32(value);
+ else
+ word->old = value;
}
/*
@@ -207,6 +283,9 @@ xfs_rtsumoffs_to_block(
struct xfs_mount *mp,
xfs_rtsumoff_t rsumoff)
{
+ if (xfs_has_rtgroups(mp))
+ return rsumoff / mp->m_blockwsize;
+
return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t));
}
@@ -221,6 +300,9 @@ xfs_rtsumoffs_to_infoword(
{
unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG;
+ if (xfs_has_rtgroups(mp))
+ return rsumoff % mp->m_blockwsize;
+
return rsumoff & mask;
}
@@ -230,7 +312,13 @@ xfs_rsumblock_infoptr(
struct xfs_rtalloc_args *args,
unsigned int index)
{
- union xfs_suminfo_raw *info = args->sumbp->b_addr;
+ union xfs_suminfo_raw *info;
+ struct xfs_rtbuf_blkinfo *hdr = args->sumbp->b_addr;
+
+ if (xfs_has_rtgroups(args->mp))
+ info = (union xfs_suminfo_raw *)(hdr + 1);
+ else
+ info = args->sumbp->b_addr;
return info + index;
}
@@ -243,6 +331,8 @@ xfs_suminfo_get(
{
union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index);
+ if (xfs_has_rtgroups(args->mp))
+ return be32_to_cpu(info->rtg);
return info->old;
}
@@ -255,10 +345,28 @@ xfs_suminfo_add(
{
union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index);
+ if (xfs_has_rtgroups(args->mp)) {
+ be32_add_cpu(&info->rtg, delta);
+ return be32_to_cpu(info->rtg);
+ }
+
info->old += delta;
return info->old;
}
+static inline const struct xfs_buf_ops *
+xfs_rtblock_ops(
+ struct xfs_mount *mp,
+ enum xfs_rtg_inodes type)
+{
+ if (xfs_has_rtgroups(mp)) {
+ if (type == XFS_RTGI_SUMMARY)
+ return &xfs_rtsummary_buf_ops;
+ return &xfs_rtbitmap_buf_ops;
+ }
+ return &xfs_rtbuf_ops;
+}
+
/*
* Functions for walking free space rtextents in the realtime bitmap.
*/
@@ -268,7 +376,7 @@ struct xfs_rtalloc_rec {
};
typedef int (*xfs_rtalloc_query_range_fn)(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv);
@@ -291,53 +399,43 @@ int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log,
xfs_fileoff_t bbno, int delta);
int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
xfs_rtxlen_t len);
-int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
+int xfs_rtalloc_query_range(struct xfs_rtgroup *rtg, struct xfs_trans *tp,
xfs_rtxnum_t start, xfs_rtxnum_t end,
xfs_rtalloc_query_range_fn fn, void *priv);
-int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtalloc_query_range_fn fn,
- void *priv);
-int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtxnum_t start, xfs_rtxlen_t len,
- bool *is_free);
-/*
- * Free an extent in the realtime subvolume. Length is expressed in
- * realtime extents, as is the block number.
- */
-int /* error */
-xfs_rtfree_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_rtxnum_t start, /* starting rtext number to free */
- xfs_rtxlen_t len); /* length of extent freed */
-
+int xfs_rtalloc_query_all(struct xfs_rtgroup *rtg, struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn, void *priv);
+int xfs_rtalloc_extent_is_free(struct xfs_rtgroup *rtg, struct xfs_trans *tp,
+ xfs_rtxnum_t start, xfs_rtxlen_t len, bool *is_free);
+int xfs_rtfree_extent(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+ xfs_rtxnum_t start, xfs_rtxlen_t len);
/* Same as above, but in units of rt blocks. */
-int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
- xfs_filblks_t rtlen);
+int xfs_rtfree_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+ xfs_fsblock_t rtbno, xfs_filblks_t rtlen);
-xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t
- rtextents);
+xfs_rtxnum_t xfs_rtbitmap_rtx_per_rbmblock(struct xfs_mount *mp);
+xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp);
+xfs_filblks_t xfs_rtbitmap_blockcount_len(struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents);
xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp,
- unsigned int rsumlevels, xfs_extlen_t rbmblocks);
-
-int xfs_rtfile_initialize_blocks(struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb, void *data);
+ unsigned int *rsumlevels);
-void xfs_rtbitmap_lock(struct xfs_mount *mp);
-void xfs_rtbitmap_unlock(struct xfs_mount *mp);
-void xfs_rtbitmap_trans_join(struct xfs_trans *tp);
+int xfs_rtfile_initialize_blocks(struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type, xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb, void *data);
+int xfs_rtbitmap_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xfs_trans *tp, bool init);
+int xfs_rtsummary_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xfs_trans *tp, bool init);
-/* Lock the rt bitmap inode in shared mode */
-#define XFS_RBMLOCK_BITMAP (1U << 0)
-/* Lock the rt summary inode in shared mode */
-#define XFS_RBMLOCK_SUMMARY (1U << 1)
-
-void xfs_rtbitmap_lock_shared(struct xfs_mount *mp,
- unsigned int rbmlock_flags);
-void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp,
- unsigned int rbmlock_flags);
#else /* CONFIG_XFS_RT */
# define xfs_rtfree_extent(t,b,l) (-ENOSYS)
-# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS)
+
+static inline int xfs_rtfree_blocks(struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno,
+ xfs_filblks_t rtlen)
+{
+ return -ENOSYS;
+}
# define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS)
# define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS)
# define xfs_rtbitmap_read_buf(a,b) (-ENOSYS)
@@ -345,17 +443,11 @@ void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp,
# define xfs_rtbuf_cache_relse(a) (0)
# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS)
static inline xfs_filblks_t
-xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
+xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
{
/* shut up gcc */
return 0;
}
-# define xfs_rtsummary_blockcount(mp, l, b) (0)
-# define xfs_rtbitmap_lock(mp) do { } while (0)
-# define xfs_rtbitmap_trans_join(tp) do { } while (0)
-# define xfs_rtbitmap_unlock(mp) do { } while (0)
-# define xfs_rtbitmap_lock_shared(mp, lf) do { } while (0)
-# define xfs_rtbitmap_unlock_shared(mp, lf) do { } while (0)
#endif /* CONFIG_XFS_RT */
#endif /* __XFS_RTBITMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
new file mode 100644
index 000000000000..9186c58e83d5
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -0,0 +1,750 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_health.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_buf_item.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_metafile.h"
+#include "xfs_metadir.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
+
+/* Find the first usable fsblock in this rtgroup. */
+static inline uint32_t
+xfs_rtgroup_min_block(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno)
+{
+ if (xfs_has_rtsb(mp) && rgno == 0)
+ return mp->m_sb.sb_rextsize;
+
+ return 0;
+}
+
+/* Precompute this group's geometry */
+void
+xfs_rtgroup_calc_geometry(
+ struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
+ xfs_rgnumber_t rgno,
+ xfs_rgnumber_t rgcount,
+ xfs_rtbxlen_t rextents)
+{
+ rtg->rtg_extents = __xfs_rtgroup_extents(mp, rgno, rgcount, rextents);
+ rtg_group(rtg)->xg_block_count = rtg->rtg_extents * mp->m_sb.sb_rextsize;
+ rtg_group(rtg)->xg_min_gbno = xfs_rtgroup_min_block(mp, rgno);
+}
+
+int
+xfs_rtgroup_alloc(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno,
+ xfs_rgnumber_t rgcount,
+ xfs_rtbxlen_t rextents)
+{
+ struct xfs_rtgroup *rtg;
+ int error;
+
+ rtg = kzalloc(sizeof(struct xfs_rtgroup), GFP_KERNEL);
+ if (!rtg)
+ return -ENOMEM;
+
+ xfs_rtgroup_calc_geometry(mp, rtg, rgno, rgcount, rextents);
+
+ error = xfs_group_insert(mp, rtg_group(rtg), rgno, XG_TYPE_RTG);
+ if (error)
+ goto out_free_rtg;
+ return 0;
+
+out_free_rtg:
+ kfree(rtg);
+ return error;
+}
+
+void
+xfs_rtgroup_free(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno)
+{
+ xfs_group_free(mp, rgno, XG_TYPE_RTG, NULL);
+}
+
+/* Free a range of incore rtgroup objects. */
+void
+xfs_free_rtgroups(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t first_rgno,
+ xfs_rgnumber_t end_rgno)
+{
+ xfs_rgnumber_t rgno;
+
+ for (rgno = first_rgno; rgno < end_rgno; rgno++)
+ xfs_rtgroup_free(mp, rgno);
+}
+
+/* Initialize some range of incore rtgroup objects. */
+int
+xfs_initialize_rtgroups(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t first_rgno,
+ xfs_rgnumber_t end_rgno,
+ xfs_rtbxlen_t rextents)
+{
+ xfs_rgnumber_t index;
+ int error;
+
+ if (first_rgno >= end_rgno)
+ return 0;
+
+ for (index = first_rgno; index < end_rgno; index++) {
+ error = xfs_rtgroup_alloc(mp, index, end_rgno, rextents);
+ if (error)
+ goto out_unwind_new_rtgs;
+ }
+
+ return 0;
+
+out_unwind_new_rtgs:
+ xfs_free_rtgroups(mp, first_rgno, index);
+ return error;
+}
+
+/* Compute the number of rt extents in this realtime group. */
+xfs_rtxnum_t
+__xfs_rtgroup_extents(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno,
+ xfs_rgnumber_t rgcount,
+ xfs_rtbxlen_t rextents)
+{
+ ASSERT(rgno < rgcount);
+ if (rgno == rgcount - 1)
+ return rextents - ((xfs_rtxnum_t)rgno * mp->m_sb.sb_rgextents);
+
+ ASSERT(xfs_has_rtgroups(mp));
+ return mp->m_sb.sb_rgextents;
+}
+
+xfs_rtxnum_t
+xfs_rtgroup_extents(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno)
+{
+ return __xfs_rtgroup_extents(mp, rgno, mp->m_sb.sb_rgcount,
+ mp->m_sb.sb_rextents);
+}
+
+/*
+ * Update the rt extent count of the previous tail rtgroup if it changed during
+ * recovery (i.e. recovery of a growfs).
+ */
+int
+xfs_update_last_rtgroup_size(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t prev_rgcount)
+{
+ struct xfs_rtgroup *rtg;
+
+ ASSERT(prev_rgcount > 0);
+
+ rtg = xfs_rtgroup_grab(mp, prev_rgcount - 1);
+ if (!rtg)
+ return -EFSCORRUPTED;
+ rtg->rtg_extents = __xfs_rtgroup_extents(mp, prev_rgcount - 1,
+ mp->m_sb.sb_rgcount, mp->m_sb.sb_rextents);
+ rtg_group(rtg)->xg_block_count = rtg->rtg_extents * mp->m_sb.sb_rextsize;
+ xfs_rtgroup_rele(rtg);
+ return 0;
+}
+
+/* Lock metadata inodes associated with this rt group. */
+void
+xfs_rtgroup_lock(
+ struct xfs_rtgroup *rtg,
+ unsigned int rtglock_flags)
+{
+ ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
+ ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
+ !(rtglock_flags & XFS_RTGLOCK_BITMAP));
+
+ if (!xfs_has_zoned(rtg_mount(rtg))) {
+ if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ /*
+ * Lock both realtime free space metadata inodes for a
+ * freespace update.
+ */
+ xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
+ xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
+ } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ }
+ }
+
+ if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
+ xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_EXCL);
+
+ if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg))
+ xfs_ilock(rtg_refcount(rtg), XFS_ILOCK_EXCL);
+}
+
+/* Unlock metadata inodes associated with this rt group. */
+void
+xfs_rtgroup_unlock(
+ struct xfs_rtgroup *rtg,
+ unsigned int rtglock_flags)
+{
+ ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
+ ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
+ !(rtglock_flags & XFS_RTGLOCK_BITMAP));
+
+ if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg))
+ xfs_iunlock(rtg_refcount(rtg), XFS_ILOCK_EXCL);
+
+ if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
+ xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL);
+
+ if (!xfs_has_zoned(rtg_mount(rtg))) {
+ if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
+ xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
+ } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ }
+ }
+}
+
+/*
+ * Join realtime group metadata inodes to the transaction. The ILOCKs will be
+ * released on transaction commit.
+ */
+void
+xfs_rtgroup_trans_join(
+ struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg,
+ unsigned int rtglock_flags)
+{
+ ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
+ ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED));
+
+ if (!xfs_has_zoned(rtg_mount(rtg)) &&
+ (rtglock_flags & XFS_RTGLOCK_BITMAP)) {
+ xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL);
+ }
+
+ if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
+ xfs_trans_ijoin(tp, rtg_rmap(rtg), XFS_ILOCK_EXCL);
+
+ if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg))
+ xfs_trans_ijoin(tp, rtg_refcount(rtg), XFS_ILOCK_EXCL);
+}
+
+/* Retrieve rt group geometry. */
+int
+xfs_rtgroup_get_geometry(
+ struct xfs_rtgroup *rtg,
+ struct xfs_rtgroup_geometry *rgeo)
+{
+ /* Fill out form. */
+ memset(rgeo, 0, sizeof(*rgeo));
+ rgeo->rg_number = rtg_rgno(rtg);
+ rgeo->rg_length = rtg_blocks(rtg);
+ xfs_rtgroup_geom_health(rtg, rgeo);
+ return 0;
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+static struct lock_class_key xfs_rtginode_lock_class;
+
+static int
+xfs_rtginode_ilock_cmp_fn(
+ const struct lockdep_map *m1,
+ const struct lockdep_map *m2)
+{
+ const struct xfs_inode *ip1 =
+ container_of(m1, struct xfs_inode, i_lock.dep_map);
+ const struct xfs_inode *ip2 =
+ container_of(m2, struct xfs_inode, i_lock.dep_map);
+
+ if (ip1->i_projid < ip2->i_projid)
+ return -1;
+ if (ip1->i_projid > ip2->i_projid)
+ return 1;
+ return 0;
+}
+
+static inline void
+xfs_rtginode_ilock_print_fn(
+ const struct lockdep_map *m)
+{
+ const struct xfs_inode *ip =
+ container_of(m, struct xfs_inode, i_lock.dep_map);
+
+ printk(KERN_CONT " rgno=%u metatype=%s", ip->i_projid,
+ xfs_metafile_type_str(ip->i_metatype));
+}
+
+/*
+ * Most of the time each of the RTG inode locks are only taken one at a time.
+ * But when committing deferred ops, more than one of a kind can be taken.
+ * However, deferred rt ops will be committed in rgno order so there is no
+ * potential for deadlocks. The code here is needed to tell lockdep about this
+ * order.
+ */
+static inline void
+xfs_rtginode_lockdep_setup(
+ struct xfs_inode *ip,
+ xfs_rgnumber_t rgno,
+ enum xfs_rtg_inodes type)
+{
+ lockdep_set_class_and_subclass(&ip->i_lock, &xfs_rtginode_lock_class,
+ type);
+ lock_set_cmp_fn(&ip->i_lock, xfs_rtginode_ilock_cmp_fn,
+ xfs_rtginode_ilock_print_fn);
+}
+#else
+#define xfs_rtginode_lockdep_setup(ip, rgno, type) do { } while (0)
+#endif /* CONFIG_PROVE_LOCKING */
+
+struct xfs_rtginode_ops {
+ const char *name; /* short name */
+
+ enum xfs_metafile_type metafile_type;
+
+ unsigned int sick; /* rtgroup sickness flag */
+
+ unsigned int fmt_mask; /* all valid data fork formats */
+
+ /* Does the fs have this feature? */
+ bool (*enabled)(const struct xfs_mount *mp);
+
+ /* Create this rtgroup metadata inode and initialize it. */
+ int (*create)(struct xfs_rtgroup *rtg,
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ bool init);
+};
+
+static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
+ [XFS_RTGI_BITMAP] = {
+ .name = "bitmap",
+ .metafile_type = XFS_METAFILE_RTBITMAP,
+ .sick = XFS_SICK_RG_BITMAP,
+ .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
+ (1U << XFS_DINODE_FMT_BTREE),
+ .enabled = xfs_has_nonzoned,
+ .create = xfs_rtbitmap_create,
+ },
+ [XFS_RTGI_SUMMARY] = {
+ .name = "summary",
+ .metafile_type = XFS_METAFILE_RTSUMMARY,
+ .sick = XFS_SICK_RG_SUMMARY,
+ .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
+ (1U << XFS_DINODE_FMT_BTREE),
+ .enabled = xfs_has_nonzoned,
+ .create = xfs_rtsummary_create,
+ },
+ [XFS_RTGI_RMAP] = {
+ .name = "rmap",
+ .metafile_type = XFS_METAFILE_RTRMAP,
+ .sick = XFS_SICK_RG_RMAPBT,
+ .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE,
+ /*
+ * growfs must create the rtrmap inodes before adding a
+ * realtime volume to the filesystem, so we cannot use the
+ * rtrmapbt predicate here.
+ */
+ .enabled = xfs_has_rmapbt,
+ .create = xfs_rtrmapbt_create,
+ },
+ [XFS_RTGI_REFCOUNT] = {
+ .name = "refcount",
+ .metafile_type = XFS_METAFILE_RTREFCOUNT,
+ .sick = XFS_SICK_RG_REFCNTBT,
+ .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE,
+ /* same comment about growfs and rmap inodes applies here */
+ .enabled = xfs_has_reflink,
+ .create = xfs_rtrefcountbt_create,
+ },
+};
+
+/* Return the shortname of this rtgroup inode. */
+const char *
+xfs_rtginode_name(
+ enum xfs_rtg_inodes type)
+{
+ return xfs_rtginode_ops[type].name;
+}
+
+/* Return the metafile type of this rtgroup inode. */
+enum xfs_metafile_type
+xfs_rtginode_metafile_type(
+ enum xfs_rtg_inodes type)
+{
+ return xfs_rtginode_ops[type].metafile_type;
+}
+
+/* Should this rtgroup inode be present? */
+bool
+xfs_rtginode_enabled(
+ struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type)
+{
+ const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type];
+
+ if (!ops->enabled)
+ return true;
+ return ops->enabled(rtg_mount(rtg));
+}
+
+/* Mark an rtgroup inode sick */
+void
+xfs_rtginode_mark_sick(
+ struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type)
+{
+ const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type];
+
+ xfs_group_mark_sick(rtg_group(rtg), ops->sick);
+}
+
+/* Load and existing rtgroup inode into the rtgroup structure. */
+int
+xfs_rtginode_load(
+ struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type,
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *ip;
+ const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type];
+ int error;
+
+ if (!xfs_rtginode_enabled(rtg, type))
+ return 0;
+
+ if (!xfs_has_rtgroups(mp)) {
+ xfs_ino_t ino;
+
+ switch (type) {
+ case XFS_RTGI_BITMAP:
+ ino = mp->m_sb.sb_rbmino;
+ break;
+ case XFS_RTGI_SUMMARY:
+ ino = mp->m_sb.sb_rsumino;
+ break;
+ default:
+ /* None of the other types exist on !rtgroups */
+ return 0;
+ }
+
+ error = xfs_trans_metafile_iget(tp, ino, ops->metafile_type,
+ &ip);
+ } else {
+ const char *path;
+
+ if (!mp->m_rtdirip) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ path = xfs_rtginode_path(rtg_rgno(rtg), type);
+ if (!path)
+ return -ENOMEM;
+ error = xfs_metadir_load(tp, mp->m_rtdirip, path,
+ ops->metafile_type, &ip);
+ kfree(path);
+ }
+
+ if (error) {
+ if (xfs_metadata_is_sick(error))
+ xfs_rtginode_mark_sick(rtg, type);
+ return error;
+ }
+
+ if (XFS_IS_CORRUPT(mp, !((1U << ip->i_df.if_format) & ops->fmt_mask))) {
+ xfs_irele(ip);
+ xfs_rtginode_mark_sick(rtg, type);
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_IS_CORRUPT(mp, ip->i_projid != rtg_rgno(rtg))) {
+ xfs_irele(ip);
+ xfs_rtginode_mark_sick(rtg, type);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_rtginode_lockdep_setup(ip, rtg_rgno(rtg), type);
+ rtg->rtg_inodes[type] = ip;
+ return 0;
+}
+
+/* Release an rtgroup metadata inode. */
+void
+xfs_rtginode_irele(
+ struct xfs_inode **ipp)
+{
+ if (*ipp)
+ xfs_irele(*ipp);
+ *ipp = NULL;
+}
+
+/* Add a metadata inode for a realtime rmap btree. */
+int
+xfs_rtginode_create(
+ struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type,
+ bool init)
+{
+ const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type];
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_metadir_update upd = {
+ .dp = mp->m_rtdirip,
+ .metafile_type = ops->metafile_type,
+ };
+ int error;
+
+ if (!xfs_rtginode_enabled(rtg, type))
+ return 0;
+
+ if (!mp->m_rtdirip) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ upd.path = xfs_rtginode_path(rtg_rgno(rtg), type);
+ if (!upd.path)
+ return -ENOMEM;
+
+ error = xfs_metadir_start_create(&upd);
+ if (error)
+ goto out_path;
+
+ error = xfs_metadir_create(&upd, S_IFREG);
+ if (error)
+ goto out_cancel;
+
+ xfs_rtginode_lockdep_setup(upd.ip, rtg_rgno(rtg), type);
+
+ upd.ip->i_projid = rtg_rgno(rtg);
+ error = ops->create(rtg, upd.ip, upd.tp, init);
+ if (error)
+ goto out_cancel;
+
+ error = xfs_metadir_commit(&upd);
+ if (error)
+ goto out_path;
+
+ kfree(upd.path);
+ xfs_finish_inode_setup(upd.ip);
+ rtg->rtg_inodes[type] = upd.ip;
+ return 0;
+
+out_cancel:
+ xfs_metadir_cancel(&upd, error);
+ /* Have to finish setting up the inode to ensure it's deleted. */
+ if (upd.ip) {
+ xfs_finish_inode_setup(upd.ip);
+ xfs_irele(upd.ip);
+ }
+out_path:
+ kfree(upd.path);
+ return error;
+}
+
+/* Create the parent directory for all rtgroup inodes and load it. */
+int
+xfs_rtginode_mkdir_parent(
+ struct xfs_mount *mp)
+{
+ if (!mp->m_metadirip) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ return xfs_metadir_mkdir(mp->m_metadirip, "rtgroups", &mp->m_rtdirip);
+}
+
+/* Load the parent directory of all rtgroup inodes. */
+int
+xfs_rtginode_load_parent(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (!mp->m_metadirip) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ return xfs_metadir_load(tp, mp->m_metadirip, "rtgroups",
+ XFS_METAFILE_DIR, &mp->m_rtdirip);
+}
+
+/* Check superblock fields for a read or a write. */
+static xfs_failaddr_t
+xfs_rtsb_verify_common(
+ struct xfs_buf *bp)
+{
+ struct xfs_rtsb *rsb = bp->b_addr;
+
+ if (!xfs_verify_magic(bp, rsb->rsb_magicnum))
+ return __this_address;
+ if (rsb->rsb_pad)
+ return __this_address;
+
+ /* Everything to the end of the fs block must be zero */
+ if (memchr_inv(rsb + 1, 0, BBTOB(bp->b_length) - sizeof(*rsb)))
+ return __this_address;
+
+ return NULL;
+}
+
+/* Check superblock fields for a read or revalidation. */
+static inline xfs_failaddr_t
+xfs_rtsb_verify_all(
+ struct xfs_buf *bp)
+{
+ struct xfs_rtsb *rsb = bp->b_addr;
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ fa = xfs_rtsb_verify_common(bp);
+ if (fa)
+ return fa;
+
+ if (memcmp(&rsb->rsb_fname, &mp->m_sb.sb_fname, XFSLABEL_MAX))
+ return __this_address;
+ if (!uuid_equal(&rsb->rsb_uuid, &mp->m_sb.sb_uuid))
+ return __this_address;
+ if (!uuid_equal(&rsb->rsb_meta_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+
+ return NULL;
+}
+
+static void
+xfs_rtsb_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_buf_verify_cksum(bp, XFS_RTSB_CRC_OFF)) {
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ return;
+ }
+
+ fa = xfs_rtsb_verify_all(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+static void
+xfs_rtsb_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_rtsb_verify_common(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ xfs_buf_update_cksum(bp, XFS_RTSB_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_rtsb_buf_ops = {
+ .name = "xfs_rtsb",
+ .magic = { 0, cpu_to_be32(XFS_RTSB_MAGIC) },
+ .verify_read = xfs_rtsb_read_verify,
+ .verify_write = xfs_rtsb_write_verify,
+ .verify_struct = xfs_rtsb_verify_all,
+};
+
+/* Update a realtime superblock from the primary fs super */
+void
+xfs_update_rtsb(
+ struct xfs_buf *rtsb_bp,
+ const struct xfs_buf *sb_bp)
+{
+ const struct xfs_dsb *dsb = sb_bp->b_addr;
+ struct xfs_rtsb *rsb = rtsb_bp->b_addr;
+ const uuid_t *meta_uuid;
+
+ rsb->rsb_magicnum = cpu_to_be32(XFS_RTSB_MAGIC);
+
+ rsb->rsb_pad = 0;
+ memcpy(&rsb->rsb_fname, &dsb->sb_fname, XFSLABEL_MAX);
+
+ memcpy(&rsb->rsb_uuid, &dsb->sb_uuid, sizeof(rsb->rsb_uuid));
+
+ /*
+ * The metadata uuid is the fs uuid if the metauuid feature is not
+ * enabled.
+ */
+ if (dsb->sb_features_incompat &
+ cpu_to_be32(XFS_SB_FEAT_INCOMPAT_META_UUID))
+ meta_uuid = &dsb->sb_meta_uuid;
+ else
+ meta_uuid = &dsb->sb_uuid;
+ memcpy(&rsb->rsb_meta_uuid, meta_uuid, sizeof(rsb->rsb_meta_uuid));
+}
+
+/*
+ * Update the realtime superblock from a filesystem superblock and log it to
+ * the given transaction.
+ */
+struct xfs_buf *
+xfs_log_rtsb(
+ struct xfs_trans *tp,
+ const struct xfs_buf *sb_bp)
+{
+ struct xfs_buf *rtsb_bp;
+
+ if (!xfs_has_rtsb(tp->t_mountp))
+ return NULL;
+
+ rtsb_bp = xfs_trans_getrtsb(tp);
+ if (!rtsb_bp) {
+ /*
+ * It's possible for the rtgroups feature to be enabled but
+ * there is no incore rt superblock buffer if the rt geometry
+ * was specified at mkfs time but the rt section has not yet
+ * been attached. In this case, rblocks must be zero.
+ */
+ ASSERT(tp->t_mountp->m_sb.sb_rblocks == 0);
+ return NULL;
+ }
+
+ xfs_update_rtsb(rtsb_bp, sb_bp);
+ xfs_trans_ordered_buf(tp, rtsb_bp);
+ return rtsb_bp;
+}
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
new file mode 100644
index 000000000000..d36a6ae0abe5
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -0,0 +1,368 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __LIBXFS_RTGROUP_H
+#define __LIBXFS_RTGROUP_H 1
+
+#include "xfs_group.h"
+
+struct xfs_mount;
+struct xfs_trans;
+
+enum xfs_rtg_inodes {
+ XFS_RTGI_BITMAP, /* allocation bitmap */
+ XFS_RTGI_SUMMARY, /* allocation summary */
+ XFS_RTGI_RMAP, /* rmap btree inode */
+ XFS_RTGI_REFCOUNT, /* refcount btree inode */
+
+ XFS_RTGI_MAX,
+};
+
+#ifdef MAX_LOCKDEP_SUBCLASSES
+static_assert(XFS_RTGI_MAX <= MAX_LOCKDEP_SUBCLASSES);
+#endif
+
+/*
+ * Realtime group incore structure, similar to the per-AG structure.
+ */
+struct xfs_rtgroup {
+ struct xfs_group rtg_group;
+
+ /* per-rtgroup metadata inodes */
+ struct xfs_inode *rtg_inodes[XFS_RTGI_MAX];
+
+ /* Number of blocks in this group */
+ xfs_rtxnum_t rtg_extents;
+
+ /*
+ * For bitmap based RT devices this points to a cache of rt summary
+ * level per bitmap block with the invariant that rtg_rsum_cache[bbno]
+ * > the maximum i for which rsum[i][bbno] != 0, or 0 if
+ * rsum[i][bbno] == 0 for all i.
+ * Reads and writes are serialized by the rsumip inode lock.
+ *
+ * For zoned RT devices this points to the open zone structure for
+ * a group that is open for writers, or is NULL.
+ */
+ union {
+ uint8_t *rtg_rsum_cache;
+ struct xfs_open_zone *rtg_open_zone;
+ };
+};
+
+/*
+ * For zoned RT devices this is set on groups that have no written blocks
+ * and can be picked by the allocator for opening.
+ */
+#define XFS_RTG_FREE XA_MARK_0
+
+/*
+ * For zoned RT devices this is set on groups that are fully written and that
+ * have unused blocks. Used by the garbage collection to pick targets.
+ */
+#define XFS_RTG_RECLAIMABLE XA_MARK_1
+
+static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
+{
+ return container_of(xg, struct xfs_rtgroup, rtg_group);
+}
+
+static inline struct xfs_group *rtg_group(struct xfs_rtgroup *rtg)
+{
+ return &rtg->rtg_group;
+}
+
+static inline struct xfs_mount *rtg_mount(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_group.xg_mount;
+}
+
+static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_group.xg_gno;
+}
+
+static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_group.xg_block_count;
+}
+
+static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_inodes[XFS_RTGI_BITMAP];
+}
+
+static inline struct xfs_inode *rtg_summary(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_inodes[XFS_RTGI_SUMMARY];
+}
+
+static inline struct xfs_inode *rtg_rmap(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_inodes[XFS_RTGI_RMAP];
+}
+
+static inline struct xfs_inode *rtg_refcount(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_inodes[XFS_RTGI_REFCOUNT];
+}
+
+/* Passive rtgroup references */
+static inline struct xfs_rtgroup *
+xfs_rtgroup_get(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno)
+{
+ return to_rtg(xfs_group_get(mp, rgno, XG_TYPE_RTG));
+}
+
+static inline struct xfs_rtgroup *
+xfs_rtgroup_hold(
+ struct xfs_rtgroup *rtg)
+{
+ return to_rtg(xfs_group_hold(rtg_group(rtg)));
+}
+
+static inline void
+xfs_rtgroup_put(
+ struct xfs_rtgroup *rtg)
+{
+ xfs_group_put(rtg_group(rtg));
+}
+
+/* Active rtgroup references */
+static inline struct xfs_rtgroup *
+xfs_rtgroup_grab(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno)
+{
+ return to_rtg(xfs_group_grab(mp, rgno, XG_TYPE_RTG));
+}
+
+static inline void
+xfs_rtgroup_rele(
+ struct xfs_rtgroup *rtg)
+{
+ xfs_group_rele(rtg_group(rtg));
+}
+
+static inline struct xfs_rtgroup *
+xfs_rtgroup_next_range(
+ struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
+ xfs_rgnumber_t start_rgno,
+ xfs_rgnumber_t end_rgno)
+{
+ return to_rtg(xfs_group_next_range(mp, rtg ? rtg_group(rtg) : NULL,
+ start_rgno, end_rgno, XG_TYPE_RTG));
+}
+
+static inline struct xfs_rtgroup *
+xfs_rtgroup_next(
+ struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg)
+{
+ return xfs_rtgroup_next_range(mp, rtg, 0, mp->m_sb.sb_rgcount - 1);
+}
+
+static inline bool
+xfs_verify_rgbno(
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t rgbno)
+{
+ ASSERT(xfs_has_rtgroups(rtg_mount(rtg)));
+
+ return xfs_verify_gbno(rtg_group(rtg), rgbno);
+}
+
+/*
+ * Check that [@rgbno,@len] is a valid extent range in @rtg.
+ *
+ * Must only be used for RTG-enabled file systems.
+ */
+static inline bool
+xfs_verify_rgbext(
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t rgbno,
+ xfs_extlen_t len)
+{
+ ASSERT(xfs_has_rtgroups(rtg_mount(rtg)));
+
+ return xfs_verify_gbext(rtg_group(rtg), rgbno, len);
+}
+
+static inline xfs_rtblock_t
+xfs_rgbno_to_rtb(
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t rgbno)
+{
+ return xfs_gbno_to_fsb(rtg_group(rtg), rgbno);
+}
+
+static inline xfs_rgnumber_t
+xfs_rtb_to_rgno(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return xfs_fsb_to_gno(mp, rtbno, XG_TYPE_RTG);
+}
+
+static inline xfs_rgblock_t
+xfs_rtb_to_rgbno(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return xfs_fsb_to_gbno(mp, rtbno, XG_TYPE_RTG);
+}
+
+/* Is rtbno the start of a RT group? */
+static inline bool
+xfs_rtbno_is_group_start(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return (rtbno & mp->m_groups[XG_TYPE_RTG].blkmask) == 0;
+}
+
+/* Convert an rtgroups rt extent number into an rgbno. */
+static inline xfs_rgblock_t
+xfs_rtx_to_rgbno(
+ struct xfs_rtgroup *rtg,
+ xfs_rtxnum_t rtx)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (likely(mp->m_rtxblklog >= 0))
+ return rtx << mp->m_rtxblklog;
+ return rtx * mp->m_sb.sb_rextsize;
+}
+
+static inline xfs_daddr_t
+xfs_rtb_to_daddr(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+
+ if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
+ xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
+
+ rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask);
+ }
+
+ return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno);
+}
+
+static inline xfs_rtblock_t
+xfs_daddr_to_rtb(
+ struct xfs_mount *mp,
+ xfs_daddr_t daddr)
+{
+ struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+ xfs_rfsblock_t bno;
+
+ bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb;
+ if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
+ xfs_rgnumber_t rgno;
+ uint32_t rgbno;
+
+ rgno = div_u64_rem(bno, g->blocks, &rgbno);
+ return ((xfs_rtblock_t)rgno << g->blklog) + rgbno;
+ }
+
+ return bno;
+}
+
+#ifdef CONFIG_XFS_RT
+int xfs_rtgroup_alloc(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+ xfs_rgnumber_t rgcount, xfs_rtbxlen_t rextents);
+void xfs_rtgroup_free(struct xfs_mount *mp, xfs_rgnumber_t rgno);
+
+void xfs_free_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t first_rgno,
+ xfs_rgnumber_t end_rgno);
+int xfs_initialize_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t first_rgno,
+ xfs_rgnumber_t end_rgno, xfs_rtbxlen_t rextents);
+
+xfs_rtxnum_t __xfs_rtgroup_extents(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+ xfs_rgnumber_t rgcount, xfs_rtbxlen_t rextents);
+xfs_rtxnum_t xfs_rtgroup_extents(struct xfs_mount *mp, xfs_rgnumber_t rgno);
+void xfs_rtgroup_calc_geometry(struct xfs_mount *mp, struct xfs_rtgroup *rtg,
+ xfs_rgnumber_t rgno, xfs_rgnumber_t rgcount,
+ xfs_rtbxlen_t rextents);
+
+int xfs_update_last_rtgroup_size(struct xfs_mount *mp,
+ xfs_rgnumber_t prev_rgcount);
+
+/* Lock the rt bitmap inode in exclusive mode */
+#define XFS_RTGLOCK_BITMAP (1U << 0)
+/* Lock the rt bitmap inode in shared mode */
+#define XFS_RTGLOCK_BITMAP_SHARED (1U << 1)
+/* Lock the rt rmap inode in exclusive mode */
+#define XFS_RTGLOCK_RMAP (1U << 2)
+/* Lock the rt refcount inode in exclusive mode */
+#define XFS_RTGLOCK_REFCOUNT (1U << 3)
+
+#define XFS_RTGLOCK_ALL_FLAGS (XFS_RTGLOCK_BITMAP | \
+ XFS_RTGLOCK_BITMAP_SHARED | \
+ XFS_RTGLOCK_RMAP | \
+ XFS_RTGLOCK_REFCOUNT)
+
+void xfs_rtgroup_lock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
+void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
+void xfs_rtgroup_trans_join(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+ unsigned int rtglock_flags);
+
+int xfs_rtgroup_get_geometry(struct xfs_rtgroup *rtg,
+ struct xfs_rtgroup_geometry *rgeo);
+
+int xfs_rtginode_mkdir_parent(struct xfs_mount *mp);
+int xfs_rtginode_load_parent(struct xfs_trans *tp);
+
+const char *xfs_rtginode_name(enum xfs_rtg_inodes type);
+enum xfs_metafile_type xfs_rtginode_metafile_type(enum xfs_rtg_inodes type);
+bool xfs_rtginode_enabled(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type);
+void xfs_rtginode_mark_sick(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type);
+int xfs_rtginode_load(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type,
+ struct xfs_trans *tp);
+int xfs_rtginode_create(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type,
+ bool init);
+void xfs_rtginode_irele(struct xfs_inode **ipp);
+
+void xfs_rtginode_irele(struct xfs_inode **ipp);
+
+static inline const char *xfs_rtginode_path(xfs_rgnumber_t rgno,
+ enum xfs_rtg_inodes type)
+{
+ return kasprintf(GFP_KERNEL, "%u.%s", rgno, xfs_rtginode_name(type));
+}
+
+void xfs_update_rtsb(struct xfs_buf *rtsb_bp,
+ const struct xfs_buf *sb_bp);
+struct xfs_buf *xfs_log_rtsb(struct xfs_trans *tp,
+ const struct xfs_buf *sb_bp);
+#else
+static inline void xfs_free_rtgroups(struct xfs_mount *mp,
+ xfs_rgnumber_t first_rgno, xfs_rgnumber_t end_rgno)
+{
+}
+
+static inline int xfs_initialize_rtgroups(struct xfs_mount *mp,
+ xfs_rgnumber_t first_rgno, xfs_rgnumber_t end_rgno,
+ xfs_rtbxlen_t rextents)
+{
+ return 0;
+}
+
+# define xfs_rtgroup_extents(mp, rgno) (0)
+# define xfs_update_last_rtgroup_size(mp, rgno) (0)
+# define xfs_rtgroup_lock(rtg, gf) ((void)0)
+# define xfs_rtgroup_unlock(rtg, gf) ((void)0)
+# define xfs_rtgroup_trans_join(tp, rtg, gf) ((void)0)
+# define xfs_update_rtsb(bp, sb_bp) ((void)0)
+# define xfs_log_rtsb(tp, sb_bp) (NULL)
+# define xfs_rtgroup_get_geometry(rtg, rgeo) (-EOPNOTSUPP)
+#endif /* CONFIG_XFS_RT */
+
+#endif /* __LIBXFS_RTGROUP_H */
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
new file mode 100644
index 000000000000..3db5e7a4a945
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
@@ -0,0 +1,757 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_metafile.h"
+#include "xfs_health.h"
+
+static struct kmem_cache *xfs_rtrefcountbt_cur_cache;
+
+/*
+ * Realtime Reference Count btree.
+ *
+ * This is a btree used to track the owner(s) of a given extent in the realtime
+ * device. See the comments in xfs_refcount_btree.c for more information.
+ *
+ * This tree is basically the same as the regular refcount btree except that
+ * it's rooted in an inode.
+ */
+
+static struct xfs_btree_cur *
+xfs_rtrefcountbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_rtrefcountbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group));
+}
+
+STATIC int
+xfs_rtrefcountbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+
+ return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+ level == 0) / 2;
+ }
+
+ return cur->bc_mp->m_rtrefc_mnr[level != 0];
+}
+
+STATIC int
+xfs_rtrefcountbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+
+ return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+ level == 0);
+ }
+
+ return cur->bc_mp->m_rtrefc_mxr[level != 0];
+}
+
+/*
+ * Calculate number of records in a realtime refcount btree inode root.
+ */
+unsigned int
+xfs_rtrefcountbt_droot_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ blocklen -= sizeof(struct xfs_rtrefcount_root);
+
+ if (leaf)
+ return blocklen / sizeof(struct xfs_refcount_rec);
+ return blocklen / (2 * sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_rtrefcount_ptr_t));
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_rtrefcountbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork so that
+ * we can resize the in-memory buffer to match it. After a resize to the
+ * maximum size this function returns the same value as
+ * xfs_rtrefcountbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_rtrefcountbt_get_dmaxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level != cur->bc_nlevels - 1)
+ return cur->bc_mp->m_rtrefc_mxr[level != 0];
+ return xfs_rtrefcountbt_droot_maxrecs(cur->bc_ino.forksize, level == 0);
+}
+
+STATIC void
+xfs_rtrefcountbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->refc.rc_startblock = rec->refc.rc_startblock;
+}
+
+STATIC void
+xfs_rtrefcountbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->refc.rc_startblock);
+ x += be32_to_cpu(rec->refc.rc_blockcount) - 1;
+ key->refc.rc_startblock = cpu_to_be32(x);
+}
+
+STATIC void
+xfs_rtrefcountbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ rec->refc.rc_startblock = cpu_to_be32(start);
+ rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount);
+ rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount);
+}
+
+STATIC void
+xfs_rtrefcountbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ ptr->l = 0;
+}
+
+STATIC int64_t
+xfs_rtrefcountbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ const struct xfs_refcount_key *kp = &key->refc;
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
+}
+
+STATIC int64_t
+xfs_rtrefcountbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
+{
+ ASSERT(!mask || mask->refc.rc_startblock);
+
+ return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
+ be32_to_cpu(k2->refc.rc_startblock);
+}
+
+static xfs_failaddr_t
+xfs_rtrefcountbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ int level;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (!xfs_has_reflink(mp))
+ return __this_address;
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+ level = be16_to_cpu(block->bb_level);
+ if (level > mp->m_rtrefc_maxlevels)
+ return __this_address;
+
+ return xfs_btree_fsblock_verify(bp, mp->m_rtrefc_mxr[level != 0]);
+}
+
+static void
+xfs_rtrefcountbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_fsblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_rtrefcountbt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_rtrefcountbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_rtrefcountbt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_fsblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = {
+ .name = "xfs_rtrefcountbt",
+ .magic = { 0, cpu_to_be32(XFS_RTREFC_CRC_MAGIC) },
+ .verify_read = xfs_rtrefcountbt_read_verify,
+ .verify_write = xfs_rtrefcountbt_write_verify,
+ .verify_struct = xfs_rtrefcountbt_verify,
+};
+
+STATIC int
+xfs_rtrefcountbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->refc.rc_startblock) <
+ be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC int
+xfs_rtrefcountbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->refc.rc_startblock) +
+ be32_to_cpu(r1->refc.rc_blockcount) <=
+ be32_to_cpu(r2->refc.rc_startblock);
+}
+
+STATIC enum xbtree_key_contig
+xfs_rtrefcountbt_keys_contiguous(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask)
+{
+ ASSERT(!mask || mask->refc.rc_startblock);
+
+ return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock),
+ be32_to_cpu(key2->refc.rc_startblock));
+}
+
+static inline void
+xfs_rtrefcountbt_move_ptrs(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *broot,
+ short old_size,
+ size_t new_size,
+ unsigned int numrecs)
+{
+ void *dptr;
+ void *sptr;
+
+ sptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, old_size);
+ dptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, new_size);
+ memmove(dptr, sptr, numrecs * sizeof(xfs_rtrefcount_ptr_t));
+}
+
+static struct xfs_btree_block *
+xfs_rtrefcountbt_broot_realloc(
+ struct xfs_btree_cur *cur,
+ unsigned int new_numrecs)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+ struct xfs_btree_block *broot;
+ unsigned int new_size;
+ unsigned int old_size = ifp->if_broot_bytes;
+ const unsigned int level = cur->bc_nlevels - 1;
+
+ new_size = xfs_rtrefcount_broot_space_calc(mp, level, new_numrecs);
+
+ /* Handle the nop case quietly. */
+ if (new_size == old_size)
+ return ifp->if_broot;
+
+ if (new_size > old_size) {
+ unsigned int old_numrecs;
+
+ /*
+ * If there wasn't any memory allocated before, just allocate
+ * it now and get out.
+ */
+ if (old_size == 0)
+ return xfs_broot_realloc(ifp, new_size);
+
+ /*
+ * If there is already an existing if_broot, then we need to
+ * realloc it and possibly move the node block pointers because
+ * those are not butted up against the btree block header.
+ */
+ old_numrecs = xfs_rtrefcountbt_maxrecs(mp, old_size, level);
+ broot = xfs_broot_realloc(ifp, new_size);
+ if (level > 0)
+ xfs_rtrefcountbt_move_ptrs(mp, broot, old_size,
+ new_size, old_numrecs);
+ goto out_broot;
+ }
+
+ /*
+ * We're reducing numrecs. If we're going all the way to zero, just
+ * free the block.
+ */
+ ASSERT(ifp->if_broot != NULL && old_size > 0);
+ if (new_size == 0)
+ return xfs_broot_realloc(ifp, 0);
+
+ /*
+ * Shrink the btree root by possibly moving the rtrmapbt pointers,
+ * since they are not butted up against the btree block header. Then
+ * reallocate broot.
+ */
+ if (level > 0)
+ xfs_rtrefcountbt_move_ptrs(mp, ifp->if_broot, old_size,
+ new_size, new_numrecs);
+ broot = xfs_broot_realloc(ifp, new_size);
+
+out_broot:
+ ASSERT(xfs_rtrefcount_droot_space(broot) <=
+ xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork));
+ return broot;
+}
+
+const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
+ .name = "rtrefcount",
+ .type = XFS_BTREE_TYPE_INODE,
+ .geom_flags = XFS_BTGEO_IROOT_RECORDS,
+
+ .rec_len = sizeof(struct xfs_refcount_rec),
+ .key_len = sizeof(struct xfs_refcount_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = XFS_REFC_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rtrefcbt_2),
+ .sick_mask = XFS_SICK_RG_REFCNTBT,
+
+ .dup_cursor = xfs_rtrefcountbt_dup_cursor,
+ .alloc_block = xfs_btree_alloc_metafile_block,
+ .free_block = xfs_btree_free_metafile_block,
+ .get_minrecs = xfs_rtrefcountbt_get_minrecs,
+ .get_maxrecs = xfs_rtrefcountbt_get_maxrecs,
+ .get_dmaxrecs = xfs_rtrefcountbt_get_dmaxrecs,
+ .init_key_from_rec = xfs_rtrefcountbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur,
+ .key_diff = xfs_rtrefcountbt_key_diff,
+ .buf_ops = &xfs_rtrefcountbt_buf_ops,
+ .diff_two_keys = xfs_rtrefcountbt_diff_two_keys,
+ .keys_inorder = xfs_rtrefcountbt_keys_inorder,
+ .recs_inorder = xfs_rtrefcountbt_recs_inorder,
+ .keys_contiguous = xfs_rtrefcountbt_keys_contiguous,
+ .broot_realloc = xfs_rtrefcountbt_broot_realloc,
+};
+
+/* Allocate a new rt refcount btree cursor. */
+struct xfs_btree_cur *
+xfs_rtrefcountbt_init_cursor(
+ struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_inode *ip = rtg_refcount(rtg);
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_btree_cur *cur;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrefcountbt_ops,
+ mp->m_rtrefc_maxlevels, xfs_rtrefcountbt_cur_cache);
+
+ cur->bc_ino.ip = ip;
+ cur->bc_refc.nr_ops = 0;
+ cur->bc_refc.shape_changes = 0;
+ cur->bc_group = xfs_group_hold(rtg_group(rtg));
+ cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1;
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);
+ cur->bc_ino.whichfork = XFS_DATA_FORK;
+ return cur;
+}
+
+/*
+ * Install a new rt reverse mapping btree root. Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rtrefcountbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp)
+{
+ struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake;
+ struct xfs_ifork *ifp;
+ int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE);
+
+ /*
+ * Free any resources hanging off the real fork, then shallow-copy the
+ * staging fork's contents into the real fork to transfer everything
+ * we just built.
+ */
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(ifp);
+ memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
+
+ cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno;
+ xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
+ xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK);
+}
+
+/* Calculate number of records in a realtime refcount btree block. */
+static inline unsigned int
+xfs_rtrefcountbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+
+ if (leaf)
+ return blocklen / sizeof(struct xfs_refcount_rec);
+ return blocklen / (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_rtrefcount_ptr_t));
+}
+
+/*
+ * Calculate number of records in an refcount btree block.
+ */
+unsigned int
+xfs_rtrefcountbt_maxrecs(
+ struct xfs_mount *mp,
+ unsigned int blocklen,
+ bool leaf)
+{
+ blocklen -= XFS_RTREFCOUNT_BLOCK_LEN;
+ return xfs_rtrefcountbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for realtime refcount btrees. */
+unsigned int
+xfs_rtrefcountbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_rtrefcountbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_rtrefcountbt_block_maxrecs(blocklen, false) / 2;
+
+ /* We need at most one record for every block in an rt group. */
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS);
+}
+
+int __init
+xfs_rtrefcountbt_init_cur_cache(void)
+{
+ xfs_rtrefcountbt_cur_cache = kmem_cache_create("xfs_rtrefcountbt_cur",
+ xfs_btree_cur_sizeof(
+ xfs_rtrefcountbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_rtrefcountbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_rtrefcountbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_rtrefcountbt_cur_cache);
+ xfs_rtrefcountbt_cur_cache = NULL;
+}
+
+/* Compute the maximum height of a realtime refcount btree. */
+void
+xfs_rtrefcountbt_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ unsigned int d_maxlevels, r_maxlevels;
+
+ if (!xfs_has_rtreflink(mp)) {
+ mp->m_rtrefc_maxlevels = 0;
+ return;
+ }
+
+ /*
+ * The realtime refcountbt lives on the data device, which means that
+ * its maximum height is constrained by the size of the data device and
+ * the height required to store one refcount record for each rtextent
+ * in an rt group.
+ */
+ d_maxlevels = xfs_btree_space_to_height(mp->m_rtrefc_mnr,
+ mp->m_sb.sb_dblocks);
+ r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrefc_mnr,
+ mp->m_sb.sb_rgextents);
+
+ /* Add one level to handle the inode root level. */
+ mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
+}
+
+/* Calculate the rtrefcount btree size for some records. */
+unsigned long long
+xfs_rtrefcountbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_rtrefc_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size.
+ */
+static unsigned long long
+xfs_rtrefcountbt_max_size(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtblocks)
+{
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (mp->m_rtrefc_mxr[0] == 0)
+ return 0;
+
+ return xfs_rtrefcountbt_calc_size(mp, rtblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ * We need enough space to hold one record for every rt extent in the rtgroup.
+ */
+xfs_filblks_t
+xfs_rtrefcountbt_calc_reserves(
+ struct xfs_mount *mp)
+{
+ if (!xfs_has_rtreflink(mp))
+ return 0;
+
+ return xfs_rtrefcountbt_max_size(mp, mp->m_sb.sb_rgextents);
+}
+
+/*
+ * Convert on-disk form of btree root to in-memory form.
+ */
+STATIC void
+xfs_rtrefcountbt_from_disk(
+ struct xfs_inode *ip,
+ struct xfs_rtrefcount_root *dblock,
+ int dblocklen,
+ struct xfs_btree_block *rblock)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_refcount_key *fkp;
+ __be64 *fpp;
+ struct xfs_refcount_key *tkp;
+ __be64 *tpp;
+ struct xfs_refcount_rec *frp;
+ struct xfs_refcount_rec *trp;
+ unsigned int numrecs;
+ unsigned int maxrecs;
+ unsigned int rblocklen;
+
+ rblocklen = xfs_rtrefcount_broot_space(mp, dblock);
+
+ xfs_btree_init_block(mp, rblock, &xfs_rtrefcountbt_ops, 0, 0,
+ ip->i_ino);
+
+ rblock->bb_level = dblock->bb_level;
+ rblock->bb_numrecs = dblock->bb_numrecs;
+
+ if (be16_to_cpu(rblock->bb_level) > 0) {
+ maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false);
+ fkp = xfs_rtrefcount_droot_key_addr(dblock, 1);
+ tkp = xfs_rtrefcount_key_addr(rblock, 1);
+ fpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs);
+ tpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen);
+ numrecs = be16_to_cpu(dblock->bb_numrecs);
+ memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+ memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+ } else {
+ frp = xfs_rtrefcount_droot_rec_addr(dblock, 1);
+ trp = xfs_rtrefcount_rec_addr(rblock, 1);
+ numrecs = be16_to_cpu(dblock->bb_numrecs);
+ memcpy(trp, frp, sizeof(*frp) * numrecs);
+ }
+}
+
+/* Load a realtime reference count btree root in from disk. */
+int
+xfs_iformat_rtrefcount(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ struct xfs_btree_block *broot;
+ unsigned int numrecs;
+ unsigned int level;
+ int dsize;
+
+ /*
+ * growfs must create the rtrefcount inodes before adding a realtime
+ * volume to the filesystem, so we cannot use the rtrefcount predicate
+ * here.
+ */
+ if (!xfs_has_reflink(ip->i_mount)) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
+ return -EFSCORRUPTED;
+ }
+
+ dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK);
+ numrecs = be16_to_cpu(dfp->bb_numrecs);
+ level = be16_to_cpu(dfp->bb_level);
+
+ if (level > mp->m_rtrefc_maxlevels ||
+ xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
+ return -EFSCORRUPTED;
+ }
+
+ broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK),
+ xfs_rtrefcount_broot_space_calc(mp, level, numrecs));
+ if (broot)
+ xfs_rtrefcountbt_from_disk(ip, dfp, dsize, broot);
+ return 0;
+}
+
+/*
+ * Convert in-memory form of btree root to on-disk form.
+ */
+void
+xfs_rtrefcountbt_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *rblock,
+ int rblocklen,
+ struct xfs_rtrefcount_root *dblock,
+ int dblocklen)
+{
+ struct xfs_refcount_key *fkp;
+ __be64 *fpp;
+ struct xfs_refcount_key *tkp;
+ __be64 *tpp;
+ struct xfs_refcount_rec *frp;
+ struct xfs_refcount_rec *trp;
+ unsigned int maxrecs;
+ unsigned int numrecs;
+
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTREFC_CRC_MAGIC));
+ ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid));
+ ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL));
+ ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+ ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
+
+ dblock->bb_level = rblock->bb_level;
+ dblock->bb_numrecs = rblock->bb_numrecs;
+
+ if (be16_to_cpu(rblock->bb_level) > 0) {
+ maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false);
+ fkp = xfs_rtrefcount_key_addr(rblock, 1);
+ tkp = xfs_rtrefcount_droot_key_addr(dblock, 1);
+ fpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen);
+ tpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs);
+ numrecs = be16_to_cpu(rblock->bb_numrecs);
+ memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+ memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+ } else {
+ frp = xfs_rtrefcount_rec_addr(rblock, 1);
+ trp = xfs_rtrefcount_droot_rec_addr(dblock, 1);
+ numrecs = be16_to_cpu(rblock->bb_numrecs);
+ memcpy(trp, frp, sizeof(*frp) * numrecs);
+ }
+}
+
+/* Flush a realtime reference count btree root out to disk. */
+void
+xfs_iflush_rtrefcount(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+
+ ASSERT(ifp->if_broot != NULL);
+ ASSERT(ifp->if_broot_bytes > 0);
+ ASSERT(xfs_rtrefcount_droot_space(ifp->if_broot) <=
+ xfs_inode_fork_size(ip, XFS_DATA_FORK));
+ xfs_rtrefcountbt_to_disk(ip->i_mount, ifp->if_broot,
+ ifp->if_broot_bytes, dfp,
+ XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
+}
+
+/*
+ * Create a realtime refcount btree inode.
+ */
+int
+xfs_rtrefcountbt_create(
+ struct xfs_rtgroup *rtg,
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ bool init)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_block *broot;
+
+ ifp->if_format = XFS_DINODE_FMT_META_BTREE;
+ ASSERT(ifp->if_broot_bytes == 0);
+ ASSERT(ifp->if_bytes == 0);
+
+ /* Initialize the empty incore btree root. */
+ broot = xfs_broot_realloc(ifp,
+ xfs_rtrefcount_broot_space_calc(mp, 0, 0));
+ if (broot)
+ xfs_btree_init_block(mp, broot, &xfs_rtrefcountbt_ops, 0, 0,
+ ip->i_ino);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
new file mode 100644
index 000000000000..a99b7a8aec86
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
@@ -0,0 +1,189 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_RTREFCOUNT_BTREE_H__
+#define __XFS_RTREFCOUNT_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xbtree_ifakeroot;
+struct xfs_rtgroup;
+
+/* refcounts only exist on crc enabled filesystems */
+#define XFS_RTREFCOUNT_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN
+
+struct xfs_btree_cur *xfs_rtrefcountbt_init_cursor(struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg);
+struct xfs_btree_cur *xfs_rtrefcountbt_stage_cursor(struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xbtree_ifakeroot *ifake);
+void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp);
+unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp,
+ unsigned int blocklen, bool leaf);
+void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp);
+unsigned int xfs_rtrefcountbt_droot_maxrecs(unsigned int blocklen, bool leaf);
+
+/*
+ * Addresses of records, keys, and pointers within an incore rtrefcountbt block.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+static inline struct xfs_refcount_rec *
+xfs_rtrefcount_rec_addr(
+ struct xfs_btree_block *block,
+ unsigned int index)
+{
+ return (struct xfs_refcount_rec *)
+ ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN +
+ (index - 1) * sizeof(struct xfs_refcount_rec));
+}
+
+static inline struct xfs_refcount_key *
+xfs_rtrefcount_key_addr(
+ struct xfs_btree_block *block,
+ unsigned int index)
+{
+ return (struct xfs_refcount_key *)
+ ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN +
+ (index - 1) * sizeof(struct xfs_refcount_key));
+}
+
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_ptr_addr(
+ struct xfs_btree_block *block,
+ unsigned int index,
+ unsigned int maxrecs)
+{
+ return (xfs_rtrefcount_ptr_t *)
+ ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN +
+ maxrecs * sizeof(struct xfs_refcount_key) +
+ (index - 1) * sizeof(xfs_rtrefcount_ptr_t));
+}
+
+unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void);
+int __init xfs_rtrefcountbt_init_cur_cache(void);
+void xfs_rtrefcountbt_destroy_cur_cache(void);
+
+xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp);
+unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+
+/* Addresses of key, pointers, and records within an ondisk rtrefcount block. */
+
+static inline struct xfs_refcount_rec *
+xfs_rtrefcount_droot_rec_addr(
+ struct xfs_rtrefcount_root *block,
+ unsigned int index)
+{
+ return (struct xfs_refcount_rec *)
+ ((char *)(block + 1) +
+ (index - 1) * sizeof(struct xfs_refcount_rec));
+}
+
+static inline struct xfs_refcount_key *
+xfs_rtrefcount_droot_key_addr(
+ struct xfs_rtrefcount_root *block,
+ unsigned int index)
+{
+ return (struct xfs_refcount_key *)
+ ((char *)(block + 1) +
+ (index - 1) * sizeof(struct xfs_refcount_key));
+}
+
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_droot_ptr_addr(
+ struct xfs_rtrefcount_root *block,
+ unsigned int index,
+ unsigned int maxrecs)
+{
+ return (xfs_rtrefcount_ptr_t *)
+ ((char *)(block + 1) +
+ maxrecs * sizeof(struct xfs_refcount_key) +
+ (index - 1) * sizeof(xfs_rtrefcount_ptr_t));
+}
+
+/*
+ * Address of pointers within the incore btree root.
+ *
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_broot_ptr_addr(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *bb,
+ unsigned int index,
+ unsigned int block_size)
+{
+ return xfs_rtrefcount_ptr_addr(bb, index,
+ xfs_rtrefcountbt_maxrecs(mp, block_size, false));
+}
+
+/*
+ * Compute the space required for the incore btree root containing the given
+ * number of records.
+ */
+static inline size_t
+xfs_rtrefcount_broot_space_calc(
+ struct xfs_mount *mp,
+ unsigned int level,
+ unsigned int nrecs)
+{
+ size_t sz = XFS_RTREFCOUNT_BLOCK_LEN;
+
+ if (level > 0)
+ return sz + nrecs * (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_rtrefcount_ptr_t));
+ return sz + nrecs * sizeof(struct xfs_refcount_rec);
+}
+
+/*
+ * Compute the space required for the incore btree root given the ondisk
+ * btree root block.
+ */
+static inline size_t
+xfs_rtrefcount_broot_space(struct xfs_mount *mp, struct xfs_rtrefcount_root *bb)
+{
+ return xfs_rtrefcount_broot_space_calc(mp, be16_to_cpu(bb->bb_level),
+ be16_to_cpu(bb->bb_numrecs));
+}
+
+/* Compute the space required for the ondisk root block. */
+static inline size_t
+xfs_rtrefcount_droot_space_calc(
+ unsigned int level,
+ unsigned int nrecs)
+{
+ size_t sz = sizeof(struct xfs_rtrefcount_root);
+
+ if (level > 0)
+ return sz + nrecs * (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_rtrefcount_ptr_t));
+ return sz + nrecs * sizeof(struct xfs_refcount_rec);
+}
+
+/*
+ * Compute the space required for the ondisk root block given an incore root
+ * block.
+ */
+static inline size_t
+xfs_rtrefcount_droot_space(struct xfs_btree_block *bb)
+{
+ return xfs_rtrefcount_droot_space_calc(be16_to_cpu(bb->bb_level),
+ be16_to_cpu(bb->bb_numrecs));
+}
+
+int xfs_iformat_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip);
+void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp,
+ struct xfs_btree_block *rblock, int rblocklen,
+ struct xfs_rtrefcount_root *dblock, int dblocklen);
+void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip);
+
+int xfs_rtrefcountbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xfs_trans *tp, bool init);
+
+#endif /* __XFS_RTREFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
new file mode 100644
index 000000000000..9bdc2cbfc113
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -0,0 +1,1054 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_metafile.h"
+#include "xfs_rmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_rtgroup.h"
+#include "xfs_bmap.h"
+#include "xfs_health.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+
+static struct kmem_cache *xfs_rtrmapbt_cur_cache;
+
+/*
+ * Realtime Reverse Map btree.
+ *
+ * This is a btree used to track the owner(s) of a given extent in the realtime
+ * device. See the comments in xfs_rmap_btree.c for more information.
+ *
+ * This tree is basically the same as the regular rmap btree except that it
+ * is rooted in an inode and does not live in free space.
+ */
+
+static struct xfs_btree_cur *
+xfs_rtrmapbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_rtrmapbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group));
+}
+
+STATIC int
+xfs_rtrmapbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+
+ return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+ level == 0) / 2;
+ }
+
+ return cur->bc_mp->m_rtrmap_mnr[level != 0];
+}
+
+STATIC int
+xfs_rtrmapbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+
+ return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+ level == 0);
+ }
+
+ return cur->bc_mp->m_rtrmap_mxr[level != 0];
+}
+
+/* Calculate number of records in the ondisk realtime rmap btree inode root. */
+unsigned int
+xfs_rtrmapbt_droot_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ blocklen -= sizeof(struct xfs_rtrmap_root);
+
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen / (2 * sizeof(struct xfs_rmap_key) +
+ sizeof(xfs_rtrmap_ptr_t));
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_rtrmapbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it. After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_rtrmapbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_rtrmapbt_get_dmaxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level != cur->bc_nlevels - 1)
+ return cur->bc_mp->m_rtrmap_mxr[level != 0];
+ return xfs_rtrmapbt_droot_maxrecs(cur->bc_ino.forksize, level == 0);
+}
+
+/*
+ * Convert the ondisk record's offset field into the ondisk key's offset field.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
+{
+ return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
+}
+
+STATIC void
+xfs_rtrmapbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ key->rmap.rm_startblock = rec->rmap.rm_startblock;
+ key->rmap.rm_owner = rec->rmap.rm_owner;
+ key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
+}
+
+STATIC void
+xfs_rtrmapbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ uint64_t off;
+ int adj;
+
+ adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
+
+ key->rmap.rm_startblock = rec->rmap.rm_startblock;
+ be32_add_cpu(&key->rmap.rm_startblock, adj);
+ key->rmap.rm_owner = rec->rmap.rm_owner;
+ key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
+ if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
+ XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
+ return;
+ off = be64_to_cpu(key->rmap.rm_offset);
+ off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
+ key->rmap.rm_offset = cpu_to_be64(off);
+}
+
+STATIC void
+xfs_rtrmapbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
+ rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
+ rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
+ rec->rmap.rm_offset = cpu_to_be64(
+ xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
+}
+
+STATIC void
+xfs_rtrmapbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ ptr->l = 0;
+}
+
+/*
+ * Mask the appropriate parts of the ondisk key field for a key comparison.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline uint64_t offset_keymask(uint64_t offset)
+{
+ return offset & ~XFS_RMAP_OFF_UNWRITTEN;
+}
+
+STATIC int64_t
+xfs_rtrmapbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ struct xfs_rmap_irec *rec = &cur->bc_rec.r;
+ const struct xfs_rmap_key *kp = &key->rmap;
+ __u64 x, y;
+ int64_t d;
+
+ d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
+ if (d)
+ return d;
+
+ x = be64_to_cpu(kp->rm_owner);
+ y = rec->rm_owner;
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+
+ x = offset_keymask(be64_to_cpu(kp->rm_offset));
+ y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ return 0;
+}
+
+STATIC int64_t
+xfs_rtrmapbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
+{
+ const struct xfs_rmap_key *kp1 = &k1->rmap;
+ const struct xfs_rmap_key *kp2 = &k2->rmap;
+ int64_t d;
+ __u64 x, y;
+
+ /* Doesn't make sense to mask off the physical space part */
+ ASSERT(!mask || mask->rmap.rm_startblock);
+
+ d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
+ be32_to_cpu(kp2->rm_startblock);
+ if (d)
+ return d;
+
+ if (!mask || mask->rmap.rm_owner) {
+ x = be64_to_cpu(kp1->rm_owner);
+ y = be64_to_cpu(kp2->rm_owner);
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ }
+
+ if (!mask || mask->rmap.rm_offset) {
+ /* Doesn't make sense to allow offset but not owner */
+ ASSERT(!mask || mask->rmap.rm_owner);
+
+ x = offset_keymask(be64_to_cpu(kp1->rm_offset));
+ y = offset_keymask(be64_to_cpu(kp2->rm_offset));
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ }
+
+ return 0;
+}
+
+static xfs_failaddr_t
+xfs_rtrmapbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ int level;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (!xfs_has_rmapbt(mp))
+ return __this_address;
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+ level = be16_to_cpu(block->bb_level);
+ if (level > mp->m_rtrmap_maxlevels)
+ return __this_address;
+
+ return xfs_btree_fsblock_verify(bp, mp->m_rtrmap_mxr[level != 0]);
+}
+
+static void
+xfs_rtrmapbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_btree_fsblock_verify_crc(bp))
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_rtrmapbt_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
+
+ if (bp->b_error)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+static void
+xfs_rtrmapbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_rtrmapbt_verify(bp);
+ if (fa) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+ xfs_btree_fsblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = {
+ .name = "xfs_rtrmapbt",
+ .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
+ .verify_read = xfs_rtrmapbt_read_verify,
+ .verify_write = xfs_rtrmapbt_write_verify,
+ .verify_struct = xfs_rtrmapbt_verify,
+};
+
+STATIC int
+xfs_rtrmapbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ uint32_t x;
+ uint32_t y;
+ uint64_t a;
+ uint64_t b;
+
+ x = be32_to_cpu(k1->rmap.rm_startblock);
+ y = be32_to_cpu(k2->rmap.rm_startblock);
+ if (x < y)
+ return 1;
+ else if (x > y)
+ return 0;
+ a = be64_to_cpu(k1->rmap.rm_owner);
+ b = be64_to_cpu(k2->rmap.rm_owner);
+ if (a < b)
+ return 1;
+ else if (a > b)
+ return 0;
+ a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
+ b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
+ if (a <= b)
+ return 1;
+ return 0;
+}
+
+STATIC int
+xfs_rtrmapbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ uint32_t x;
+ uint32_t y;
+ uint64_t a;
+ uint64_t b;
+
+ x = be32_to_cpu(r1->rmap.rm_startblock);
+ y = be32_to_cpu(r2->rmap.rm_startblock);
+ if (x < y)
+ return 1;
+ else if (x > y)
+ return 0;
+ a = be64_to_cpu(r1->rmap.rm_owner);
+ b = be64_to_cpu(r2->rmap.rm_owner);
+ if (a < b)
+ return 1;
+ else if (a > b)
+ return 0;
+ a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
+ b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
+ if (a <= b)
+ return 1;
+ return 0;
+}
+
+STATIC enum xbtree_key_contig
+xfs_rtrmapbt_keys_contiguous(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask)
+{
+ ASSERT(!mask || mask->rmap.rm_startblock);
+
+ /*
+ * We only support checking contiguity of the physical space component.
+ * If any callers ever need more specificity than that, they'll have to
+ * implement it here.
+ */
+ ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));
+
+ return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
+ be32_to_cpu(key2->rmap.rm_startblock));
+}
+
+static inline void
+xfs_rtrmapbt_move_ptrs(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *broot,
+ short old_size,
+ size_t new_size,
+ unsigned int numrecs)
+{
+ void *dptr;
+ void *sptr;
+
+ sptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, old_size);
+ dptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, new_size);
+ memmove(dptr, sptr, numrecs * sizeof(xfs_rtrmap_ptr_t));
+}
+
+static struct xfs_btree_block *
+xfs_rtrmapbt_broot_realloc(
+ struct xfs_btree_cur *cur,
+ unsigned int new_numrecs)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+ struct xfs_btree_block *broot;
+ unsigned int new_size;
+ unsigned int old_size = ifp->if_broot_bytes;
+ const unsigned int level = cur->bc_nlevels - 1;
+
+ new_size = xfs_rtrmap_broot_space_calc(mp, level, new_numrecs);
+
+ /* Handle the nop case quietly. */
+ if (new_size == old_size)
+ return ifp->if_broot;
+
+ if (new_size > old_size) {
+ unsigned int old_numrecs;
+
+ /*
+ * If there wasn't any memory allocated before, just allocate
+ * it now and get out.
+ */
+ if (old_size == 0)
+ return xfs_broot_realloc(ifp, new_size);
+
+ /*
+ * If there is already an existing if_broot, then we need to
+ * realloc it and possibly move the node block pointers because
+ * those are not butted up against the btree block header.
+ */
+ old_numrecs = xfs_rtrmapbt_maxrecs(mp, old_size, level == 0);
+ broot = xfs_broot_realloc(ifp, new_size);
+ if (level > 0)
+ xfs_rtrmapbt_move_ptrs(mp, broot, old_size, new_size,
+ old_numrecs);
+ goto out_broot;
+ }
+
+ /*
+ * We're reducing numrecs. If we're going all the way to zero, just
+ * free the block.
+ */
+ ASSERT(ifp->if_broot != NULL && old_size > 0);
+ if (new_size == 0)
+ return xfs_broot_realloc(ifp, 0);
+
+ /*
+ * Shrink the btree root by possibly moving the rtrmapbt pointers,
+ * since they are not butted up against the btree block header. Then
+ * reallocate broot.
+ */
+ if (level > 0)
+ xfs_rtrmapbt_move_ptrs(mp, ifp->if_broot, old_size, new_size,
+ new_numrecs);
+ broot = xfs_broot_realloc(ifp, new_size);
+
+out_broot:
+ ASSERT(xfs_rtrmap_droot_space(broot) <=
+ xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork));
+ return broot;
+}
+
+const struct xfs_btree_ops xfs_rtrmapbt_ops = {
+ .name = "rtrmap",
+ .type = XFS_BTREE_TYPE_INODE,
+ .geom_flags = XFS_BTGEO_OVERLAPPING |
+ XFS_BTGEO_IROOT_RECORDS,
+
+ .rec_len = sizeof(struct xfs_rmap_rec),
+ /* Overlapping btree; 2 keys per pointer. */
+ .key_len = 2 * sizeof(struct xfs_rmap_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = XFS_RMAP_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_2),
+ .sick_mask = XFS_SICK_RG_RMAPBT,
+
+ .dup_cursor = xfs_rtrmapbt_dup_cursor,
+ .alloc_block = xfs_btree_alloc_metafile_block,
+ .free_block = xfs_btree_free_metafile_block,
+ .get_minrecs = xfs_rtrmapbt_get_minrecs,
+ .get_maxrecs = xfs_rtrmapbt_get_maxrecs,
+ .get_dmaxrecs = xfs_rtrmapbt_get_dmaxrecs,
+ .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_rtrmapbt_init_ptr_from_cur,
+ .key_diff = xfs_rtrmapbt_key_diff,
+ .buf_ops = &xfs_rtrmapbt_buf_ops,
+ .diff_two_keys = xfs_rtrmapbt_diff_two_keys,
+ .keys_inorder = xfs_rtrmapbt_keys_inorder,
+ .recs_inorder = xfs_rtrmapbt_recs_inorder,
+ .keys_contiguous = xfs_rtrmapbt_keys_contiguous,
+ .broot_realloc = xfs_rtrmapbt_broot_realloc,
+};
+
+/* Allocate a new rt rmap btree cursor. */
+struct xfs_btree_cur *
+xfs_rtrmapbt_init_cursor(
+ struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_inode *ip = rtg_rmap(rtg);
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_btree_cur *cur;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_ops,
+ mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache);
+
+ cur->bc_ino.ip = ip;
+ cur->bc_group = xfs_group_hold(rtg_group(rtg));
+ cur->bc_ino.whichfork = XFS_DATA_FORK;
+ cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1;
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);
+
+ return cur;
+}
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+/*
+ * Validate an in-memory realtime rmap btree block. Callers are allowed to
+ * generate an in-memory btree even if the ondisk feature is not enabled.
+ */
+static xfs_failaddr_t
+xfs_rtrmapbt_mem_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ unsigned int level;
+ unsigned int maxrecs;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+
+ level = be16_to_cpu(block->bb_level);
+ if (xfs_has_rmapbt(mp)) {
+ if (level >= mp->m_rtrmap_maxlevels)
+ return __this_address;
+ } else {
+ if (level >= xfs_rtrmapbt_maxlevels_ondisk())
+ return __this_address;
+ }
+
+ maxrecs = xfs_rtrmapbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0);
+ return xfs_btree_memblock_verify(bp, maxrecs);
+}
+
+static void
+xfs_rtrmapbt_mem_rw_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa = xfs_rtrmapbt_mem_verify(bp);
+
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/* skip crc checks on in-memory btrees to save time */
+static const struct xfs_buf_ops xfs_rtrmapbt_mem_buf_ops = {
+ .name = "xfs_rtrmapbt_mem",
+ .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
+ .verify_read = xfs_rtrmapbt_mem_rw_verify,
+ .verify_write = xfs_rtrmapbt_mem_rw_verify,
+ .verify_struct = xfs_rtrmapbt_mem_verify,
+};
+
+const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = {
+ .type = XFS_BTREE_TYPE_MEM,
+ .geom_flags = XFS_BTGEO_OVERLAPPING,
+
+ .rec_len = sizeof(struct xfs_rmap_rec),
+ /* Overlapping btree; 2 keys per pointer. */
+ .key_len = 2 * sizeof(struct xfs_rmap_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = XFS_RMAP_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_mem_2),
+
+ .dup_cursor = xfbtree_dup_cursor,
+ .set_root = xfbtree_set_root,
+ .alloc_block = xfbtree_alloc_block,
+ .free_block = xfbtree_free_block,
+ .get_minrecs = xfbtree_get_minrecs,
+ .get_maxrecs = xfbtree_get_maxrecs,
+ .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfbtree_init_ptr_from_cur,
+ .key_diff = xfs_rtrmapbt_key_diff,
+ .buf_ops = &xfs_rtrmapbt_mem_buf_ops,
+ .diff_two_keys = xfs_rtrmapbt_diff_two_keys,
+ .keys_inorder = xfs_rtrmapbt_keys_inorder,
+ .recs_inorder = xfs_rtrmapbt_recs_inorder,
+ .keys_contiguous = xfs_rtrmapbt_keys_contiguous,
+};
+
+/* Create a cursor for an in-memory btree. */
+struct xfs_btree_cur *
+xfs_rtrmapbt_mem_cursor(
+ struct xfs_rtgroup *rtg,
+ struct xfs_trans *tp,
+ struct xfbtree *xfbt)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_mem_ops,
+ mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache);
+ cur->bc_mem.xfbtree = xfbt;
+ cur->bc_nlevels = xfbt->nlevels;
+ cur->bc_group = xfs_group_hold(rtg_group(rtg));
+ return cur;
+}
+
+/* Create an in-memory realtime rmap btree. */
+int
+xfs_rtrmapbt_mem_init(
+ struct xfs_mount *mp,
+ struct xfbtree *xfbt,
+ struct xfs_buftarg *btp,
+ xfs_rgnumber_t rgno)
+{
+ xfbt->owner = rgno;
+ return xfbtree_init(mp, xfbt, btp, &xfs_rtrmapbt_mem_ops);
+}
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
+/*
+ * Install a new rt reverse mapping btree root. Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rtrmapbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp)
+{
+ struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake;
+ struct xfs_ifork *ifp;
+ int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE);
+
+ /*
+ * Free any resources hanging off the real fork, then shallow-copy the
+ * staging fork's contents into the real fork to transfer everything
+ * we just built.
+ */
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(ifp);
+ memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
+
+ cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno;
+ xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
+ xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK);
+}
+
+/* Calculate number of records in a rt reverse mapping btree block. */
+static inline unsigned int
+xfs_rtrmapbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen /
+ (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t));
+}
+
+/*
+ * Calculate number of records in an rt reverse mapping btree block.
+ */
+unsigned int
+xfs_rtrmapbt_maxrecs(
+ struct xfs_mount *mp,
+ unsigned int blocklen,
+ bool leaf)
+{
+ blocklen -= XFS_RTRMAP_BLOCK_LEN;
+ return xfs_rtrmapbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for realtime reverse mapping btrees. */
+unsigned int
+xfs_rtrmapbt_maxlevels_ondisk(void)
+{
+ unsigned long long max_dblocks;
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2;
+
+ /*
+ * Compute the asymptotic maxlevels for an rtrmapbt on any rtreflink fs.
+ *
+ * On a reflink filesystem, each block in an rtgroup can have up to
+ * 2^32 (per the refcount record format) owners, which means that
+ * theoretically we could face up to 2^64 rmap records. However, we're
+ * likely to run out of blocks in the data device long before that
+ * happens, which means that we must compute the max height based on
+ * what the btree will look like if it consumes almost all the blocks
+ * in the data device due to maximal sharing factor.
+ */
+ max_dblocks = -1U; /* max ag count */
+ max_dblocks *= XFS_MAX_CRC_AG_BLOCKS;
+ return xfs_btree_space_to_height(minrecs, max_dblocks);
+}
+
+int __init
+xfs_rtrmapbt_init_cur_cache(void)
+{
+ xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur",
+ xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_rtrmapbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_rtrmapbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_rtrmapbt_cur_cache);
+ xfs_rtrmapbt_cur_cache = NULL;
+}
+
+/* Compute the maximum height of an rt reverse mapping btree. */
+void
+xfs_rtrmapbt_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ unsigned int d_maxlevels, r_maxlevels;
+
+ if (!xfs_has_rtrmapbt(mp)) {
+ mp->m_rtrmap_maxlevels = 0;
+ return;
+ }
+
+ /*
+ * The realtime rmapbt lives on the data device, which means that its
+ * maximum height is constrained by the size of the data device and
+ * the height required to store one rmap record for each block in an
+ * rt group.
+ *
+ * On a reflink filesystem, each rt block can have up to 2^32 (per the
+ * refcount record format) owners, which means that theoretically we
+ * could face up to 2^64 rmap records. This makes the computation of
+ * maxlevels based on record count meaningless, so we only consider the
+ * size of the data device.
+ */
+ d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr,
+ mp->m_sb.sb_dblocks);
+ if (xfs_has_rtreflink(mp)) {
+ mp->m_rtrmap_maxlevels = d_maxlevels + 1;
+ return;
+ }
+
+ r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr,
+ mp->m_groups[XG_TYPE_RTG].blocks);
+
+ /* Add one level to handle the inode root level. */
+ mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
+}
+
+/* Calculate the rtrmap btree size for some records. */
+unsigned long long
+xfs_rtrmapbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_rtrmap_mnr, len);
+}
+
+/*
+ * Calculate the maximum rmap btree size.
+ */
+static unsigned long long
+xfs_rtrmapbt_max_size(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtblocks)
+{
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (mp->m_rtrmap_mxr[0] == 0)
+ return 0;
+
+ return xfs_rtrmapbt_calc_size(mp, rtblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+xfs_filblks_t
+xfs_rtrmapbt_calc_reserves(
+ struct xfs_mount *mp)
+{
+ uint32_t blocks = mp->m_groups[XG_TYPE_RTG].blocks;
+
+ if (!xfs_has_rtrmapbt(mp))
+ return 0;
+
+ /* Reserve 1% of the rtgroup or enough for 1 block per record. */
+ return max_t(xfs_filblks_t, blocks / 100,
+ xfs_rtrmapbt_max_size(mp, blocks));
+}
+
+/* Convert on-disk form of btree root to in-memory form. */
+STATIC void
+xfs_rtrmapbt_from_disk(
+ struct xfs_inode *ip,
+ struct xfs_rtrmap_root *dblock,
+ unsigned int dblocklen,
+ struct xfs_btree_block *rblock)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_rmap_key *fkp;
+ __be64 *fpp;
+ struct xfs_rmap_key *tkp;
+ __be64 *tpp;
+ struct xfs_rmap_rec *frp;
+ struct xfs_rmap_rec *trp;
+ unsigned int rblocklen = xfs_rtrmap_broot_space(mp, dblock);
+ unsigned int numrecs;
+ unsigned int maxrecs;
+
+ xfs_btree_init_block(mp, rblock, &xfs_rtrmapbt_ops, 0, 0, ip->i_ino);
+
+ rblock->bb_level = dblock->bb_level;
+ rblock->bb_numrecs = dblock->bb_numrecs;
+ numrecs = be16_to_cpu(dblock->bb_numrecs);
+
+ if (be16_to_cpu(rblock->bb_level) > 0) {
+ maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false);
+ fkp = xfs_rtrmap_droot_key_addr(dblock, 1);
+ tkp = xfs_rtrmap_key_addr(rblock, 1);
+ fpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs);
+ tpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
+ memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+ memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+ } else {
+ frp = xfs_rtrmap_droot_rec_addr(dblock, 1);
+ trp = xfs_rtrmap_rec_addr(rblock, 1);
+ memcpy(trp, frp, sizeof(*frp) * numrecs);
+ }
+}
+
+/* Load a realtime reverse mapping btree root in from disk. */
+int
+xfs_iformat_rtrmap(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ struct xfs_btree_block *broot;
+ unsigned int numrecs;
+ unsigned int level;
+ int dsize;
+
+ /*
+ * growfs must create the rtrmap inodes before adding a realtime volume
+ * to the filesystem, so we cannot use the rtrmapbt predicate here.
+ */
+ if (!xfs_has_rmapbt(ip->i_mount)) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
+ return -EFSCORRUPTED;
+ }
+
+ dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK);
+ numrecs = be16_to_cpu(dfp->bb_numrecs);
+ level = be16_to_cpu(dfp->bb_level);
+
+ if (level > mp->m_rtrmap_maxlevels ||
+ xfs_rtrmap_droot_space_calc(level, numrecs) > dsize) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
+ return -EFSCORRUPTED;
+ }
+
+ broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK),
+ xfs_rtrmap_broot_space_calc(mp, level, numrecs));
+ if (broot)
+ xfs_rtrmapbt_from_disk(ip, dfp, dsize, broot);
+ return 0;
+}
+
+/* Convert in-memory form of btree root to on-disk form. */
+void
+xfs_rtrmapbt_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *rblock,
+ unsigned int rblocklen,
+ struct xfs_rtrmap_root *dblock,
+ unsigned int dblocklen)
+{
+ struct xfs_rmap_key *fkp;
+ __be64 *fpp;
+ struct xfs_rmap_key *tkp;
+ __be64 *tpp;
+ struct xfs_rmap_rec *frp;
+ struct xfs_rmap_rec *trp;
+ unsigned int numrecs;
+ unsigned int maxrecs;
+
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTRMAP_CRC_MAGIC));
+ ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid));
+ ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL));
+ ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+ ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
+
+ dblock->bb_level = rblock->bb_level;
+ dblock->bb_numrecs = rblock->bb_numrecs;
+ numrecs = be16_to_cpu(rblock->bb_numrecs);
+
+ if (be16_to_cpu(rblock->bb_level) > 0) {
+ maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false);
+ fkp = xfs_rtrmap_key_addr(rblock, 1);
+ tkp = xfs_rtrmap_droot_key_addr(dblock, 1);
+ fpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
+ tpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs);
+ memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
+ memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+ } else {
+ frp = xfs_rtrmap_rec_addr(rblock, 1);
+ trp = xfs_rtrmap_droot_rec_addr(dblock, 1);
+ memcpy(trp, frp, sizeof(*frp) * numrecs);
+ }
+}
+
+/* Flush a realtime reverse mapping btree root out to disk. */
+void
+xfs_iflush_rtrmap(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+
+ ASSERT(ifp->if_broot != NULL);
+ ASSERT(ifp->if_broot_bytes > 0);
+ ASSERT(xfs_rtrmap_droot_space(ifp->if_broot) <=
+ xfs_inode_fork_size(ip, XFS_DATA_FORK));
+ xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes,
+ dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
+}
+
+/*
+ * Create a realtime rmap btree inode.
+ */
+int
+xfs_rtrmapbt_create(
+ struct xfs_rtgroup *rtg,
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ bool init)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_block *broot;
+
+ ifp->if_format = XFS_DINODE_FMT_META_BTREE;
+ ASSERT(ifp->if_broot_bytes == 0);
+ ASSERT(ifp->if_bytes == 0);
+
+ /* Initialize the empty incore btree root. */
+ broot = xfs_broot_realloc(ifp, xfs_rtrmap_broot_space_calc(mp, 0, 0));
+ if (broot)
+ xfs_btree_init_block(mp, broot, &xfs_rtrmapbt_ops, 0, 0,
+ ip->i_ino);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT);
+
+ return 0;
+}
+
+/*
+ * Initialize an rmap for a realtime superblock using the potentially updated
+ * rt geometry in the provided @mp.
+ */
+int
+xfs_rtrmapbt_init_rtsb(
+ struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
+ struct xfs_trans *tp)
+{
+ struct xfs_rmap_irec rmap = {
+ .rm_blockcount = mp->m_sb.sb_rextsize,
+ .rm_owner = XFS_RMAP_OWN_FS,
+ };
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(xfs_has_rtsb(mp));
+ ASSERT(rtg_rgno(rtg) == 0);
+
+ cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+ error = xfs_rmap_map_raw(cur, &rmap);
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Return the highest rgbno currently tracked by the rmap for this rtg.
+ */
+xfs_rgblock_t
+xfs_rtrmap_highest_rgbno(
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot;
+ union xfs_btree_key key = {};
+ struct xfs_btree_cur *cur;
+
+ if (block->bb_numrecs == 0)
+ return NULLRGBLOCK;
+ cur = xfs_rtrmapbt_init_cursor(NULL, rtg);
+ xfs_btree_get_keys(cur, block, &key);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock);
+}
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h
new file mode 100644
index 000000000000..e328fd62a149
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h
@@ -0,0 +1,212 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_RTRMAP_BTREE_H__
+#define __XFS_RTRMAP_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xbtree_ifakeroot;
+struct xfs_rtgroup;
+struct xfbtree;
+
+/* rmaps only exist on crc enabled filesystems */
+#define XFS_RTRMAP_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN
+
+struct xfs_btree_cur *xfs_rtrmapbt_init_cursor(struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg);
+struct xfs_btree_cur *xfs_rtrmapbt_stage_cursor(struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xbtree_ifakeroot *ifake);
+void xfs_rtrmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp);
+unsigned int xfs_rtrmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+ bool leaf);
+void xfs_rtrmapbt_compute_maxlevels(struct xfs_mount *mp);
+unsigned int xfs_rtrmapbt_droot_maxrecs(unsigned int blocklen, bool leaf);
+
+/*
+ * Addresses of records, keys, and pointers within an incore rtrmapbt block.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+static inline struct xfs_rmap_rec *
+xfs_rtrmap_rec_addr(
+ struct xfs_btree_block *block,
+ unsigned int index)
+{
+ return (struct xfs_rmap_rec *)
+ ((char *)block + XFS_RTRMAP_BLOCK_LEN +
+ (index - 1) * sizeof(struct xfs_rmap_rec));
+}
+
+static inline struct xfs_rmap_key *
+xfs_rtrmap_key_addr(
+ struct xfs_btree_block *block,
+ unsigned int index)
+{
+ return (struct xfs_rmap_key *)
+ ((char *)block + XFS_RTRMAP_BLOCK_LEN +
+ (index - 1) * 2 * sizeof(struct xfs_rmap_key));
+}
+
+static inline struct xfs_rmap_key *
+xfs_rtrmap_high_key_addr(
+ struct xfs_btree_block *block,
+ unsigned int index)
+{
+ return (struct xfs_rmap_key *)
+ ((char *)block + XFS_RTRMAP_BLOCK_LEN +
+ sizeof(struct xfs_rmap_key) +
+ (index - 1) * 2 * sizeof(struct xfs_rmap_key));
+}
+
+static inline xfs_rtrmap_ptr_t *
+xfs_rtrmap_ptr_addr(
+ struct xfs_btree_block *block,
+ unsigned int index,
+ unsigned int maxrecs)
+{
+ return (xfs_rtrmap_ptr_t *)
+ ((char *)block + XFS_RTRMAP_BLOCK_LEN +
+ maxrecs * 2 * sizeof(struct xfs_rmap_key) +
+ (index - 1) * sizeof(xfs_rtrmap_ptr_t));
+}
+
+unsigned int xfs_rtrmapbt_maxlevels_ondisk(void);
+
+int __init xfs_rtrmapbt_init_cur_cache(void);
+void xfs_rtrmapbt_destroy_cur_cache(void);
+
+xfs_filblks_t xfs_rtrmapbt_calc_reserves(struct xfs_mount *mp);
+
+/* Addresses of key, pointers, and records within an ondisk rtrmapbt block. */
+
+static inline struct xfs_rmap_rec *
+xfs_rtrmap_droot_rec_addr(
+ struct xfs_rtrmap_root *block,
+ unsigned int index)
+{
+ return (struct xfs_rmap_rec *)
+ ((char *)(block + 1) +
+ (index - 1) * sizeof(struct xfs_rmap_rec));
+}
+
+static inline struct xfs_rmap_key *
+xfs_rtrmap_droot_key_addr(
+ struct xfs_rtrmap_root *block,
+ unsigned int index)
+{
+ return (struct xfs_rmap_key *)
+ ((char *)(block + 1) +
+ (index - 1) * 2 * sizeof(struct xfs_rmap_key));
+}
+
+static inline xfs_rtrmap_ptr_t *
+xfs_rtrmap_droot_ptr_addr(
+ struct xfs_rtrmap_root *block,
+ unsigned int index,
+ unsigned int maxrecs)
+{
+ return (xfs_rtrmap_ptr_t *)
+ ((char *)(block + 1) +
+ maxrecs * 2 * sizeof(struct xfs_rmap_key) +
+ (index - 1) * sizeof(xfs_rtrmap_ptr_t));
+}
+
+/*
+ * Address of pointers within the incore btree root.
+ *
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+static inline xfs_rtrmap_ptr_t *
+xfs_rtrmap_broot_ptr_addr(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *bb,
+ unsigned int index,
+ unsigned int block_size)
+{
+ return xfs_rtrmap_ptr_addr(bb, index,
+ xfs_rtrmapbt_maxrecs(mp, block_size, false));
+}
+
+/*
+ * Compute the space required for the incore btree root containing the given
+ * number of records.
+ */
+static inline size_t
+xfs_rtrmap_broot_space_calc(
+ struct xfs_mount *mp,
+ unsigned int level,
+ unsigned int nrecs)
+{
+ size_t sz = XFS_RTRMAP_BLOCK_LEN;
+
+ if (level > 0)
+ return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) +
+ sizeof(xfs_rtrmap_ptr_t));
+ return sz + nrecs * sizeof(struct xfs_rmap_rec);
+}
+
+/*
+ * Compute the space required for the incore btree root given the ondisk
+ * btree root block.
+ */
+static inline size_t
+xfs_rtrmap_broot_space(struct xfs_mount *mp, struct xfs_rtrmap_root *bb)
+{
+ return xfs_rtrmap_broot_space_calc(mp, be16_to_cpu(bb->bb_level),
+ be16_to_cpu(bb->bb_numrecs));
+}
+
+/* Compute the space required for the ondisk root block. */
+static inline size_t
+xfs_rtrmap_droot_space_calc(
+ unsigned int level,
+ unsigned int nrecs)
+{
+ size_t sz = sizeof(struct xfs_rtrmap_root);
+
+ if (level > 0)
+ return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) +
+ sizeof(xfs_rtrmap_ptr_t));
+ return sz + nrecs * sizeof(struct xfs_rmap_rec);
+}
+
+/*
+ * Compute the space required for the ondisk root block given an incore root
+ * block.
+ */
+static inline size_t
+xfs_rtrmap_droot_space(struct xfs_btree_block *bb)
+{
+ return xfs_rtrmap_droot_space_calc(be16_to_cpu(bb->bb_level),
+ be16_to_cpu(bb->bb_numrecs));
+}
+
+int xfs_iformat_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip);
+void xfs_rtrmapbt_to_disk(struct xfs_mount *mp, struct xfs_btree_block *rblock,
+ unsigned int rblocklen, struct xfs_rtrmap_root *dblock,
+ unsigned int dblocklen);
+void xfs_iflush_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip);
+
+int xfs_rtrmapbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xfs_trans *tp, bool init);
+int xfs_rtrmapbt_init_rtsb(struct xfs_mount *mp, struct xfs_rtgroup *rtg,
+ struct xfs_trans *tp);
+
+unsigned long long xfs_rtrmapbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+
+struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg,
+ struct xfs_trans *tp, struct xfbtree *xfbtree);
+int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
+ struct xfs_buftarg *btp, xfs_rgnumber_t rgno);
+
+xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg);
+
+#endif /* __XFS_RTRMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index d95409f3cba6..711e180f9ebb 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -27,6 +27,9 @@
#include "xfs_ag.h"
#include "xfs_rtbitmap.h"
#include "xfs_exchrange.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -180,6 +183,10 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_EXCHANGE_RANGE;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT)
features |= XFS_FEAT_PARENT;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
+ features |= XFS_FEAT_METADIR;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)
+ features |= XFS_FEAT_ZONED;
return features;
}
@@ -232,11 +239,40 @@ xfs_validate_sb_read(
return 0;
}
+/* Return the number of extents covered by a single rt bitmap file */
+static xfs_rtbxlen_t
+xfs_extents_per_rbm(
+ struct xfs_sb *sbp)
+{
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR))
+ return sbp->sb_rgextents;
+ return sbp->sb_rextents;
+}
+
+/*
+ * Return the payload size of a single rt bitmap block (without the metadata
+ * header if any).
+ */
+static inline unsigned int
+xfs_rtbmblock_size(
+ struct xfs_sb *sbp)
+{
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR))
+ return sbp->sb_blocksize - sizeof(struct xfs_rtbuf_blkinfo);
+ return sbp->sb_blocksize;
+}
+
static uint64_t
-xfs_sb_calc_rbmblocks(
+xfs_expected_rbmblocks(
struct xfs_sb *sbp)
{
- return howmany_64(sbp->sb_rextents, NBBY * sbp->sb_blocksize);
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED))
+ return 0;
+ return howmany_64(xfs_extents_per_rbm(sbp),
+ NBBY * xfs_rtbmblock_size(sbp));
}
/* Validate the realtime geometry */
@@ -244,9 +280,15 @@ bool
xfs_validate_rt_geometry(
struct xfs_sb *sbp)
{
- if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
- sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
- return false;
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) {
+ if (sbp->sb_rextsize != 1)
+ return false;
+ } else {
+ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+ sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
+ return false;
+ }
if (sbp->sb_rblocks == 0) {
if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
@@ -258,7 +300,7 @@ xfs_validate_rt_geometry(
if (sbp->sb_rextents == 0 ||
sbp->sb_rextents != div_u64(sbp->sb_rblocks, sbp->sb_rextsize) ||
sbp->sb_rextslog != xfs_compute_rextslog(sbp->sb_rextents) ||
- sbp->sb_rbmblocks != xfs_sb_calc_rbmblocks(sbp))
+ sbp->sb_rbmblocks != xfs_expected_rbmblocks(sbp))
return false;
return true;
@@ -297,13 +339,6 @@ xfs_validate_sb_write(
* the kernel cannot support since we checked for unsupported bits in
* the read verifier, which means that memory is corrupt.
*/
- if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) {
- xfs_warn(mp,
-"Corruption detected in superblock compatible features (0x%x)!",
- (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN));
- return -EFSCORRUPTED;
- }
-
if (!xfs_is_readonly(mp) &&
xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
xfs_alert(mp,
@@ -339,6 +374,106 @@ xfs_validate_sb_write(
return 0;
}
+int
+xfs_compute_rgblklog(
+ xfs_rtxlen_t rgextents,
+ xfs_rgblock_t rextsize)
+{
+ uint64_t rgblocks = (uint64_t)rgextents * rextsize;
+
+ return xfs_highbit64(rgblocks - 1) + 1;
+}
+
+static int
+xfs_validate_sb_rtgroups(
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp)
+{
+ uint64_t groups;
+ int rgblklog;
+
+ if (sbp->sb_rextsize == 0) {
+ xfs_warn(mp,
+"Realtime extent size must not be zero.");
+ return -EINVAL;
+ }
+
+ if (sbp->sb_rgextents > XFS_MAX_RGBLOCKS / sbp->sb_rextsize) {
+ xfs_warn(mp,
+"Realtime group size (%u) must be less than %u rt extents.",
+ sbp->sb_rgextents,
+ XFS_MAX_RGBLOCKS / sbp->sb_rextsize);
+ return -EINVAL;
+ }
+
+ if (sbp->sb_rgextents < XFS_MIN_RGEXTENTS) {
+ xfs_warn(mp,
+"Realtime group size (%u) must be at least %u rt extents.",
+ sbp->sb_rgextents, XFS_MIN_RGEXTENTS);
+ return -EINVAL;
+ }
+
+ if (sbp->sb_rgcount > XFS_MAX_RGNUMBER) {
+ xfs_warn(mp,
+"Realtime groups (%u) must be less than %u.",
+ sbp->sb_rgcount, XFS_MAX_RGNUMBER);
+ return -EINVAL;
+ }
+
+ groups = howmany_64(sbp->sb_rextents, sbp->sb_rgextents);
+ if (groups != sbp->sb_rgcount) {
+ xfs_warn(mp,
+"Realtime groups (%u) do not cover the entire rt section; need (%llu) groups.",
+ sbp->sb_rgcount, groups);
+ return -EINVAL;
+ }
+
+ /* Exchange-range is required for fsr to work on realtime files */
+ if (!(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE)) {
+ xfs_warn(mp,
+"Realtime groups feature requires exchange-range support.");
+ return -EINVAL;
+ }
+
+ rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, sbp->sb_rextsize);
+ if (sbp->sb_rgblklog != rgblklog) {
+ xfs_warn(mp,
+"Realtime group log (%d) does not match expected value (%d).",
+ sbp->sb_rgblklog, rgblklog);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+xfs_validate_sb_zoned(
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp)
+{
+ if (sbp->sb_frextents != 0) {
+ xfs_warn(mp,
+"sb_frextents must be zero for zoned file systems.");
+ return -EINVAL;
+ }
+
+ if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) {
+ xfs_warn(mp,
+"sb_rtstart (%lld) overlaps sb_dblocks (%lld).",
+ sbp->sb_rtstart, sbp->sb_dblocks);
+ return -EINVAL;
+ }
+
+ if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) {
+ xfs_warn(mp,
+"sb_rtreserved (%lld) larger than sb_rblocks (%lld).",
+ sbp->sb_rtreserved, sbp->sb_rblocks);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/* Check the validity of the SB. */
STATIC int
xfs_validate_sb_common(
@@ -350,6 +485,7 @@ xfs_validate_sb_common(
uint32_t agcount = 0;
uint32_t rem;
bool has_dalign;
+ int error;
if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
xfs_warn(mp,
@@ -398,6 +534,38 @@ xfs_validate_sb_common(
sbp->sb_inoalignmt, align);
return -EINVAL;
}
+
+ if (sbp->sb_spino_align &&
+ (sbp->sb_spino_align > sbp->sb_inoalignmt ||
+ (sbp->sb_inoalignmt % sbp->sb_spino_align) != 0)) {
+ xfs_warn(mp,
+"Sparse inode alignment (%u) is invalid, must be integer factor of (%u).",
+ sbp->sb_spino_align,
+ sbp->sb_inoalignmt);
+ return -EINVAL;
+ }
+ } else if (sbp->sb_spino_align) {
+ xfs_warn(mp,
+ "Sparse inode alignment (%u) should be zero.",
+ sbp->sb_spino_align);
+ return -EINVAL;
+ }
+
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) {
+ if (memchr_inv(sbp->sb_pad, 0, sizeof(sbp->sb_pad))) {
+ xfs_warn(mp,
+"Metadir superblock padding fields must be zero.");
+ return -EINVAL;
+ }
+
+ error = xfs_validate_sb_rtgroups(mp, sbp);
+ if (error)
+ return error;
+ }
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
+ error = xfs_validate_sb_zoned(mp, sbp);
+ if (error)
+ return error;
}
} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
@@ -566,6 +734,14 @@ xfs_validate_sb_common(
void
xfs_sb_quota_from_disk(struct xfs_sb *sbp)
{
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) {
+ sbp->sb_uquotino = NULLFSINO;
+ sbp->sb_gquotino = NULLFSINO;
+ sbp->sb_pquotino = NULLFSINO;
+ return;
+ }
+
/*
* older mkfs doesn't initialize quota inodes to NULLFSINO. This
* leads to in-core values having two different values for a quota
@@ -689,6 +865,28 @@ __xfs_sb_from_disk(
/* Convert on-disk flags to in-memory flags? */
if (convert_xquota)
xfs_sb_quota_from_disk(to);
+
+ if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) {
+ to->sb_metadirino = be64_to_cpu(from->sb_metadirino);
+ to->sb_rgblklog = from->sb_rgblklog;
+ memcpy(to->sb_pad, from->sb_pad, sizeof(to->sb_pad));
+ to->sb_rgcount = be32_to_cpu(from->sb_rgcount);
+ to->sb_rgextents = be32_to_cpu(from->sb_rgextents);
+ to->sb_rbmino = NULLFSINO;
+ to->sb_rsumino = NULLFSINO;
+ } else {
+ to->sb_metadirino = NULLFSINO;
+ to->sb_rgcount = 1;
+ to->sb_rgextents = 0;
+ }
+
+ if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
+ to->sb_rtstart = be64_to_cpu(from->sb_rtstart);
+ to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved);
+ } else {
+ to->sb_rtstart = 0;
+ to->sb_rtreserved = 0;
+ }
}
void
@@ -706,6 +904,15 @@ xfs_sb_quota_to_disk(
{
uint16_t qflags = from->sb_qflags;
+ if (xfs_sb_is_v5(from) &&
+ (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) {
+ to->sb_qflags = cpu_to_be16(from->sb_qflags);
+ to->sb_uquotino = cpu_to_be64(0);
+ to->sb_gquotino = cpu_to_be64(0);
+ to->sb_pquotino = cpu_to_be64(0);
+ return;
+ }
+
to->sb_uquotino = cpu_to_be64(from->sb_uquotino);
/*
@@ -836,6 +1043,21 @@ xfs_sb_to_disk(
to->sb_lsn = cpu_to_be64(from->sb_lsn);
if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
+
+ if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) {
+ to->sb_metadirino = cpu_to_be64(from->sb_metadirino);
+ to->sb_rgblklog = from->sb_rgblklog;
+ memset(to->sb_pad, 0, sizeof(to->sb_pad));
+ to->sb_rgcount = cpu_to_be32(from->sb_rgcount);
+ to->sb_rgextents = cpu_to_be32(from->sb_rgextents);
+ to->sb_rbmino = cpu_to_be64(0);
+ to->sb_rsumino = cpu_to_be64(0);
+ }
+
+ if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
+ to->sb_rtstart = cpu_to_be64(from->sb_rtstart);
+ to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved);
+ }
}
/*
@@ -965,13 +1187,47 @@ const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
.verify_write = xfs_sb_write_verify,
};
+/* Compute cached rt geometry from the incore sb. */
void
-xfs_mount_sb_set_rextsize(
+xfs_sb_mount_rextsize(
struct xfs_mount *mp,
struct xfs_sb *sbp)
{
+ struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG];
+
mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
+
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) {
+ rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize;
+ rgs->blklog = mp->m_sb.sb_rgblklog;
+ rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog);
+ rgs->start_fsb = mp->m_sb.sb_rtstart;
+ if (xfs_sb_has_incompat_feature(sbp,
+ XFS_SB_FEAT_INCOMPAT_ZONE_GAPS))
+ rgs->has_daddr_gaps = true;
+ } else {
+ rgs->blocks = 0;
+ rgs->blklog = 0;
+ rgs->blkmask = (uint64_t)-1;
+ }
+}
+
+/* Update incore sb rt extent size, then recompute the cached rt geometry. */
+void
+xfs_mount_sb_set_rextsize(
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp,
+ xfs_agblock_t rextsize)
+{
+ sbp->sb_rextsize = rextsize;
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR))
+ sbp->sb_rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents,
+ rextsize);
+
+ xfs_sb_mount_rextsize(mp, sbp);
}
/*
@@ -988,6 +1244,8 @@ xfs_sb_mount_common(
struct xfs_mount *mp,
struct xfs_sb *sbp)
{
+ struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG];
+
mp->m_agfrotor = 0;
atomic_set(&mp->m_agirotor, 0);
mp->m_maxagi = mp->m_sb.sb_agcount;
@@ -996,9 +1254,14 @@ xfs_sb_mount_common(
mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
mp->m_blockmask = sbp->sb_blocksize - 1;
- mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
- mp->m_blockwmask = mp->m_blockwsize - 1;
- xfs_mount_sb_set_rextsize(mp, sbp);
+ mp->m_blockwsize = xfs_rtbmblock_size(sbp) >> XFS_WORDLOG;
+ mp->m_rtx_per_rbmblock = mp->m_blockwsize << XFS_NBWORDLOG;
+
+ ags->blocks = mp->m_sb.sb_agblocks;
+ ags->blklog = mp->m_sb.sb_agblklog;
+ ags->blkmask = xfs_mask32lo(mp->m_sb.sb_agblklog);
+
+ xfs_sb_mount_rextsize(mp, sbp);
mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, true);
mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, false);
@@ -1015,11 +1278,23 @@ xfs_sb_mount_common(
mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
+ mp->m_rtrmap_mxr[0] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, true);
+ mp->m_rtrmap_mxr[1] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, false);
+ mp->m_rtrmap_mnr[0] = mp->m_rtrmap_mxr[0] / 2;
+ mp->m_rtrmap_mnr[1] = mp->m_rtrmap_mxr[1] / 2;
+
mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true);
mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false);
mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
+ mp->m_rtrefc_mxr[0] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize,
+ true);
+ mp->m_rtrefc_mxr[1] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize,
+ false);
+ mp->m_rtrefc_mnr[0] = mp->m_rtrefc_mxr[0] / 2;
+ mp->m_rtrefc_mnr[1] = mp->m_rtrefc_mxr[1] / 2;
+
mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp);
@@ -1045,19 +1320,24 @@ xfs_log_sb(
* reservations that have been taken out percpu counters. If we have an
* unclean shutdown, this will be corrected by log recovery rebuilding
* the counters from the AGF block counts.
- *
- * Do not update sb_frextents here because it is not part of the lazy
- * sb counters, despite having a percpu counter. It is always kept
- * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
- * and hence we don't need have to update it here.
*/
if (xfs_has_lazysbcount(mp)) {
mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount);
mp->m_sb.sb_ifree = min_t(uint64_t,
percpu_counter_sum_positive(&mp->m_ifree),
mp->m_sb.sb_icount);
- mp->m_sb.sb_fdblocks =
- percpu_counter_sum_positive(&mp->m_fdblocks);
+ mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
+ }
+
+ /*
+ * sb_frextents was added to the lazy sb counters when the rt groups
+ * feature was introduced. This counter can go negative due to the way
+ * we handle nearly-lockless reservations, so we must use the _positive
+ * variant here to avoid writing out nonsense frextents.
+ */
+ if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) {
+ mp->m_sb.sb_frextents =
+ xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
}
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
@@ -1109,18 +1389,17 @@ int
xfs_update_secondary_sbs(
struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno = 1;
+ struct xfs_perag *pag = NULL;
int saved_error = 0;
int error = 0;
LIST_HEAD (buffer_list);
/* update secondary superblocks. */
- for_each_perag_from(mp, agno, pag) {
+ while ((pag = xfs_perag_next_from(mp, pag, 1))) {
struct xfs_buf *bp;
error = xfs_buf_get(mp->m_ddev_targp,
- XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR),
+ XFS_AG_DADDR(mp, pag_agno(pag), XFS_SB_DADDR),
XFS_FSS_TO_BB(mp, 1), &bp);
/*
* If we get an error reading or writing alternate superblocks,
@@ -1132,7 +1411,7 @@ xfs_update_secondary_sbs(
if (error) {
xfs_warn(mp,
"error allocating secondary superblock for ag %d",
- pag->pag_agno);
+ pag_agno(pag));
if (!saved_error)
saved_error = error;
continue;
@@ -1146,26 +1425,22 @@ xfs_update_secondary_sbs(
xfs_buf_relse(bp);
/* don't hold too many buffers at once */
- if (agno % 16)
+ if (pag_agno(pag) % 16)
continue;
error = xfs_buf_delwri_submit(&buffer_list);
if (error) {
xfs_warn(mp,
"write error %d updating a secondary superblock near ag %d",
- error, pag->pag_agno);
+ error, pag_agno(pag));
if (!saved_error)
saved_error = error;
continue;
}
}
error = xfs_buf_delwri_submit(&buffer_list);
- if (error) {
- xfs_warn(mp,
- "write error %d updating a secondary superblock near ag %d",
- error, agno);
- }
-
+ if (error)
+ xfs_warn(mp, "error %d writing secondary superblocks", error);
return saved_error ? saved_error : error;
}
@@ -1175,10 +1450,12 @@ xfs_update_secondary_sbs(
*/
int
xfs_sync_sb_buf(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool update_rtsb)
{
struct xfs_trans *tp;
struct xfs_buf *bp;
+ struct xfs_buf *rtsb_bp = NULL;
int error;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp);
@@ -1188,6 +1465,11 @@ xfs_sync_sb_buf(
bp = xfs_trans_getsb(tp);
xfs_log_sb(tp);
xfs_trans_bhold(tp, bp);
+ if (update_rtsb) {
+ rtsb_bp = xfs_log_rtsb(tp, bp);
+ if (rtsb_bp)
+ xfs_trans_bhold(tp, rtsb_bp);
+ }
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
if (error)
@@ -1196,7 +1478,11 @@ xfs_sync_sb_buf(
* write out the sb buffer to get the changes to disk
*/
error = xfs_bwrite(bp);
+ if (!error && rtsb_bp)
+ error = xfs_bwrite(rtsb_bp);
out:
+ if (rtsb_bp)
+ xfs_buf_relse(rtsb_bp);
xfs_buf_relse(bp);
return error;
}
@@ -1283,6 +1569,10 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
if (xfs_has_exchange_range(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE;
+ if (xfs_has_metadir(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR;
+ if (xfs_has_zoned(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
@@ -1298,6 +1588,15 @@ xfs_fs_geometry(
return;
geo->version = XFS_FSOP_GEOM_VERSION_V5;
+
+ if (xfs_has_rtgroups(mp)) {
+ geo->rgcount = sbp->sb_rgcount;
+ geo->rgextents = sbp->sb_rgextents;
+ }
+ if (xfs_has_zoned(mp)) {
+ geo->rtstart = sbp->sb_rtstart;
+ geo->rtreserved = sbp->sb_rtreserved;
+ }
}
/* Read a secondary superblock. */
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 885c83755991..34d0dd374e9b 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -15,10 +15,11 @@ struct xfs_perag;
extern void xfs_log_sb(struct xfs_trans *tp);
extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
-extern int xfs_sync_sb_buf(struct xfs_mount *mp);
+extern int xfs_sync_sb_buf(struct xfs_mount *mp, bool update_rtsb);
extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
+void xfs_sb_mount_rextsize(struct xfs_mount *mp, struct xfs_sb *sbp);
void xfs_mount_sb_set_rextsize(struct xfs_mount *mp,
- struct xfs_sb *sbp);
+ struct xfs_sb *sbp, xfs_agblock_t rextsize);
extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
@@ -43,5 +44,6 @@ bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
bool xfs_validate_rt_geometry(struct xfs_sb *sbp);
uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
+int xfs_compute_rgblklog(xfs_rtxlen_t rgextents, xfs_rgblock_t rextsize);
#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 33b84a3a83ff..b1e0d9bc1f7d 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -38,7 +38,12 @@ extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops;
+extern const struct xfs_buf_ops xfs_rtsummary_buf_ops;
extern const struct xfs_buf_ops xfs_rtbuf_ops;
+extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
+extern const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops;
extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
@@ -52,6 +57,9 @@ extern const struct xfs_btree_ops xfs_bmbt_ops;
extern const struct xfs_btree_ops xfs_refcountbt_ops;
extern const struct xfs_btree_ops xfs_rmapbt_ops;
extern const struct xfs_btree_ops xfs_rmapbt_mem_ops;
+extern const struct xfs_btree_ops xfs_rtrmapbt_ops;
+extern const struct xfs_btree_ops xfs_rtrmapbt_mem_ops;
+extern const struct xfs_btree_ops xfs_rtrefcountbt_ops;
static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops)
{
@@ -93,10 +101,26 @@ static inline bool xfs_btree_is_mem_rmap(const struct xfs_btree_ops *ops)
{
return ops == &xfs_rmapbt_mem_ops;
}
+
+static inline bool xfs_btree_is_mem_rtrmap(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_rtrmapbt_mem_ops;
+}
#else
# define xfs_btree_is_mem_rmap(...) (false)
+# define xfs_btree_is_mem_rtrmap(...) (false)
#endif
+static inline bool xfs_btree_is_rtrmap(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_rtrmapbt_ops;
+}
+
+static inline bool xfs_btree_is_rtrefcount(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_rtrefcountbt_ops;
+}
+
/* log size calculation functions */
int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
int xfs_log_calc_minimum_size(struct xfs_mount *);
@@ -157,6 +181,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
#define XFS_TRANS_SB_RBLOCKS 0x00000800
#define XFS_TRANS_SB_REXTENTS 0x00001000
#define XFS_TRANS_SB_REXTSLOG 0x00002000
+#define XFS_TRANS_SB_RGCOUNT 0x00004000
/*
* Here we centralize the specification of XFS meta-data buffer reference count
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index f228127a88ff..fb47a76ead18 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -92,8 +92,10 @@ xfs_symlink_verify(
struct xfs_mount *mp = bp->b_mount;
struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+ /* no verification of non-crc buffers */
if (!xfs_has_crc(mp))
- return __this_address;
+ return NULL;
+
if (!xfs_verify_magic(bp, dsl->sl_magic))
return __this_address;
if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 3c40f37e82c7..c962ad64b0c1 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -62,12 +62,12 @@ xfs_trans_ichgtime(
ASSERT(tp);
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- tv = current_time(inode);
+ /* If the mtime changes, then ctime must also change */
+ ASSERT(flags & XFS_ICHGTIME_CHG);
+ tv = inode_set_ctime_current(inode);
if (flags & XFS_ICHGTIME_MOD)
inode_set_mtime_to_ts(inode, tv);
- if (flags & XFS_ICHGTIME_CHG)
- inode_set_ctime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_ACCESS)
inode_set_atime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CREATE)
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 1a7f95bcf069..86a111d0f2fc 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -22,6 +22,12 @@
#include "xfs_rtbitmap.h"
#include "xfs_attr_item.h"
#include "xfs_log.h"
+#include "xfs_defer.h"
+#include "xfs_bmap_item.h"
+#include "xfs_extfree_item.h"
+#include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_trace.h"
#define _ALLOC true
#define _FREE false
@@ -92,6 +98,14 @@ xfs_refcountbt_block_count(
return num_ops * (2 * mp->m_refc_maxlevels - 1);
}
+static unsigned int
+xfs_rtrefcountbt_block_count(
+ struct xfs_mount *mp,
+ unsigned int num_ops)
+{
+ return num_ops * (2 * mp->m_rtrefc_maxlevels - 1);
+}
+
/*
* Logging inodes is really tricksy. They are logged in memory format,
* which means that what we write into the log doesn't directly translate into
@@ -213,7 +227,9 @@ xfs_calc_inode_chunk_res(
* Per-extent log reservation for the btree changes involved in freeing or
* allocating a realtime extent. We have to be able to log as many rtbitmap
* blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime
- * extents, as well as the realtime summary block.
+ * extents, as well as the realtime summary block (t1). Realtime rmap btree
+ * operations happen in a second transaction, so factor in a couple of rtrmapbt
+ * splits (t2).
*/
static unsigned int
xfs_rtalloc_block_count(
@@ -222,10 +238,16 @@ xfs_rtalloc_block_count(
{
unsigned int rtbmp_blocks;
xfs_rtxlen_t rtxlen;
+ unsigned int t1, t2 = 0;
rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN);
- rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen);
- return (rtbmp_blocks + 1) * num_ops;
+ rtbmp_blocks = xfs_rtbitmap_blockcount_len(mp, rtxlen);
+ t1 = (rtbmp_blocks + 1) * num_ops;
+
+ if (xfs_has_rmapbt(mp))
+ t2 = num_ops * (2 * mp->m_rtrmap_maxlevels - 1);
+
+ return max(t1, t2);
}
/*
@@ -248,26 +270,64 @@ xfs_rtalloc_block_count(
*/
/*
+ * Finishing a data device refcount updates (t1):
+ * the agfs of the ags containing the blocks: nr_ops * sector size
+ * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_cui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Realtime refcount updates (t2);
+ * the rt refcount inode
+ * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_rt_cui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ if (!xfs_has_rtreflink(mp))
+ return 0;
+
+ return xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
* Compute the log reservation required to handle the refcount update
* transaction. Refcount updates are always done via deferred log items.
*
- * This is calculated as:
+ * This is calculated as the max of:
* Data device refcount updates (t1):
* the agfs of the ags containing the blocks: nr_ops * sector size
* the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ * Realtime refcount updates (t2);
+ * the rt refcount inode
+ * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
*/
static unsigned int
xfs_calc_refcountbt_reservation(
struct xfs_mount *mp,
unsigned int nr_ops)
{
- unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+ unsigned int t1, t2;
- if (!xfs_has_reflink(mp))
- return 0;
+ t1 = xfs_calc_finish_cui_reservation(mp, nr_ops);
+ t2 = xfs_calc_finish_rt_cui_reservation(mp, nr_ops);
- return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+ return max(t1, t2);
}
/*
@@ -353,6 +413,96 @@ xfs_calc_write_reservation_minlogsize(
}
/*
+ * Finishing an EFI can free the blocks and bmap blocks (t2):
+ * the agf for each of the ags: nr * sector size
+ * the agfl for each of the ags: nr * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming nr extents:
+ * nr exts * 2 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_efi_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Or, if it's a realtime file (t3):
+ * the agf for each of the ags: 2 * sector size
+ * the agfl for each of the ags: 2 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * the realtime bitmap:
+ * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
+ * the realtime summary: 2 exts * 1 block
+ * worst case split in allocation btrees per extent assuming 2 extents:
+ * 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_rt_efi_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ if (!xfs_has_realtime(mp))
+ return 0;
+
+ return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, nr),
+ mp->m_sb.sb_blocksize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Finishing an RUI is the same as an EFI. We can split the rmap btree twice
+ * on each end of the record, and that can cause the AGFL to be refilled or
+ * emptied out.
+ */
+inline unsigned int
+xfs_calc_finish_rui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ if (!xfs_has_rmapbt(mp))
+ return 0;
+ return xfs_calc_finish_efi_reservation(mp, nr);
+}
+
+/*
+ * Finishing an RUI is the same as an EFI. We can split the rmap btree twice
+ * on each end of the record, and that can cause the AGFL to be refilled or
+ * emptied out.
+ */
+inline unsigned int
+xfs_calc_finish_rt_rui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ if (!xfs_has_rtrmapbt(mp))
+ return 0;
+ return xfs_calc_finish_rt_efi_reservation(mp, nr);
+}
+
+/*
+ * In finishing a BUI, we can modify:
+ * the inode being truncated: inode size
+ * dquots
+ * the inode's bmap btree: (max depth + 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_bui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+ mp->m_sb.sb_blocksize);
+}
+
+/*
* In truncating a file we free up to two extents at once. We can modify (t1):
* the inode being truncated: inode size
* the inode's bmap btree: (max depth + 1) * block size
@@ -384,16 +534,8 @@ xfs_calc_itruncate_reservation(
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
- t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
-
- if (xfs_has_realtime(mp)) {
- t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
- } else {
- t3 = 0;
- }
+ t2 = xfs_calc_finish_efi_reservation(mp, 4);
+ t3 = xfs_calc_finish_rt_efi_reservation(mp, 2);
/*
* In the early days of reflink, we included enough reservation to log
@@ -474,9 +616,7 @@ xfs_calc_rename_reservation(
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1));
- t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
- XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_finish_efi_reservation(mp, 3);
if (xfs_has_parent(mp)) {
unsigned int rename_overhead, exchange_overhead;
@@ -584,9 +724,7 @@ xfs_calc_link_reservation(
overhead += xfs_calc_iunlink_remove_reservation(mp);
t1 = xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
- t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
- XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_finish_efi_reservation(mp, 1);
if (xfs_has_parent(mp)) {
t3 = resp->tr_attrsetm.tr_logres;
@@ -649,9 +787,7 @@ xfs_calc_remove_reservation(
t1 = xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
- t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
- XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_finish_efi_reservation(mp, 2);
if (xfs_has_parent(mp)) {
t3 = resp->tr_attrrm.tr_logres;
@@ -1154,6 +1290,15 @@ xfs_calc_namespace_reservations(
resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
}
+STATIC void
+xfs_calc_default_atomic_ioend_reservation(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ /* Pick a default that will scale reasonably for the log size. */
+ resp->tr_atomic_ioend = resp->tr_itruncate;
+}
+
void
xfs_trans_resv_calc(
struct xfs_mount *mp,
@@ -1248,4 +1393,167 @@ xfs_trans_resv_calc(
resp->tr_itruncate.tr_logcount += logcount_adj;
resp->tr_write.tr_logcount += logcount_adj;
resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
+
+ /*
+ * Now that we've finished computing the static reservations, we can
+ * compute the dynamic reservation for atomic writes.
+ */
+ xfs_calc_default_atomic_ioend_reservation(mp, resp);
+}
+
+/*
+ * Return the per-extent and fixed transaction reservation sizes needed to
+ * complete an atomic write.
+ */
+STATIC unsigned int
+xfs_calc_atomic_write_ioend_geometry(
+ struct xfs_mount *mp,
+ unsigned int *step_size)
+{
+ const unsigned int efi = xfs_efi_log_space(1);
+ const unsigned int efd = xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1);
+ const unsigned int rud = xfs_rud_log_space();
+ const unsigned int cui = xfs_cui_log_space(1);
+ const unsigned int cud = xfs_cud_log_space();
+ const unsigned int bui = xfs_bui_log_space(1);
+ const unsigned int bud = xfs_bud_log_space();
+
+ /*
+ * Maximum overhead to complete an atomic write ioend in software:
+ * remove data fork extent + remove cow fork extent + map extent into
+ * data fork.
+ *
+ * tx0: Creates a BUI and a CUI and that's all it needs.
+ *
+ * tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and
+ * enough space to relog the CUI (== CUI + CUD).
+ *
+ * tx2: Roll again to finish the RUI. Need space for the RUD and space
+ * to relog the CUI.
+ *
+ * tx3: Roll again, need space for the CUD and possibly a new EFI.
+ *
+ * tx4: Roll again, need space for an EFD.
+ *
+ * If the extent referenced by the pair of BUI/CUI items is not the one
+ * being currently processed, then we need to reserve space to relog
+ * both items.
+ */
+ const unsigned int tx0 = bui + cui;
+ const unsigned int tx1 = bud + rui + cui + cud;
+ const unsigned int tx2 = rud + cui + cud;
+ const unsigned int tx3 = cud + efi;
+ const unsigned int tx4 = efd;
+ const unsigned int relog = bui + bud + cui + cud;
+
+ const unsigned int per_intent = max(max3(tx0, tx1, tx2),
+ max3(tx3, tx4, relog));
+
+ /* Overhead to finish one step of each intent item type */
+ const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+ const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1);
+ const unsigned int f4 = xfs_calc_finish_bui_reservation(mp, 1);
+
+ /* We only finish one item per transaction in a chain */
+ *step_size = max(f4, max3(f1, f2, f3));
+
+ return per_intent;
+}
+
+/*
+ * Compute the maximum size (in fsblocks) of atomic writes that we can complete
+ * given the existing log reservations.
+ */
+xfs_extlen_t
+xfs_calc_max_atomic_write_fsblocks(
+ struct xfs_mount *mp)
+{
+ const struct xfs_trans_res *resv = &M_RES(mp)->tr_atomic_ioend;
+ unsigned int per_intent = 0;
+ unsigned int step_size = 0;
+ unsigned int ret = 0;
+
+ if (resv->tr_logres > 0) {
+ per_intent = xfs_calc_atomic_write_ioend_geometry(mp,
+ &step_size);
+
+ if (resv->tr_logres >= step_size)
+ ret = (resv->tr_logres - step_size) / per_intent;
+ }
+
+ trace_xfs_calc_max_atomic_write_fsblocks(mp, per_intent, step_size,
+ resv->tr_logres, ret);
+
+ return ret;
+}
+
+/*
+ * Compute the log blocks and transaction reservation needed to complete an
+ * atomic write of a given number of blocks. Worst case, each block requires
+ * separate handling. A return value of 0 means something went wrong.
+ */
+xfs_extlen_t
+xfs_calc_atomic_write_log_geometry(
+ struct xfs_mount *mp,
+ xfs_extlen_t blockcount,
+ unsigned int *new_logres)
+{
+ struct xfs_trans_res *curr_res = &M_RES(mp)->tr_atomic_ioend;
+ uint old_logres = curr_res->tr_logres;
+ unsigned int per_intent, step_size;
+ unsigned int logres;
+ xfs_extlen_t min_logblocks;
+
+ ASSERT(blockcount > 0);
+
+ xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));
+
+ per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size);
+
+ /* Check for overflows */
+ if (check_mul_overflow(blockcount, per_intent, &logres) ||
+ check_add_overflow(logres, step_size, &logres))
+ return 0;
+
+ curr_res->tr_logres = logres;
+ min_logblocks = xfs_log_calc_minimum_size(mp);
+ curr_res->tr_logres = old_logres;
+
+ trace_xfs_calc_max_atomic_write_log_geometry(mp, per_intent, step_size,
+ blockcount, min_logblocks, logres);
+
+ *new_logres = logres;
+ return min_logblocks;
+}
+
+/*
+ * Compute the transaction reservation needed to complete an out of place
+ * atomic write of a given number of blocks.
+ */
+int
+xfs_calc_atomic_write_reservation(
+ struct xfs_mount *mp,
+ xfs_extlen_t blockcount)
+{
+ unsigned int new_logres;
+ xfs_extlen_t min_logblocks;
+
+ /*
+ * If the caller doesn't ask for a specific atomic write size, then
+ * use the defaults.
+ */
+ if (blockcount == 0) {
+ xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));
+ return 0;
+ }
+
+ min_logblocks = xfs_calc_atomic_write_log_geometry(mp, blockcount,
+ &new_logres);
+ if (!min_logblocks || min_logblocks > mp->m_sb.sb_logblocks)
+ return -EINVAL;
+
+ M_RES(mp)->tr_atomic_ioend.tr_logres = new_logres;
+ return 0;
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 0554b9d775d2..336279e0fc61 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -48,6 +48,7 @@ struct xfs_trans_resv {
struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
struct xfs_trans_res tr_sb; /* modify superblock */
struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */
+ struct xfs_trans_res tr_atomic_ioend; /* untorn write completion */
};
/* shorthand way of accessing reservation structure */
@@ -98,8 +99,32 @@ struct xfs_trans_resv {
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+unsigned int xfs_calc_finish_bui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
+unsigned int xfs_calc_finish_efi_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+unsigned int xfs_calc_finish_rt_efi_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
+unsigned int xfs_calc_finish_rui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+unsigned int xfs_calc_finish_rt_rui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
+unsigned int xfs_calc_finish_cui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+unsigned int xfs_calc_finish_rt_cui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
+xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp);
+xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp,
+ xfs_extlen_t blockcount, unsigned int *new_logres);
+int xfs_calc_atomic_write_reservation(struct xfs_mount *mp,
+ xfs_extlen_t blockcount);
+
#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 1155ff2d37e2..d89b570aafcc 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -14,6 +14,19 @@
#define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp) \
(((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0]))
+/* Worst case number of realtime rmaps that can be held in a block. */
+#define XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) \
+ (((mp)->m_rtrmap_mxr[0]) - ((mp)->m_rtrmap_mnr[0]))
+
+/* Adding one realtime rmap could split every level to the top of the tree. */
+#define XFS_RTRMAPADD_SPACE_RES(mp) ((mp)->m_rtrmap_maxlevels)
+
+/* Blocks we might need to add "b" realtime rmaps to a tree. */
+#define XFS_NRTRMAPADD_SPACE_RES(mp, b) \
+ ((((b) + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) - 1) / \
+ XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * \
+ XFS_RTRMAPADD_SPACE_RES(mp))
+
/* Worst case number of rmaps that can be held in a block. */
#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \
(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index c299b16c9365..1faf04204c5d 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -12,6 +12,8 @@
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
/*
@@ -111,7 +113,7 @@ xfs_verify_ino(
/* Is this an internal inode number? */
inline bool
-xfs_internal_inum(
+xfs_is_sb_inum(
struct xfs_mount *mp,
xfs_ino_t ino)
{
@@ -129,24 +131,42 @@ xfs_verify_dir_ino(
struct xfs_mount *mp,
xfs_ino_t ino)
{
- if (xfs_internal_inum(mp, ino))
+ if (xfs_is_sb_inum(mp, ino))
return false;
return xfs_verify_ino(mp, ino);
}
/*
- * Verify that an realtime block number pointer doesn't point off the
- * end of the realtime device.
+ * Verify that a realtime block number pointer neither points outside the
+ * allocatable areas of the rtgroup nor off the end of the realtime
+ * device.
*/
inline bool
xfs_verify_rtbno(
struct xfs_mount *mp,
xfs_rtblock_t rtbno)
{
+ if (xfs_has_rtgroups(mp)) {
+ xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
+ xfs_rtxnum_t rtx = xfs_rtb_to_rtx(mp, rtbno);
+
+ if (rgno >= mp->m_sb.sb_rgcount)
+ return false;
+ if (rtx >= xfs_rtgroup_extents(mp, rgno))
+ return false;
+ if (xfs_has_rtsb(mp) && rgno == 0 && rtx == 0)
+ return false;
+ return true;
+ }
+
return rtbno < mp->m_sb.sb_rblocks;
}
-/* Verify that a realtime device extent is fully contained inside the volume. */
+/*
+ * Verify that an allocated realtime device extent neither points outside
+ * allocatable areas of the rtgroup, across an rtgroup boundary, nor off the
+ * end of the realtime device.
+ */
bool
xfs_verify_rtbext(
struct xfs_mount *mp,
@@ -159,7 +179,14 @@ xfs_verify_rtbext(
if (!xfs_verify_rtbno(mp, rtbno))
return false;
- return xfs_verify_rtbno(mp, rtbno + len - 1);
+ if (!xfs_verify_rtbno(mp, rtbno + len - 1))
+ return false;
+
+ if (xfs_has_rtgroups(mp) &&
+ xfs_rtb_to_rgno(mp, rtbno) != xfs_rtb_to_rgno(mp, rtbno + len - 1))
+ return false;
+
+ return true;
}
/* Calculate the range of valid icount values. */
@@ -170,13 +197,12 @@ xfs_icount_range(
unsigned long long *max)
{
unsigned long long nr_inos = 0;
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
/* root, rtbitmap, rtsum all live in the first chunk */
*min = XFS_INODES_PER_CHUNK;
- for_each_perag(mp, agno, pag)
+ while ((pag = xfs_perag_next(mp, pag)))
nr_inos += pag->agino_max - pag->agino_min + 1;
*max = nr_inos;
}
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index a8cd44d03ef6..f6f4f2d4b5db 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -9,10 +9,12 @@
typedef uint32_t prid_t; /* project ID */
typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
+typedef uint32_t xfs_rgblock_t; /* blockno in realtime group */
typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef uint32_t xfs_extlen_t; /* extent length in blocks */
typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */
typedef uint32_t xfs_agnumber_t; /* allocation group number */
+typedef uint32_t xfs_rgnumber_t; /* realtime group number */
typedef uint64_t xfs_extnum_t; /* # of extents in a file */
typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
typedef int64_t xfs_fsize_t; /* bytes in a file */
@@ -53,7 +55,9 @@ typedef void * xfs_failaddr_t;
#define NULLFILEOFF ((xfs_fileoff_t)-1)
#define NULLAGBLOCK ((xfs_agblock_t)-1)
+#define NULLRGBLOCK ((xfs_rgblock_t)-1)
#define NULLAGNUMBER ((xfs_agnumber_t)-1)
+#define NULLRGNUMBER ((xfs_rgnumber_t)-1)
#define NULLCOMMITLSN ((xfs_lsn_t)-1)
@@ -198,6 +202,13 @@ enum xfs_ag_resv_type {
* altering fdblocks. If you think you need this you're wrong.
*/
XFS_AG_RESV_IGNORE,
+
+ /*
+ * This allocation activity is being done on behalf of a metadata file.
+ * These files maintain their own permanent space reservations and are
+ * required to adjust fdblocks using the xfs_metafile_resv_* helpers.
+ */
+ XFS_AG_RESV_METAFILE,
};
/* Results of scanning a btree keyspace to check occupancy. */
@@ -212,6 +223,44 @@ enum xbtree_recpacking {
XBTREE_RECPACKING_FULL,
};
+enum xfs_group_type {
+ XG_TYPE_AG,
+ XG_TYPE_RTG,
+ XG_TYPE_MAX,
+} __packed;
+
+#define XG_TYPE_STRINGS \
+ { XG_TYPE_AG, "ag" }, \
+ { XG_TYPE_RTG, "rtg" }
+
+enum xfs_free_counter {
+ /*
+ * Number of free blocks on the data device.
+ */
+ XC_FREE_BLOCKS,
+
+ /*
+ * Number of free RT extents on the RT device.
+ */
+ XC_FREE_RTEXTENTS,
+
+ /*
+ * Number of available for use RT extents.
+ *
+ * This counter only exists for zoned RT device and indicates the number
+ * of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS
+ * also includes blocks that have been written previously and freed, but
+ * sit in a rtgroup that still needs a zone reset.
+ */
+ XC_FREE_RTAVAILABLE,
+ XC_FREE_NR,
+};
+
+#define XFS_FREECOUNTER_STR \
+ { XC_FREE_BLOCKS, "blocks" }, \
+ { XC_FREE_RTEXTENTS, "rtextents" }, \
+ { XC_FREE_RTAVAILABLE, "rtavailable" }
+
/*
* Type verifier functions
*/
@@ -222,7 +271,7 @@ bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno,
xfs_fsblock_t len);
bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
-bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
+bool xfs_is_sb_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
bool xfs_verify_rtbext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c
new file mode 100644
index 000000000000..b0791a71931c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_zones.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtgroup.h"
+#include "xfs_zones.h"
+
+static bool
+xfs_zone_validate_empty(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (rtg_rmap(rtg)->i_used_blocks > 0) {
+ xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).",
+ rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
+ return false;
+ }
+
+ *write_pointer = 0;
+ return true;
+}
+
+static bool
+xfs_zone_validate_wp(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp);
+
+ if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
+ xfs_warn(mp, "zone %u has too large used counter (0x%x).",
+ rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
+ return false;
+ }
+
+ if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) {
+ xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.",
+ rtg_rgno(rtg), wp_fsb);
+ return false;
+ }
+
+ *write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb);
+ if (*write_pointer >= rtg->rtg_extents) {
+ xfs_warn(mp, "zone %u has invalid write pointer (0x%x).",
+ rtg_rgno(rtg), *write_pointer);
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+xfs_zone_validate_full(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
+ xfs_warn(mp, "zone %u has too large used counter (0x%x).",
+ rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
+ return false;
+ }
+
+ *write_pointer = rtg->rtg_extents;
+ return true;
+}
+
+static bool
+xfs_zone_validate_seq(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_EMPTY:
+ return xfs_zone_validate_empty(zone, rtg, write_pointer);
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ return xfs_zone_validate_wp(zone, rtg, write_pointer);
+ case BLK_ZONE_COND_FULL:
+ return xfs_zone_validate_full(zone, rtg, write_pointer);
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return false;
+ default:
+ xfs_warn(mp, "zone %u has unknown zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return false;
+ }
+}
+
+static bool
+xfs_zone_validate_conv(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_NOT_WP:
+ return true;
+ default:
+ xfs_warn(mp,
+"conventional zone %u has unsupported zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return false;
+ }
+}
+
+bool
+xfs_zone_validate(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+ uint32_t expected_size;
+
+ /*
+ * Check that the zone capacity matches the rtgroup size stored in the
+ * superblock. Note that all zones including the last one must have a
+ * uniform capacity.
+ */
+ if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) {
+ xfs_warn(mp,
+"zone %u capacity (0x%llx) does not match RT group size (0x%x).",
+ rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity),
+ g->blocks);
+ return false;
+ }
+
+ if (g->has_daddr_gaps) {
+ expected_size = 1 << g->blklog;
+ } else {
+ if (zone->len != zone->capacity) {
+ xfs_warn(mp,
+"zone %u has capacity != size ((0x%llx vs 0x%llx)",
+ rtg_rgno(rtg),
+ XFS_BB_TO_FSB(mp, zone->len),
+ XFS_BB_TO_FSB(mp, zone->capacity));
+ return false;
+ }
+ expected_size = g->blocks;
+ }
+
+ if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) {
+ xfs_warn(mp,
+"zone %u length (0x%llx) does match geometry (0x%x).",
+ rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len),
+ expected_size);
+ }
+
+ switch (zone->type) {
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ return xfs_zone_validate_conv(zone, rtg);
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ return xfs_zone_validate_seq(zone, rtg, write_pointer);
+ default:
+ xfs_warn(mp, "zoned %u has unsupported type 0x%x.",
+ rtg_rgno(rtg), zone->type);
+ return false;
+ }
+}
diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h
new file mode 100644
index 000000000000..c4f1367b2cca
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_zones.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LIBXFS_ZONES_H
+#define _LIBXFS_ZONES_H
+
+struct xfs_rtgroup;
+
+/*
+ * In order to guarantee forward progress for GC we need to reserve at least
+ * two zones: one that will be used for moving data into and one spare zone
+ * making sure that we have enough space to relocate a nearly-full zone.
+ * To allow for slightly sloppy accounting for when we need to reserve the
+ * second zone, we actually reserve three as that is easier than doing fully
+ * accurate bookkeeping.
+ */
+#define XFS_GC_ZONES 3U
+
+/*
+ * In addition we need two zones for user writes, one open zone for writing
+ * and one to still have available blocks without resetting the open zone
+ * when data in the open zone has been freed.
+ */
+#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1)
+#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1)
+
+/*
+ * Always keep one zone out of the general open zone pool to allow for GC to
+ * happen while other writers are waiting for free space.
+ */
+#define XFS_OPEN_GC_ZONES 1U
+#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U)
+
+bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer);
+
+#endif /* _LIBXFS_ZONES_H */