Diffstat (limited to 'fs/xfs/libxfs')
 fs/xfs/libxfs/xfs_ag_resv.c        |   15
 fs/xfs/libxfs/xfs_alloc.c          |   23
 fs/xfs/libxfs/xfs_bmap.c           |  575
 fs/xfs/libxfs/xfs_bmap.h           |   67
 fs/xfs/libxfs/xfs_bmap_btree.c     |   18
 fs/xfs/libxfs/xfs_btree.c          |    8
 fs/xfs/libxfs/xfs_btree.h          |   16
 fs/xfs/libxfs/xfs_defer.h          |    2
 fs/xfs/libxfs/xfs_format.h         |   97
 fs/xfs/libxfs/xfs_fs.h             |   10
 fs/xfs/libxfs/xfs_inode_buf.c      |   24
 fs/xfs/libxfs/xfs_inode_buf.h      |    1
 fs/xfs/libxfs/xfs_inode_fork.c     |   70
 fs/xfs/libxfs/xfs_inode_fork.h     |   28
 fs/xfs/libxfs/xfs_log_format.h     |  118
 fs/xfs/libxfs/xfs_refcount.c       | 1698
 fs/xfs/libxfs/xfs_refcount.h       |   70
 fs/xfs/libxfs/xfs_refcount_btree.c |  451
 fs/xfs/libxfs/xfs_refcount_btree.h |   74
 fs/xfs/libxfs/xfs_rmap.c           |  914
 fs/xfs/libxfs/xfs_rmap.h           |    7
 fs/xfs/libxfs/xfs_rmap_btree.c     |   82
 fs/xfs/libxfs/xfs_rmap_btree.h     |    7
 fs/xfs/libxfs/xfs_sb.c             |    9
 fs/xfs/libxfs/xfs_shared.h         |    2
 fs/xfs/libxfs/xfs_trans_resv.c     |   23
 fs/xfs/libxfs/xfs_trans_resv.h     |    3
 fs/xfs/libxfs/xfs_trans_space.h    |    9
 fs/xfs/libxfs/xfs_types.h          |    3
29 files changed, 4331 insertions(+), 93 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e3ae0f2b4294..e5ebc3770460 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -38,6 +38,7 @@
#include "xfs_trans_space.h"
#include "xfs_rmap_btree.h"
#include "xfs_btree.h"
+#include "xfs_refcount_btree.h"
/*
* Per-AG Block Reservations
@@ -108,7 +109,9 @@ xfs_ag_resv_critical(
trace_xfs_ag_resv_critical(pag, type, avail);
/* Critically low if less than 10% or max btree height remains. */
- return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS;
+ return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS,
+ pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL,
+ XFS_RANDOM_AG_RESV_CRITICAL);
}
/*
@@ -228,6 +231,11 @@ xfs_ag_resv_init(
if (pag->pag_meta_resv.ar_asked == 0) {
ask = used = 0;
+ error = xfs_refcountbt_calc_reserves(pag->pag_mount,
+ pag->pag_agno, &ask, &used);
+ if (error)
+ goto out;
+
error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
ask, used);
if (error)
@@ -238,6 +246,11 @@ xfs_ag_resv_init(
if (pag->pag_agfl_resv.ar_asked == 0) {
ask = used = 0;
+ error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
+ &ask, &used);
+ if (error)
+ goto out;
+
error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used);
if (error)
goto out;
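
Annotation: the first hunk above wraps the critical-reservation predicate in an error-injection tag without changing its logic. That predicate is simple enough to state standalone; a minimal sketch, where BTREE_MAXLEVELS is an assumed stand-in for the kernel's XFS_BTREE_MAXLEVELS:

	#include <stdbool.h>
	#include <stdint.h>

	#define BTREE_MAXLEVELS	9	/* assumption: stand-in for XFS_BTREE_MAXLEVELS */

	/* Critically low if less than 10% of the original reservation remains,
	 * or fewer blocks than a worst-case btree split could consume. */
	static bool ag_resv_critical(uint32_t avail, uint32_t orig)
	{
		return avail < orig / 10 || avail < BTREE_MAXLEVELS;
	}
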
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index ca75dc90ebe0..effb64cf714f 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -52,10 +52,23 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+unsigned int
+xfs_refc_block(
+ struct xfs_mount *mp)
+{
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return XFS_RMAP_BLOCK(mp) + 1;
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ return XFS_FIBT_BLOCK(mp) + 1;
+ return XFS_IBT_BLOCK(mp) + 1;
+}
+
xfs_extlen_t
xfs_prealloc_blocks(
struct xfs_mount *mp)
{
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ return xfs_refc_block(mp) + 1;
if (xfs_sb_version_hasrmapbt(&mp->m_sb))
return XFS_RMAP_BLOCK(mp) + 1;
if (xfs_sb_version_hasfinobt(&mp->m_sb))
@@ -115,6 +128,8 @@ xfs_alloc_ag_max_usable(
blocks++; /* finobt root block */
if (xfs_sb_version_hasrmapbt(&mp->m_sb))
blocks++; /* rmap root block */
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ blocks++; /* refcount root block */
return mp->m_sb.sb_agblocks - blocks;
}
@@ -2321,6 +2336,9 @@ xfs_alloc_log_agf(
offsetof(xfs_agf_t, agf_btreeblks),
offsetof(xfs_agf_t, agf_uuid),
offsetof(xfs_agf_t, agf_rmap_blocks),
+ offsetof(xfs_agf_t, agf_refcount_blocks),
+ offsetof(xfs_agf_t, agf_refcount_root),
+ offsetof(xfs_agf_t, agf_refcount_level),
/* needed so that we don't log the whole rest of the structure: */
offsetof(xfs_agf_t, agf_spare64),
sizeof(xfs_agf_t)
@@ -2458,6 +2476,10 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
return false;
+ if (xfs_sb_version_hasreflink(&mp->m_sb) &&
+ be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)
+ return false;
+
return true;
}
@@ -2578,6 +2600,7 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
pag->pagf_levels[XFS_BTNUM_RMAPi] =
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
spin_lock_init(&pag->pagb_lock);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
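
Annotation: the new agf_refcount_* entries slot into the logging-offset table in xfs_alloc_log_agf above. The "so that we don't log the whole rest of the structure" comment refers to range logging: the table is sorted in structure order with sizeof() as a sentinel, so a pair of dirty-field indices converts to one contiguous byte range. A hedged sketch of that conversion; field_range is hypothetical, loosely modeled on what the btree offset helper does for this table:

	#include <stddef.h>

	/* Map dirty fields [first, last] to bytes [first_byte, last_byte], given
	 * offsets[] sorted in structure order with a trailing sizeof sentinel. */
	static void field_range(const size_t *offsets, int first, int last,
				size_t *first_byte, size_t *last_byte)
	{
		*first_byte = offsets[first];
		*last_byte = offsets[last + 1] - 1;	/* next offset bounds the field */
	}
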
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9d7f61d36645..c27344cf38e1 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -48,6 +48,7 @@
#include "xfs_filestream.h"
#include "xfs_rmap.h"
#include "xfs_ag_resv.h"
+#include "xfs_refcount.h"
kmem_zone_t *xfs_bmap_free_item_zone;
@@ -140,7 +141,8 @@ xfs_bmbt_lookup_ge(
*/
static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
{
- return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ return whichfork != XFS_COW_FORK &&
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_NEXTENTS(ip, whichfork) >
XFS_IFORK_MAXEXT(ip, whichfork);
}
@@ -150,7 +152,8 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
*/
static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
{
- return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+ return whichfork != XFS_COW_FORK &&
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
XFS_IFORK_NEXTENTS(ip, whichfork) <=
XFS_IFORK_MAXEXT(ip, whichfork);
}
@@ -640,6 +643,7 @@ xfs_bmap_btree_to_extents(
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(whichfork != XFS_COW_FORK);
ASSERT(ifp->if_flags & XFS_IFEXTENTS);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
rblock = ifp->if_broot;
@@ -706,6 +710,7 @@ xfs_bmap_extents_to_btree(
xfs_bmbt_ptr_t *pp; /* root block address pointer */
mp = ip->i_mount;
+ ASSERT(whichfork != XFS_COW_FORK);
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
@@ -748,6 +753,7 @@ xfs_bmap_extents_to_btree(
args.type = XFS_ALLOCTYPE_START_BNO;
args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
} else if (dfops->dop_low) {
+try_another_ag:
args.type = XFS_ALLOCTYPE_START_BNO;
args.fsbno = *firstblock;
} else {
@@ -762,6 +768,21 @@ xfs_bmap_extents_to_btree(
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return error;
}
+
+ /*
+ * During a CoW operation, the allocation and bmbt updates occur in
+ * different transactions. The mapping code tries to put new bmbt
+ * blocks near extents being mapped, but the only way to guarantee this
+ * is if the alloc and the mapping happen in a single transaction that
+ * has a block reservation. That isn't the case here, so if we run out
+ * of space we'll try again with another AG.
+ */
+ if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
+ args.fsbno == NULLFSBLOCK &&
+ args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+ dfops->dop_low = true;
+ goto try_another_ag;
+ }
/*
* Allocation can't fail, the space was reserved.
*/
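
Annotation: the same try_another_ag fallback appears three times in this series (here, in xfs_bmap_local_to_extents, and in xfs_bmbt_alloc_block). Stripped of the XFS specifics, the control flow is roughly the following sketch; the types, try_alloc, and -1 as a null block are stand-ins, not kernel API:

	enum alloc_type { ALLOC_NEAR_BNO, ALLOC_START_BNO };
	struct alloc_args { enum alloc_type type; long long fsbno; };

	/* Retry a failed near-block allocation as a from-the-start search; this
	 * is acceptable because CoW bmbt blocks need not sit near the data. */
	static int alloc_with_cow_fallback(struct alloc_args *args, long long first,
					   int reflink,
					   int (*try_alloc)(struct alloc_args *))
	{
		int error;

		for (;;) {
			error = try_alloc(args);
			if (error || args->fsbno != -1LL)	/* -1 ~ NULLFSBLOCK */
				return error;
			if (!reflink || args->type != ALLOC_NEAR_BNO)
				return 0;	/* already retried, or not a CoW setup */
			args->type = ALLOC_START_BNO;	/* try another AG */
			args->fsbno = first;
		}
	}
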
@@ -837,6 +858,7 @@ xfs_bmap_local_to_extents_empty(
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(whichfork != XFS_COW_FORK);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
ASSERT(ifp->if_bytes == 0);
ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
@@ -896,6 +918,7 @@ xfs_bmap_local_to_extents(
* file currently fits in an inode.
*/
if (*firstblock == NULLFSBLOCK) {
+try_another_ag:
args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
args.type = XFS_ALLOCTYPE_START_BNO;
} else {
@@ -908,6 +931,19 @@ xfs_bmap_local_to_extents(
if (error)
goto done;
+ /*
+ * During a CoW operation, the allocation and bmbt updates occur in
+ * different transactions. The mapping code tries to put new bmbt
+ * blocks near extents being mapped, but the only way to guarantee this
+ * is if the alloc and the mapping happen in a single transaction that
+ * has a block reservation. That isn't the case here, so if we run out
+ * of space we'll try again with another AG.
+ */
+ if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) &&
+ args.fsbno == NULLFSBLOCK &&
+ args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+ goto try_another_ag;
+ }
/* Can't fail, the space was reserved. */
ASSERT(args.fsbno != NULLFSBLOCK);
ASSERT(args.len == 1);
@@ -1670,7 +1706,8 @@ xfs_bmap_one_block(
*/
STATIC int /* error */
xfs_bmap_add_extent_delay_real(
- struct xfs_bmalloca *bma)
+ struct xfs_bmalloca *bma,
+ int whichfork)
{
struct xfs_bmbt_irec *new = &bma->got;
int diff; /* temp value */
@@ -1688,11 +1725,14 @@ xfs_bmap_add_extent_delay_real(
xfs_filblks_t temp=0; /* value for da_new calculations */
xfs_filblks_t temp2=0;/* value for da_new calculations */
int tmp_rval; /* partial logging flags */
- int whichfork = XFS_DATA_FORK;
struct xfs_mount *mp;
+ xfs_extnum_t *nextents;
mp = bma->ip->i_mount;
ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ ASSERT(whichfork != XFS_ATTR_FORK);
+ nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents :
+ &bma->ip->i_d.di_nextents);
ASSERT(bma->idx >= 0);
ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
@@ -1706,6 +1746,9 @@ xfs_bmap_add_extent_delay_real(
#define RIGHT r[1]
#define PREV r[2]
+ if (whichfork == XFS_COW_FORK)
+ state |= BMAP_COWFORK;
+
/*
* Set up a bunch of variables to make the tests simpler.
*/
@@ -1792,7 +1835,7 @@ xfs_bmap_add_extent_delay_real(
trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
- bma->ip->i_d.di_nextents--;
+ (*nextents)--;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -1894,7 +1937,7 @@ xfs_bmap_add_extent_delay_real(
xfs_bmbt_set_startblock(ep, new->br_startblock);
trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- bma->ip->i_d.di_nextents++;
+ (*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -1964,7 +2007,7 @@ xfs_bmap_add_extent_delay_real(
temp = PREV.br_blockcount - new->br_blockcount;
xfs_bmbt_set_blockcount(ep, temp);
xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
- bma->ip->i_d.di_nextents++;
+ (*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2048,7 +2091,7 @@ xfs_bmap_add_extent_delay_real(
trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
- bma->ip->i_d.di_nextents++;
+ (*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2117,7 +2160,7 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_blockcount = temp2;
/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
- bma->ip->i_d.di_nextents++;
+ (*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2215,7 +2258,8 @@ xfs_bmap_add_extent_delay_real(
xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
done:
- bma->logflags |= rval;
+ if (whichfork != XFS_COW_FORK)
+ bma->logflags |= rval;
return error;
#undef LEFT
#undef RIGHT
@@ -2759,6 +2803,7 @@ done:
STATIC void
xfs_bmap_add_extent_hole_delay(
xfs_inode_t *ip, /* incore inode pointer */
+ int whichfork,
xfs_extnum_t *idx, /* extent number to update/insert */
xfs_bmbt_irec_t *new) /* new data to add to file extents */
{
@@ -2770,8 +2815,10 @@ xfs_bmap_add_extent_hole_delay(
int state; /* state bits, accessed thru macros */
xfs_filblks_t temp=0; /* temp for indirect calculations */
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ ifp = XFS_IFORK_PTR(ip, whichfork);
state = 0;
+ if (whichfork == XFS_COW_FORK)
+ state |= BMAP_COWFORK;
ASSERT(isnullstartblock(new->br_startblock));
/*
@@ -2789,7 +2836,7 @@ xfs_bmap_add_extent_hole_delay(
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
- if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
state |= BMAP_RIGHT_VALID;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
@@ -2923,6 +2970,7 @@ xfs_bmap_add_extent_hole_real(
ASSERT(!isnullstartblock(new->br_startblock));
ASSERT(!bma->cur ||
!(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+ ASSERT(whichfork != XFS_COW_FORK);
XFS_STATS_INC(mp, xs_add_exlist);
@@ -3648,7 +3696,9 @@ xfs_bmap_btalloc(
else if (mp->m_dalign)
stripe_align = mp->m_dalign;
- if (xfs_alloc_is_userdata(ap->datatype))
+ if (ap->flags & XFS_BMAPI_COWFORK)
+ align = xfs_get_cowextsz_hint(ap->ip);
+ else if (xfs_alloc_is_userdata(ap->datatype))
align = xfs_get_extsz_hint(ap->ip);
if (unlikely(align)) {
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
@@ -3856,7 +3906,8 @@ xfs_bmap_btalloc(
ASSERT(nullfb || fb_agno == args.agno ||
(ap->dfops->dop_low && fb_agno < args.agno));
ap->length = args.len;
- ap->ip->i_d.di_nblocks += args.len;
+ if (!(ap->flags & XFS_BMAPI_COWFORK))
+ ap->ip->i_d.di_nblocks += args.len;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel)
ap->ip->i_delayed_blks -= args.len;
@@ -3876,6 +3927,63 @@ xfs_bmap_btalloc(
}
/*
+ * For a remap operation, just "allocate" an extent at the address that the
+ * caller passed in, and ensure that the AGFL is the right size. The caller
+ * will then map the "allocated" extent into the file somewhere.
+ */
+STATIC int
+xfs_bmap_remap_alloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_trans *tp = ap->tp;
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agblock_t bno;
+ struct xfs_alloc_arg args;
+ int error;
+
+ /*
+	 * Validate that the block number is legal - this enables us to detect
+	 * and handle silent filesystem corruption rather than crashing.
+ */
+ memset(&args, 0, sizeof(struct xfs_alloc_arg));
+ args.tp = ap->tp;
+ args.mp = ap->tp->t_mountp;
+ bno = *ap->firstblock;
+ args.agno = XFS_FSB_TO_AGNO(mp, bno);
+ args.agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ if (args.agno >= mp->m_sb.sb_agcount ||
+ args.agbno >= mp->m_sb.sb_agblocks)
+ return -EFSCORRUPTED;
+
+ /* "Allocate" the extent from the range we passed in. */
+ trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length);
+ ap->blkno = bno;
+ ap->ip->i_d.di_nblocks += ap->length;
+ xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+
+ /* Fix the freelist, like a real allocator does. */
+ args.datatype = ap->datatype;
+ args.pag = xfs_perag_get(args.mp, args.agno);
+ ASSERT(args.pag);
+
+ /*
+ * The freelist fixing code will decline the allocation if
+ * the size and shape of the free space doesn't allow for
+ * allocating the extent and updating all the metadata that
+ * happens during an allocation. We're remapping, not
+ * allocating, so skip that check by pretending to be freeing.
+ */
+ error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+ if (error)
+ goto error0;
+error0:
+ xfs_perag_put(args.pag);
+ if (error)
+ trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_);
+ return error;
+}
+
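
Annotation: the (agno, agbno) split in xfs_bmap_remap_alloc relies on the FSB encoding, in which the AG number lives above sb_agblklog bits and the AG-relative block below. A standalone sketch of the bounds check under that assumption:

	#include <stdbool.h>
	#include <stdint.h>

	/* Decompose a filesystem block number and reject out-of-range pieces,
	 * so a corrupt remap request fails with an error instead of crashing. */
	static bool fsb_valid(uint64_t fsbno, unsigned int agblklog,
			      uint32_t agcount, uint32_t agblocks)
	{
		uint32_t agno = (uint32_t)(fsbno >> agblklog);
		uint32_t agbno = (uint32_t)(fsbno & ((1ULL << agblklog) - 1));

		return agno < agcount && agbno < agblocks;
	}
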
+/*
* xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
* It figures out where to ask the underlying allocator to put the new extent.
*/
@@ -3883,6 +3991,8 @@ STATIC int
xfs_bmap_alloc(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
+ if (ap->flags & XFS_BMAPI_REMAP)
+ return xfs_bmap_remap_alloc(ap);
if (XFS_IS_REALTIME_INODE(ap->ip) &&
xfs_alloc_is_userdata(ap->datatype))
return xfs_bmap_rtalloc(ap);
@@ -4012,12 +4122,11 @@ xfs_bmapi_read(
int error;
int eof;
int n = 0;
- int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
+ int whichfork = xfs_bmapi_whichfork(flags);
ASSERT(*nmap >= 1);
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
- XFS_BMAPI_IGSTATE)));
+ XFS_BMAPI_IGSTATE|XFS_BMAPI_COWFORK)));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
if (unlikely(XFS_TEST_ERROR(
@@ -4035,6 +4144,16 @@ xfs_bmapi_read(
ifp = XFS_IFORK_PTR(ip, whichfork);
+ /* No CoW fork? Return a hole. */
+ if (whichfork == XFS_COW_FORK && !ifp) {
+ mval->br_startoff = bno;
+ mval->br_startblock = HOLESTARTBLOCK;
+ mval->br_blockcount = len;
+ mval->br_state = XFS_EXT_NORM;
+ *nmap = 1;
+ return 0;
+ }
+
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, whichfork);
if (error)
@@ -4084,6 +4203,7 @@ xfs_bmapi_read(
int
xfs_bmapi_reserve_delalloc(
struct xfs_inode *ip,
+ int whichfork,
xfs_fileoff_t aoff,
xfs_filblks_t len,
struct xfs_bmbt_irec *got,
@@ -4092,7 +4212,7 @@ xfs_bmapi_reserve_delalloc(
int eof)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_extlen_t alen;
xfs_extlen_t indlen;
char rt = XFS_IS_REALTIME_INODE(ip);
@@ -4104,7 +4224,10 @@ xfs_bmapi_reserve_delalloc(
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
/* Figure out the extent size, adjust alen */
- extsz = xfs_get_extsz_hint(ip);
+ if (whichfork == XFS_COW_FORK)
+ extsz = xfs_get_cowextsz_hint(ip);
+ else
+ extsz = xfs_get_extsz_hint(ip);
if (extsz) {
error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
1, 0, &aoff, &alen);
@@ -4151,7 +4274,7 @@ xfs_bmapi_reserve_delalloc(
got->br_startblock = nullstartblock(indlen);
got->br_blockcount = alen;
got->br_state = XFS_EXT_NORM;
- xfs_bmap_add_extent_hole_delay(ip, lastx, got);
+ xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
/*
* Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
@@ -4182,8 +4305,7 @@ xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
{
struct xfs_mount *mp = bma->ip->i_mount;
- int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
+ int whichfork = xfs_bmapi_whichfork(bma->flags);
struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
int tmp_logflags = 0;
int error;
@@ -4278,7 +4400,7 @@ xfs_bmapi_allocate(
bma->got.br_state = XFS_EXT_UNWRITTEN;
if (bma->wasdel)
- error = xfs_bmap_add_extent_delay_real(bma);
+ error = xfs_bmap_add_extent_delay_real(bma, whichfork);
else
error = xfs_bmap_add_extent_hole_real(bma, whichfork);
@@ -4308,8 +4430,7 @@ xfs_bmapi_convert_unwritten(
xfs_filblks_t len,
int flags)
{
- int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
+ int whichfork = xfs_bmapi_whichfork(flags);
struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
int tmp_logflags = 0;
int error;
@@ -4325,6 +4446,8 @@ xfs_bmapi_convert_unwritten(
(XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
return 0;
+ ASSERT(whichfork != XFS_COW_FORK);
+
/*
* Modify (by adding) the state flag, if writing.
*/
@@ -4431,8 +4554,7 @@ xfs_bmapi_write(
orig_mval = mval;
orig_nmap = *nmap;
#endif
- whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
+ whichfork = xfs_bmapi_whichfork(flags);
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
@@ -4441,6 +4563,11 @@ xfs_bmapi_write(
ASSERT(len > 0);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK);
+ ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP));
+ ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP));
+ ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK);
+ ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK);
/* zeroing is currently only for data extents, not metadata */
ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
@@ -4502,6 +4629,14 @@ xfs_bmapi_write(
wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
/*
+ * Make sure we only reflink into a hole.
+ */
+ if (flags & XFS_BMAPI_REMAP)
+ ASSERT(inhole);
+ if (flags & XFS_BMAPI_COWFORK)
+ ASSERT(!inhole);
+
+ /*
* First, deal with the hole before the allocated space
* that we found, if any.
*/
@@ -4531,6 +4666,17 @@ xfs_bmapi_write(
goto error0;
if (bma.blkno == NULLFSBLOCK)
break;
+
+ /*
+ * If this is a CoW allocation, record the data in
+ * the refcount btree for orphan recovery.
+ */
+ if (whichfork == XFS_COW_FORK) {
+ error = xfs_refcount_alloc_cow_extent(mp, dfops,
+ bma.blkno, bma.length);
+ if (error)
+ goto error0;
+ }
}
/* Deal with the allocated space we found. */
@@ -4696,7 +4842,8 @@ xfs_bmap_del_extent(
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
+ int whichfork, /* data or attr fork */
+ int bflags) /* bmapi flags */
{
xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
@@ -4725,6 +4872,8 @@ xfs_bmap_del_extent(
if (whichfork == XFS_ATTR_FORK)
state |= BMAP_ATTRFORK;
+ else if (whichfork == XFS_COW_FORK)
+ state |= BMAP_COWFORK;
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
@@ -4805,6 +4954,7 @@ xfs_bmap_del_extent(
/*
* Matches the whole extent. Delete the entry.
*/
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_iext_remove(ip, *idx, 1,
whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
--*idx;
@@ -4988,9 +5138,16 @@ xfs_bmap_del_extent(
/*
* If we need to, add to list of extents to delete.
*/
- if (do_fx)
- xfs_bmap_add_free(mp, dfops, del->br_startblock,
- del->br_blockcount, NULL);
+ if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
+ if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
+ error = xfs_refcount_decrease_extent(mp, dfops, del);
+ if (error)
+ goto done;
+ } else
+ xfs_bmap_add_free(mp, dfops, del->br_startblock,
+ del->br_blockcount, NULL);
+ }
+
/*
* Adjust inode # blocks in the file.
*/
@@ -4999,7 +5156,7 @@ xfs_bmap_del_extent(
/*
* Adjust quota data.
*/
- if (qfield)
+ if (qfield && !(bflags & XFS_BMAPI_REMAP))
xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
/*
@@ -5014,6 +5171,175 @@ done:
return error;
}
+/* Remove an extent from the CoW fork. Similar to xfs_bmap_del_extent. */
+int
+xfs_bunmapi_cow(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *del)
+{
+ xfs_filblks_t da_new;
+ xfs_filblks_t da_old;
+ xfs_fsblock_t del_endblock = 0;
+ xfs_fileoff_t del_endoff;
+ int delay;
+ struct xfs_bmbt_rec_host *ep;
+ int error;
+ struct xfs_bmbt_irec got;
+ xfs_fileoff_t got_endoff;
+ struct xfs_ifork *ifp;
+ struct xfs_mount *mp;
+ xfs_filblks_t nblks;
+ struct xfs_bmbt_irec new;
+ /* REFERENCED */
+ uint qfield;
+ xfs_filblks_t temp;
+ xfs_filblks_t temp2;
+ int state = BMAP_COWFORK;
+ int eof;
+ xfs_extnum_t eidx;
+
+ mp = ip->i_mount;
+ XFS_STATS_INC(mp, xs_del_exlist);
+
+ ep = xfs_bmap_search_extents(ip, del->br_startoff, XFS_COW_FORK, &eof,
+ &eidx, &got, &new);
+
+ ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ifp = ifp;
+ ASSERT((eidx >= 0) && (eidx < ifp->if_bytes /
+ (uint)sizeof(xfs_bmbt_rec_t)));
+ ASSERT(del->br_blockcount > 0);
+ ASSERT(got.br_startoff <= del->br_startoff);
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got.br_startoff + got.br_blockcount;
+ ASSERT(got_endoff >= del_endoff);
+ delay = isnullstartblock(got.br_startblock);
+ ASSERT(isnullstartblock(del->br_startblock) == delay);
+ qfield = 0;
+ error = 0;
+ /*
+ * If deleting a real allocation, must free up the disk space.
+ */
+ if (!delay) {
+ nblks = del->br_blockcount;
+ qfield = XFS_TRANS_DQ_BCOUNT;
+ /*
+ * Set up del_endblock and cur for later.
+ */
+ del_endblock = del->br_startblock + del->br_blockcount;
+ da_old = da_new = 0;
+ } else {
+ da_old = startblockval(got.br_startblock);
+ da_new = 0;
+ nblks = 0;
+ }
+ qfield = qfield;
+ nblks = nblks;
+
+ /*
+ * Set flag value to use in switch statement.
+ * Left-contig is 2, right-contig is 1.
+ */
+ switch (((got.br_startoff == del->br_startoff) << 1) |
+ (got_endoff == del_endoff)) {
+ case 3:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, eidx, 1, BMAP_COWFORK);
+ --eidx;
+ break;
+
+ case 2:
+ /*
+ * Deleting the first part of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_);
+ xfs_bmbt_set_startoff(ep, del_endoff);
+ temp = got.br_blockcount - del->br_blockcount;
+ xfs_bmbt_set_blockcount(ep, temp);
+ if (delay) {
+ temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ da_old);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+ da_new = temp;
+ break;
+ }
+ xfs_bmbt_set_startblock(ep, del_endblock);
+ trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+ break;
+
+ case 1:
+ /*
+ * Deleting the last part of the extent.
+ */
+ temp = got.br_blockcount - del->br_blockcount;
+ trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep, temp);
+ if (delay) {
+ temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ da_old);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+ da_new = temp;
+ break;
+ }
+ trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+ break;
+
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ */
+ temp = del->br_startoff - got.br_startoff;
+ trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep, temp);
+ new.br_startoff = del_endoff;
+ temp2 = got_endoff - del_endoff;
+ new.br_blockcount = temp2;
+ new.br_state = got.br_state;
+ if (!delay) {
+ new.br_startblock = del_endblock;
+ } else {
+ temp = xfs_bmap_worst_indlen(ip, temp);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ temp2 = xfs_bmap_worst_indlen(ip, temp2);
+ new.br_startblock = nullstartblock((int)temp2);
+ da_new = temp + temp2;
+ while (da_new > da_old) {
+ if (temp) {
+ temp--;
+ da_new--;
+ xfs_bmbt_set_startblock(ep,
+ nullstartblock((int)temp));
+ }
+ if (da_new == da_old)
+ break;
+ if (temp2) {
+ temp2--;
+ da_new--;
+ new.br_startblock =
+ nullstartblock((int)temp2);
+ }
+ }
+ }
+ trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+ xfs_iext_insert(ip, eidx + 1, 1, &new, state);
+ ++eidx;
+ break;
+ }
+
+ /*
+ * Account for change in delayed indirect blocks.
+ * Nothing to do for disk quota accounting here.
+ */
+ ASSERT(da_old >= da_new);
+ if (da_old > da_new)
+ xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false);
+
+ return error;
+}
+
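
Annotation: the switch in xfs_bunmapi_cow above packs which edges of the existing extent the deletion touches into a two-bit case number. As a standalone sketch:

	#include <stdint.h>

	/* 3 = delete the whole extent, 2 = delete the first part, 1 = delete
	 * the last part, 0 = delete from the middle (split the extent in two). */
	static int del_extent_case(uint64_t got_off, uint64_t got_end,
				   uint64_t del_off, uint64_t del_end)
	{
		return (int)(((got_off == del_off) << 1) | (got_end == del_end));
	}
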
/*
* Unmap (remove) blocks from a file.
* If nexts is nonzero then the number of extents to remove is limited to
@@ -5021,17 +5347,16 @@ done:
* *done is set.
*/
int /* error */
-xfs_bunmapi(
+__xfs_bunmapi(
xfs_trans_t *tp, /* transaction pointer */
struct xfs_inode *ip, /* incore inode */
xfs_fileoff_t bno, /* starting offset to unmap */
- xfs_filblks_t len, /* length to unmap in file */
+ xfs_filblks_t *rlen, /* i/o: amount remaining */
int flags, /* misc flags */
xfs_extnum_t nexts, /* number of extents max */
xfs_fsblock_t *firstblock, /* first allocated block
controls a.g. for allocs */
- struct xfs_defer_ops *dfops, /* i/o: list extents to free */
- int *done) /* set if not done yet */
+ struct xfs_defer_ops *dfops) /* i/o: deferred updates */
{
xfs_btree_cur_t *cur; /* bmap btree cursor */
xfs_bmbt_irec_t del; /* extent being deleted */
@@ -5053,11 +5378,12 @@ xfs_bunmapi(
int wasdel; /* was a delayed alloc extent */
int whichfork; /* data or attribute fork */
xfs_fsblock_t sum;
+ xfs_filblks_t len = *rlen; /* length to unmap in file */
trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
- whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
+ whichfork = xfs_bmapi_whichfork(flags);
+ ASSERT(whichfork != XFS_COW_FORK);
ifp = XFS_IFORK_PTR(ip, whichfork);
if (unlikely(
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5079,7 +5405,7 @@ xfs_bunmapi(
return error;
nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
if (nextents == 0) {
- *done = 1;
+ *rlen = 0;
return 0;
}
XFS_STATS_INC(mp, xs_blk_unmap);
@@ -5324,7 +5650,7 @@ xfs_bunmapi(
cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del,
- &tmp_logflags, whichfork);
+ &tmp_logflags, whichfork, flags);
logflags |= tmp_logflags;
if (error)
goto error0;
@@ -5350,7 +5676,10 @@ nodelete:
extno++;
}
}
- *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
+ if (bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0)
+ *rlen = 0;
+ else
+ *rlen = bno - start + 1;
/*
* Convert to a btree if necessary.
@@ -5406,6 +5735,27 @@ error0:
return error;
}
+/* Unmap a range of a file. */
+int
+xfs_bunmapi(
+ xfs_trans_t *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ int flags,
+ xfs_extnum_t nexts,
+ xfs_fsblock_t *firstblock,
+ struct xfs_defer_ops *dfops,
+ int *done)
+{
+ int error;
+
+ error = __xfs_bunmapi(tp, ip, bno, &len, flags, nexts, firstblock,
+ dfops);
+ *done = (len == 0);
+ return error;
+}
+
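
Annotation: with the remaining-length out-parameter, callers that used the old done flag can instead loop until the range drains. A hedged usage sketch against the __xfs_bunmapi signature above; surrounding locals, locking, and error handling are elided:

	xfs_filblks_t	rlen = len;
	int		error = 0;

	/* Unmap at most one extent per call until nothing remains. */
	while (!error && rlen > 0)
		error = __xfs_bunmapi(tp, ip, bno, &rlen, flags, 1,
				      &firstblock, dfops);
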
/*
* Determine whether an extent shift can be accomplished by a merge with the
* extent that precedes the target hole of the shift.
@@ -5985,3 +6335,146 @@ out:
xfs_trans_cancel(tp);
return error;
}
+
+/* Deferred mapping is only for real extents in the data fork. */
+static bool
+xfs_bmap_is_update_needed(
+ struct xfs_bmbt_irec *bmap)
+{
+ return bmap->br_startblock != HOLESTARTBLOCK &&
+ bmap->br_startblock != DELAYSTARTBLOCK;
+}
+
+/* Record a bmap intent. */
+static int
+__xfs_bmap_add(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ enum xfs_bmap_intent_type type,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *bmap)
+{
+ int error;
+ struct xfs_bmap_intent *bi;
+
+ trace_xfs_bmap_defer(mp,
+ XFS_FSB_TO_AGNO(mp, bmap->br_startblock),
+ type,
+ XFS_FSB_TO_AGBNO(mp, bmap->br_startblock),
+ ip->i_ino, whichfork,
+ bmap->br_startoff,
+ bmap->br_blockcount,
+ bmap->br_state);
+
+ bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS);
+ INIT_LIST_HEAD(&bi->bi_list);
+ bi->bi_type = type;
+ bi->bi_owner = ip;
+ bi->bi_whichfork = whichfork;
+ bi->bi_bmap = *bmap;
+
+ error = xfs_defer_join(dfops, bi->bi_owner);
+ if (error) {
+ kmem_free(bi);
+ return error;
+ }
+
+ xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list);
+ return 0;
+}
+
+/* Map an extent into a file. */
+int
+xfs_bmap_map_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_bmap_is_update_needed(PREV))
+ return 0;
+
+ return __xfs_bmap_add(mp, dfops, XFS_BMAP_MAP, ip,
+ XFS_DATA_FORK, PREV);
+}
+
+/* Unmap an extent out of a file. */
+int
+xfs_bmap_unmap_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_bmap_is_update_needed(PREV))
+ return 0;
+
+ return __xfs_bmap_add(mp, dfops, XFS_BMAP_UNMAP, ip,
+ XFS_DATA_FORK, PREV);
+}
+
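
Annotation: callers queue these intents on a deferred-ops list and let xfs_bmap_finish_one replay them in a follow-up transaction. A hedged usage sketch; offset_fsb, fsbno, and len_fsb are hypothetical locals, not kernel names:

	struct xfs_bmbt_irec	irec = {
		.br_startoff	= offset_fsb,
		.br_startblock	= fsbno,
		.br_blockcount	= len_fsb,
		.br_state	= XFS_EXT_NORM,
	};

	/* Defer "map this extent into ip's data fork"; holes and delalloc
	 * extents are filtered out by xfs_bmap_is_update_needed(). */
	error = xfs_bmap_map_extent(mp, dfops, ip, &irec);
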
+/*
+ * Process one of the deferred bmap operations. We pass back the
+ * btree cursor to maintain our lock on the bmapbt between calls.
+ */
+int
+xfs_bmap_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip,
+ enum xfs_bmap_intent_type type,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state)
+{
+ struct xfs_bmbt_irec bmap;
+ int nimaps = 1;
+ xfs_fsblock_t firstfsb;
+ int flags = XFS_BMAPI_REMAP;
+ int done;
+ int error = 0;
+
+ bmap.br_startblock = startblock;
+ bmap.br_startoff = startoff;
+ bmap.br_blockcount = blockcount;
+ bmap.br_state = state;
+
+ trace_xfs_bmap_deferred(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
+ ip->i_ino, whichfork, startoff, blockcount, state);
+
+ if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
+ return -EFSCORRUPTED;
+ if (whichfork == XFS_ATTR_FORK)
+ flags |= XFS_BMAPI_ATTRFORK;
+
+ if (XFS_TEST_ERROR(false, tp->t_mountp,
+ XFS_ERRTAG_BMAP_FINISH_ONE,
+ XFS_RANDOM_BMAP_FINISH_ONE))
+ return -EIO;
+
+ switch (type) {
+ case XFS_BMAP_MAP:
+ firstfsb = bmap.br_startblock;
+ error = xfs_bmapi_write(tp, ip, bmap.br_startoff,
+ bmap.br_blockcount, flags, &firstfsb,
+ bmap.br_blockcount, &bmap, &nimaps,
+ dfops);
+ break;
+ case XFS_BMAP_UNMAP:
+ error = xfs_bunmapi(tp, ip, bmap.br_startoff,
+ bmap.br_blockcount, flags, 1, &firstfsb,
+ dfops, &done);
+ ASSERT(done);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 8395f6e8cf7d..f97db7132564 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -97,6 +97,19 @@ struct xfs_extent_free_item
*/
#define XFS_BMAPI_ZERO 0x080
+/*
+ * Map the inode offset to the block given in ap->firstblock. Primarily
+ * used for reflink. The range must be in a hole, and this flag cannot be
+ * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork.
+ *
+ * For bunmapi, this flag unmaps the range without adjusting quota, reducing
+ * refcount, or freeing the blocks.
+ */
+#define XFS_BMAPI_REMAP 0x100
+
+/* Map something in the CoW fork. */
+#define XFS_BMAPI_COWFORK 0x200
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -105,12 +118,24 @@ struct xfs_extent_free_item
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
{ XFS_BMAPI_CONTIG, "CONTIG" }, \
{ XFS_BMAPI_CONVERT, "CONVERT" }, \
- { XFS_BMAPI_ZERO, "ZERO" }
+ { XFS_BMAPI_ZERO, "ZERO" }, \
+ { XFS_BMAPI_REMAP, "REMAP" }, \
+ { XFS_BMAPI_COWFORK, "COWFORK" }
static inline int xfs_bmapi_aflag(int w)
{
- return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
+ return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK :
+ (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0));
+}
+
+static inline int xfs_bmapi_whichfork(int bmapi_flags)
+{
+ if (bmapi_flags & XFS_BMAPI_COWFORK)
+ return XFS_COW_FORK;
+ else if (bmapi_flags & XFS_BMAPI_ATTRFORK)
+ return XFS_ATTR_FORK;
+ return XFS_DATA_FORK;
}
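
Annotation: the two helpers above are inverses over the fork constants, e.g.:

	/* Round-trip sketch: fork index -> bmapi flag -> fork index. */
	ASSERT(xfs_bmapi_whichfork(xfs_bmapi_aflag(XFS_COW_FORK)) == XFS_COW_FORK);
	ASSERT(xfs_bmapi_whichfork(xfs_bmapi_aflag(XFS_ATTR_FORK)) == XFS_ATTR_FORK);
	ASSERT(xfs_bmapi_whichfork(xfs_bmapi_aflag(XFS_DATA_FORK)) == XFS_DATA_FORK);
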
/*
@@ -131,13 +156,15 @@ static inline int xfs_bmapi_aflag(int w)
#define BMAP_LEFT_VALID (1 << 6)
#define BMAP_RIGHT_VALID (1 << 7)
#define BMAP_ATTRFORK (1 << 8)
+#define BMAP_COWFORK (1 << 9)
#define XFS_BMAP_EXT_FLAGS \
{ BMAP_LEFT_CONTIG, "LC" }, \
{ BMAP_RIGHT_CONTIG, "RC" }, \
{ BMAP_LEFT_FILLING, "LF" }, \
{ BMAP_RIGHT_FILLING, "RF" }, \
- { BMAP_ATTRFORK, "ATTR" }
+ { BMAP_ATTRFORK, "ATTR" }, \
+ { BMAP_COWFORK, "COW" }
/*
@@ -186,10 +213,15 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fsblock_t *firstblock, xfs_extlen_t total,
struct xfs_bmbt_irec *mval, int *nmap,
struct xfs_defer_ops *dfops);
+int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags,
+ xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
+ struct xfs_defer_ops *dfops);
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, int flags,
xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
struct xfs_defer_ops *dfops, int *done);
+int xfs_bunmapi_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *del);
int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
xfs_extnum_t num);
uint xfs_default_attroffset(struct xfs_inode *ip);
@@ -203,8 +235,31 @@ struct xfs_bmbt_rec_host *
xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno,
int fork, int *eofp, xfs_extnum_t *lastxp,
struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp);
-int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, xfs_fileoff_t aoff,
- xfs_filblks_t len, struct xfs_bmbt_irec *got,
- struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof);
+int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
+ xfs_fileoff_t aoff, xfs_filblks_t len,
+ struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev,
+ xfs_extnum_t *lastx, int eof);
+
+enum xfs_bmap_intent_type {
+ XFS_BMAP_MAP = 1,
+ XFS_BMAP_UNMAP,
+};
+
+struct xfs_bmap_intent {
+ struct list_head bi_list;
+ enum xfs_bmap_intent_type bi_type;
+ struct xfs_inode *bi_owner;
+ int bi_whichfork;
+ struct xfs_bmbt_irec bi_bmap;
+};
+
+int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, enum xfs_bmap_intent_type type,
+ int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount, xfs_exntst_t state);
+int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
+int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index cd85274e810c..8007d2ba9aef 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -453,6 +453,7 @@ xfs_bmbt_alloc_block(
if (args.fsbno == NULLFSBLOCK) {
args.fsbno = be64_to_cpu(start->l);
+try_another_ag:
args.type = XFS_ALLOCTYPE_START_BNO;
/*
* Make sure there is sufficient room left in the AG to
@@ -482,6 +483,22 @@ xfs_bmbt_alloc_block(
if (error)
goto error0;
+ /*
+ * During a CoW operation, the allocation and bmbt updates occur in
+ * different transactions. The mapping code tries to put new bmbt
+ * blocks near extents being mapped, but the only way to guarantee this
+ * is if the alloc and the mapping happen in a single transaction that
+ * has a block reservation. That isn't the case here, so if we run out
+ * of space we'll try again with another AG.
+ */
+ if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
+ args.fsbno == NULLFSBLOCK &&
+ args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+ cur->bc_private.b.dfops->dop_low = true;
+ args.fsbno = cur->bc_private.b.firstblock;
+ goto try_another_ag;
+ }
+
if (args.fsbno == NULLFSBLOCK && args.minleft) {
/*
* Could not find an AG with enough free space to satisfy
@@ -777,6 +794,7 @@ xfs_bmbt_init_cursor(
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_btree_cur *cur;
+ ASSERT(whichfork != XFS_COW_FORK);
cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index aa1752f918b8..5c8e6f2ce44f 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -45,9 +45,10 @@ kmem_zone_t *xfs_btree_cur_zone;
*/
static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
- XFS_FIBT_MAGIC },
+ XFS_FIBT_MAGIC, 0 },
{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
- XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
+ XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
+ XFS_REFC_CRC_MAGIC }
};
#define xfs_btree_magic(cur) \
xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
@@ -1216,6 +1217,9 @@ xfs_btree_set_refs(
case XFS_BTNUM_RMAP:
xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
break;
+ case XFS_BTNUM_REFC:
+ xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
+ break;
default:
ASSERT(0);
}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 3f8556a5c2ad..c2b01d1c79ee 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -49,6 +49,7 @@ union xfs_btree_key {
struct xfs_inobt_key inobt;
struct xfs_rmap_key rmap;
struct xfs_rmap_key __rmap_bigkey[2];
+ struct xfs_refcount_key refc;
};
union xfs_btree_rec {
@@ -57,6 +58,7 @@ union xfs_btree_rec {
struct xfs_alloc_rec alloc;
struct xfs_inobt_rec inobt;
struct xfs_rmap_rec rmap;
+ struct xfs_refcount_rec refc;
};
/*
@@ -72,6 +74,7 @@ union xfs_btree_rec {
#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi)
+#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi)
/*
* For logging record fields.
@@ -105,6 +108,7 @@ do { \
case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \
+ case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -127,6 +131,8 @@ do { \
__XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
case XFS_BTNUM_RMAP: \
__XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \
+ case XFS_BTNUM_REFC: \
+ __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -217,6 +223,15 @@ union xfs_btree_irec {
struct xfs_bmbt_irec b;
struct xfs_inobt_rec_incore i;
struct xfs_rmap_irec r;
+ struct xfs_refcount_irec rc;
+};
+
+/* Per-AG btree private information. */
+union xfs_btree_cur_private {
+ struct {
+ unsigned long nr_ops; /* # record updates */
+ int shape_changes; /* # of extent splits */
+ } refc;
};
/*
@@ -243,6 +258,7 @@ typedef struct xfs_btree_cur
struct xfs_buf *agbp; /* agf/agi buffer pointer */
struct xfs_defer_ops *dfops; /* deferred updates */
xfs_agnumber_t agno; /* ag number */
+ union xfs_btree_cur_private priv;
} a;
struct { /* needed for BMAP */
struct xfs_inode *ip; /* pointer to our inode */
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index e96533d178cf..f6e93ef0bffe 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -51,6 +51,8 @@ struct xfs_defer_pending {
* find all the space it needs.
*/
enum xfs_defer_ops_type {
+ XFS_DEFER_OPS_TYPE_BMAP,
+ XFS_DEFER_OPS_TYPE_REFCOUNT,
XFS_DEFER_OPS_TYPE_RMAP,
XFS_DEFER_OPS_TYPE_FREE,
XFS_DEFER_OPS_TYPE_MAX,
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 270fb5cf4fa1..f6547fc5e016 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -456,9 +456,11 @@ xfs_sb_has_compat_feature(
#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
+#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */
#define XFS_SB_FEAT_RO_COMPAT_ALL \
(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
- XFS_SB_FEAT_RO_COMPAT_RMAPBT)
+ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
+ XFS_SB_FEAT_RO_COMPAT_REFLINK)
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
static inline bool
xfs_sb_has_ro_compat_feature(
@@ -546,6 +548,12 @@ static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp)
(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT);
}
+static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK);
+}
+
/*
* end of superblock version macros
*/
@@ -641,14 +649,17 @@ typedef struct xfs_agf {
uuid_t agf_uuid; /* uuid of filesystem */
__be32 agf_rmap_blocks; /* rmapbt blocks used */
- __be32 agf_padding; /* padding */
+ __be32 agf_refcount_blocks; /* refcountbt blocks used */
+
+ __be32 agf_refcount_root; /* refcount tree root block */
+ __be32 agf_refcount_level; /* refcount btree levels */
/*
* reserve some contiguous space for future logged fields before we add
* the unlogged fields. This makes the range logging via flags and
* structure offsets much simpler.
*/
- __be64 agf_spare64[15];
+ __be64 agf_spare64[14];
/* unlogged fields, written during buffer writeback. */
__be64 agf_lsn; /* last write sequence */
@@ -674,8 +685,11 @@ typedef struct xfs_agf {
#define XFS_AGF_BTREEBLKS 0x00000800
#define XFS_AGF_UUID 0x00001000
#define XFS_AGF_RMAP_BLOCKS 0x00002000
-#define XFS_AGF_SPARE64 0x00004000
-#define XFS_AGF_NUM_BITS 15
+#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000
+#define XFS_AGF_REFCOUNT_ROOT 0x00008000
+#define XFS_AGF_REFCOUNT_LEVEL 0x00010000
+#define XFS_AGF_SPARE64 0x00020000
+#define XFS_AGF_NUM_BITS 18
#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
#define XFS_AGF_FLAGS \
@@ -693,6 +707,9 @@ typedef struct xfs_agf {
{ XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
{ XFS_AGF_UUID, "UUID" }, \
{ XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \
+ { XFS_AGF_REFCOUNT_BLOCKS, "REFCOUNT_BLOCKS" }, \
+ { XFS_AGF_REFCOUNT_ROOT, "REFCOUNT_ROOT" }, \
+ { XFS_AGF_REFCOUNT_LEVEL, "REFCOUNT_LEVEL" }, \
{ XFS_AGF_SPARE64, "SPARE64" }
/* disk block (xfs_daddr_t) in the AG */
@@ -885,7 +902,8 @@ typedef struct xfs_dinode {
__be64 di_changecount; /* number of attribute changes */
__be64 di_lsn; /* flush sequence */
__be64 di_flags2; /* more random flags */
- __u8 di_pad2[16]; /* more padding for future expansion */
+ __be32 di_cowextsize; /* basic cow extent size for file */
+ __u8 di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
xfs_timestamp_t di_crtime; /* time created */
@@ -1041,9 +1059,14 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
* 16 bits of the XFS_XFLAG_s range.
*/
#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
+#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
+#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
+#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
+#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
-#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX)
+#define XFS_DIFLAG2_ANY \
+ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)
/*
* Inode number format:
@@ -1353,7 +1376,9 @@ struct xfs_owner_info {
#define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */
#define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */
#define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */
-#define XFS_RMAP_OWN_MIN (-8ULL) /* guard */
+#define XFS_RMAP_OWN_REFC (-8ULL) /* refcount tree */
+#define XFS_RMAP_OWN_COW (-9ULL) /* cow allocations */
+#define XFS_RMAP_OWN_MIN (-10ULL) /* guard */
#define XFS_RMAP_NON_INODE_OWNER(owner) (!!((owner) & (1ULL << 63)))
@@ -1434,6 +1459,62 @@ typedef __be32 xfs_rmap_ptr_t;
XFS_IBT_BLOCK(mp) + 1)
/*
+ * Reference Count Btree format definitions
+ *
+ */
+#define XFS_REFC_CRC_MAGIC 0x52334643 /* 'R3FC' */
+
+unsigned int xfs_refc_block(struct xfs_mount *mp);
+
+/*
+ * Data record/key structure
+ *
+ * Each record associates a range of physical blocks (starting at
+ * rc_startblock and ending rc_blockcount blocks later) with a reference
+ * count (rc_refcount). Extents that are being used to stage a copy on
+ * write (CoW) operation are recorded in the refcount btree with a
+ * refcount of 1. All other records must have a refcount > 1 and must
+ * track an extent mapped only by file data forks.
+ *
+ * Extents with a single owner (attributes, metadata, non-shared file
+ * data) are not tracked here. Free space is also not tracked here.
+ * This is consistent with pre-reflink XFS.
+ */
+
+/*
+ * Extents that are being used to stage a copy on write are stored
+ * in the refcount btree with a refcount of 1 and the upper bit set
+ * on the startblock. This speeds up mount time deletion of stale
+ * staging extents because they're all at the right side of the tree.
+ */
+#define XFS_REFC_COW_START ((xfs_agblock_t)(1U << 31))
+#define REFCNTBT_COWFLAG_BITLEN 1
+#define REFCNTBT_AGBLOCK_BITLEN 31
+
+struct xfs_refcount_rec {
+ __be32 rc_startblock; /* starting block number */
+ __be32 rc_blockcount; /* count of blocks */
+ __be32 rc_refcount; /* number of inodes linked here */
+};
+
+struct xfs_refcount_key {
+ __be32 rc_startblock; /* starting block number */
+};
+
+struct xfs_refcount_irec {
+ xfs_agblock_t rc_startblock; /* starting block number */
+ xfs_extlen_t rc_blockcount; /* count of free blocks */
+ xfs_nlink_t rc_refcount; /* number of inodes linked here */
+};
+
+#define MAXREFCOUNT ((xfs_nlink_t)~0U)
+#define MAXREFCEXTLEN ((xfs_extlen_t)~0U)
+
+/* btree pointer type */
+typedef __be32 xfs_refcount_ptr_t;
+
+
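
Annotation: since staging extents keep their refcount at 1, the high bit of rc_startblock is what distinguishes them on disk. A standalone sketch of the encode/decode, mirroring XFS_REFC_COW_START:

	#include <stdbool.h>
	#include <stdint.h>

	#define REFC_COW_START	(1U << 31)	/* mirrors XFS_REFC_COW_START */

	/* CoW staging records sort to the right edge of the btree, which keeps
	 * the mount-time scan for stale staging extents cheap. */
	static bool refc_is_cow_staging(uint32_t rc_startblock)
	{
		return (rc_startblock & REFC_COW_START) != 0;
	}

	static uint32_t refc_agbno(uint32_t rc_startblock)
	{
		return rc_startblock & ~REFC_COW_START;
	}
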
+/*
* BMAP Btree format definitions
*
* This includes both the root block definition that sits inside an inode fork
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 79455058b752..b72dc821d78b 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -81,14 +81,16 @@ struct getbmapx {
#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */
+#define BMV_IF_COWFORK 0x20 /* return CoW fork rather than data */
#define BMV_IF_VALID \
(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \
- BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
+ BMV_IF_DELALLOC|BMV_IF_NO_HOLES|BMV_IF_COWFORK)
/* bmv_oflags values - returned for each non-header segment */
#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
#define BMV_OF_LAST 0x4 /* segment is the last in the file */
+#define BMV_OF_SHARED 0x8 /* segment shared with another file */
/*
* Structure for XFS_IOC_FSSETDM.
@@ -206,7 +208,8 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */
-#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* Reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_REFLINK 0x100000 /* files can share blocks */
/*
* Minimum and maximum sizes need for growth checks.
@@ -275,7 +278,8 @@ typedef struct xfs_bstat {
#define bs_projid bs_projid_lo /* (previously just bs_projid) */
__u16 bs_forkoff; /* inode fork offset in bytes */
__u16 bs_projid_hi; /* higher part of project id */
- unsigned char bs_pad[10]; /* pad space, unused */
+ unsigned char bs_pad[6]; /* pad space, unused */
+ __u32 bs_cowextsize; /* cow extent size */
__u32 bs_dmevmask; /* DMIG event mask */
__u16 bs_dmstate; /* DMIG state info */
__u16 bs_aextents; /* attribute number of extents */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 4b9769e23c83..8de9a3a29589 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -256,6 +256,7 @@ xfs_inode_from_disk(
to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
to->di_flags2 = be64_to_cpu(from->di_flags2);
+ to->di_cowextsize = be32_to_cpu(from->di_cowextsize);
}
}
@@ -305,7 +306,7 @@ xfs_inode_to_disk(
to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
to->di_flags2 = cpu_to_be64(from->di_flags2);
-
+ to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(ip->i_ino);
to->di_lsn = cpu_to_be64(lsn);
memset(to->di_pad2, 0, sizeof(to->di_pad2));
@@ -357,6 +358,7 @@ xfs_log_dinode_to_disk(
to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
to->di_flags2 = cpu_to_be64(from->di_flags2);
+ to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
to->di_lsn = cpu_to_be64(from->di_lsn);
memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
@@ -373,6 +375,9 @@ xfs_dinode_verify(
struct xfs_inode *ip,
struct xfs_dinode *dip)
{
+ uint16_t flags;
+ uint64_t flags2;
+
if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
return false;
@@ -389,6 +394,23 @@ xfs_dinode_verify(
return false;
if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
return false;
+
+ flags = be16_to_cpu(dip->di_flags);
+ flags2 = be64_to_cpu(dip->di_flags2);
+
+ /* don't allow reflink/cowextsize if we don't have reflink */
+ if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) &&
+ !xfs_sb_version_hasreflink(&mp->m_sb))
+ return false;
+
+ /* don't let reflink and realtime mix */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME))
+ return false;
+
+ /* don't let reflink and dax mix */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX))
+ return false;
+
return true;
}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 7c4dd321b215..62d9d4681c8c 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -47,6 +47,7 @@ struct xfs_icdinode {
__uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
__uint64_t di_flags2; /* more random flags */
+ __uint32_t di_cowextsize; /* basic cow extent size for file */
xfs_ictimestamp_t di_crtime; /* time created */
};
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index bbcc8c7a44b3..5dd56d3dbb3a 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -121,6 +121,26 @@ xfs_iformat_fork(
return -EFSCORRUPTED;
}
+ if (unlikely(xfs_is_reflink_inode(ip) &&
+ (VFS_I(ip)->i_mode & S_IFMT) != S_IFREG)) {
+ xfs_warn(ip->i_mount,
+ "corrupt dinode %llu, wrong file type for reflink.",
+ ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_iformat(reflink)",
+ XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+ return -EFSCORRUPTED;
+ }
+
+ if (unlikely(xfs_is_reflink_inode(ip) &&
+ (ip->i_d.di_flags & XFS_DIFLAG_REALTIME))) {
+ xfs_warn(ip->i_mount,
+ "corrupt dinode %llu, has reflink+realtime flag set.",
+ ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_iformat(reflink)",
+ XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+ return -EFSCORRUPTED;
+ }
+
switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFIFO:
case S_IFCHR:
@@ -186,9 +206,14 @@ xfs_iformat_fork(
XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
return -EFSCORRUPTED;
}
- if (error) {
+ if (error)
return error;
+
+ if (xfs_is_reflink_inode(ip)) {
+ ASSERT(ip->i_cowfp == NULL);
+ xfs_ifork_init_cow(ip);
}
+
if (!XFS_DFORK_Q(dip))
return 0;
@@ -208,7 +233,8 @@ xfs_iformat_fork(
XFS_CORRUPTION_ERROR("xfs_iformat(8)",
XFS_ERRLEVEL_LOW,
ip->i_mount, dip);
- return -EFSCORRUPTED;
+ error = -EFSCORRUPTED;
+ break;
}
error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
@@ -226,6 +252,9 @@ xfs_iformat_fork(
if (error) {
kmem_zone_free(xfs_ifork_zone, ip->i_afp);
ip->i_afp = NULL;
+ if (ip->i_cowfp)
+ kmem_zone_free(xfs_ifork_zone, ip->i_cowfp);
+ ip->i_cowfp = NULL;
xfs_idestroy_fork(ip, XFS_DATA_FORK);
}
return error;
@@ -740,6 +769,9 @@ xfs_idestroy_fork(
if (whichfork == XFS_ATTR_FORK) {
kmem_zone_free(xfs_ifork_zone, ip->i_afp);
ip->i_afp = NULL;
+ } else if (whichfork == XFS_COW_FORK) {
+ kmem_zone_free(xfs_ifork_zone, ip->i_cowfp);
+ ip->i_cowfp = NULL;
}
}
@@ -927,6 +959,19 @@ xfs_iext_get_ext(
}
}
+/* Convert bmap state flags to an inode fork. */
+struct xfs_ifork *
+xfs_iext_state_to_fork(
+ struct xfs_inode *ip,
+ int state)
+{
+ if (state & BMAP_COWFORK)
+ return ip->i_cowfp;
+ else if (state & BMAP_ATTRFORK)
+ return ip->i_afp;
+ return &ip->i_df;
+}
+
/*
* Insert new item(s) into the extent records for incore inode
* fork 'ifp'. 'count' new items are inserted at index 'idx'.
@@ -939,7 +984,7 @@ xfs_iext_insert(
xfs_bmbt_irec_t *new, /* items to insert */
int state) /* type of extent conversion */
{
- xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+ xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state);
xfs_extnum_t i; /* extent record index */
trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
@@ -1189,7 +1234,7 @@ xfs_iext_remove(
int ext_diff, /* number of extents to remove */
int state) /* type of extent conversion */
{
- xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+ xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state);
xfs_extnum_t nextents; /* number of extents in file */
int new_size; /* size of extents after removal */
@@ -1934,3 +1979,20 @@ xfs_iext_irec_update_extoffs(
ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
}
}
+
+/*
+ * Initialize an inode's copy-on-write fork.
+ */
+void
+xfs_ifork_init_cow(
+ struct xfs_inode *ip)
+{
+ if (ip->i_cowfp)
+ return;
+
+ ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
+ KM_SLEEP | KM_NOFS);
+ ip->i_cowfp->if_flags = XFS_IFEXTENTS;
+ ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
+ ip->i_cnextents = 0;
+}
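
Annotation: because the CoW fork is created lazily, callers are expected to invoke this before staging anything there. A hedged usage sketch against the xfs_bmapi_reserve_delalloc() signature added earlier in this diff; offset_fsb, len_fsb, got, prev, lastx, and eof are hypothetical locals:

	/* Make sure the CoW fork exists, then stage a delayed allocation in it. */
	xfs_ifork_init_cow(ip);
	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, offset_fsb, len_fsb,
					   &got, &prev, &lastx, eof);
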
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index f95e072ae646..c9476f50e32d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -92,7 +92,9 @@ typedef struct xfs_ifork {
#define XFS_IFORK_PTR(ip,w) \
((w) == XFS_DATA_FORK ? \
&(ip)->i_df : \
- (ip)->i_afp)
+ ((w) == XFS_ATTR_FORK ? \
+ (ip)->i_afp : \
+ (ip)->i_cowfp))
#define XFS_IFORK_DSIZE(ip) \
(XFS_IFORK_Q(ip) ? \
XFS_IFORK_BOFF(ip) : \
@@ -105,26 +107,38 @@ typedef struct xfs_ifork {
#define XFS_IFORK_SIZE(ip,w) \
((w) == XFS_DATA_FORK ? \
XFS_IFORK_DSIZE(ip) : \
- XFS_IFORK_ASIZE(ip))
+ ((w) == XFS_ATTR_FORK ? \
+ XFS_IFORK_ASIZE(ip) : \
+ 0))
#define XFS_IFORK_FORMAT(ip,w) \
((w) == XFS_DATA_FORK ? \
(ip)->i_d.di_format : \
- (ip)->i_d.di_aformat)
+ ((w) == XFS_ATTR_FORK ? \
+ (ip)->i_d.di_aformat : \
+ (ip)->i_cformat))
#define XFS_IFORK_FMT_SET(ip,w,n) \
((w) == XFS_DATA_FORK ? \
((ip)->i_d.di_format = (n)) : \
- ((ip)->i_d.di_aformat = (n)))
+ ((w) == XFS_ATTR_FORK ? \
+ ((ip)->i_d.di_aformat = (n)) : \
+ ((ip)->i_cformat = (n))))
#define XFS_IFORK_NEXTENTS(ip,w) \
((w) == XFS_DATA_FORK ? \
(ip)->i_d.di_nextents : \
- (ip)->i_d.di_anextents)
+ ((w) == XFS_ATTR_FORK ? \
+ (ip)->i_d.di_anextents : \
+ (ip)->i_cnextents))
#define XFS_IFORK_NEXT_SET(ip,w,n) \
((w) == XFS_DATA_FORK ? \
((ip)->i_d.di_nextents = (n)) : \
- ((ip)->i_d.di_anextents = (n)))
+ ((w) == XFS_ATTR_FORK ? \
+ ((ip)->i_d.di_anextents = (n)) : \
+ ((ip)->i_cnextents = (n))))
#define XFS_IFORK_MAXEXT(ip, w) \
(XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
+struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
+
int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
struct xfs_inode_log_item *, int);
@@ -169,4 +183,6 @@ void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
extern struct kmem_zone *xfs_ifork_zone;
+extern void xfs_ifork_init_cow(struct xfs_inode *ip);
+
#endif /* __XFS_INODE_FORK_H__ */
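
With the third branch added, each accessor macro now fans out across all three forks. A hypothetical illustration, assuming XFS_COW_FORK selects the new fork:

	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	xfs_extnum_t		nextents = XFS_IFORK_NEXTENTS(ip, XFS_COW_FORK);

	/* These expand to ip->i_cowfp and ip->i_cnextents respectively. */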
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index fc5eef85d61e..083cdd6d6c28 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -112,7 +112,11 @@ static inline uint xlog_get_cycle(char *ptr)
#define XLOG_REG_TYPE_ICREATE 20
#define XLOG_REG_TYPE_RUI_FORMAT 21
#define XLOG_REG_TYPE_RUD_FORMAT 22
-#define XLOG_REG_TYPE_MAX 22
+#define XLOG_REG_TYPE_CUI_FORMAT 23
+#define XLOG_REG_TYPE_CUD_FORMAT 24
+#define XLOG_REG_TYPE_BUI_FORMAT 25
+#define XLOG_REG_TYPE_BUD_FORMAT 26
+#define XLOG_REG_TYPE_MAX 26
/*
* Flags to log operation header
@@ -231,6 +235,10 @@ typedef struct xfs_trans_header {
#define XFS_LI_ICREATE 0x123f
#define XFS_LI_RUI 0x1240 /* rmap update intent */
#define XFS_LI_RUD 0x1241
+#define XFS_LI_CUI 0x1242 /* refcount update intent */
+#define XFS_LI_CUD 0x1243
+#define XFS_LI_BUI 0x1244 /* bmbt update intent */
+#define XFS_LI_BUD 0x1245
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -242,7 +250,11 @@ typedef struct xfs_trans_header {
{ XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \
{ XFS_LI_ICREATE, "XFS_LI_ICREATE" }, \
{ XFS_LI_RUI, "XFS_LI_RUI" }, \
- { XFS_LI_RUD, "XFS_LI_RUD" }
+ { XFS_LI_RUD, "XFS_LI_RUD" }, \
+ { XFS_LI_CUI, "XFS_LI_CUI" }, \
+ { XFS_LI_CUD, "XFS_LI_CUD" }, \
+ { XFS_LI_BUI, "XFS_LI_BUI" }, \
+ { XFS_LI_BUD, "XFS_LI_BUD" }
/*
* Inode Log Item Format definitions.
@@ -411,7 +423,8 @@ struct xfs_log_dinode {
__uint64_t di_changecount; /* number of attribute changes */
xfs_lsn_t di_lsn; /* flush sequence */
__uint64_t di_flags2; /* more random flags */
- __uint8_t di_pad2[16]; /* more padding for future expansion */
+ __uint32_t di_cowextsize; /* basic cow extent size for file */
+ __uint8_t di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
xfs_ictimestamp_t di_crtime; /* time created */
@@ -622,8 +635,11 @@ struct xfs_map_extent {
/* rmap me_flags: upper bits are flags, lower byte is type code */
#define XFS_RMAP_EXTENT_MAP 1
+#define XFS_RMAP_EXTENT_MAP_SHARED 2
#define XFS_RMAP_EXTENT_UNMAP 3
+#define XFS_RMAP_EXTENT_UNMAP_SHARED 4
#define XFS_RMAP_EXTENT_CONVERT 5
+#define XFS_RMAP_EXTENT_CONVERT_SHARED 6
#define XFS_RMAP_EXTENT_ALLOC 7
#define XFS_RMAP_EXTENT_FREE 8
#define XFS_RMAP_EXTENT_TYPE_MASK 0xFF
@@ -671,6 +687,102 @@ struct xfs_rud_log_format {
};
/*
+ * CUI/CUD (refcount update) log format definitions
+ */
+struct xfs_phys_extent {
+ __uint64_t pe_startblock;
+ __uint32_t pe_len;
+ __uint32_t pe_flags;
+};
+
+/* refcount pe_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_refcount_intent_type. */
+#define XFS_REFCOUNT_EXTENT_TYPE_MASK 0xFF
+
+#define XFS_REFCOUNT_EXTENT_FLAGS (XFS_REFCOUNT_EXTENT_TYPE_MASK)
+
+/*
+ * This is the structure used to lay out a cui log item in the
+ * log. The cui_extents field is a variable size array whose
+ * size is given by cui_nextents.
+ */
+struct xfs_cui_log_format {
+ __uint16_t cui_type; /* cui log item type */
+ __uint16_t cui_size; /* size of this item */
+	__uint32_t		cui_nextents;	/* # extents to update */
+ __uint64_t cui_id; /* cui identifier */
+ struct xfs_phys_extent cui_extents[]; /* array of extents */
+};
+
+static inline size_t
+xfs_cui_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_cui_log_format) +
+ nr * sizeof(struct xfs_phys_extent);
+}
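
As a worked example of the size computation (assuming the natural packing this layout is built for): the fixed header fields are 2 + 2 + 4 + 8 = 16 bytes and each struct xfs_phys_extent is 8 + 4 + 4 = 16 bytes, so:

	/* xfs_cui_log_format_sizeof(4) == 16 + 4 * 16 == 80 bytes */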
+
+/*
+ * This is the structure used to lay out a cud log item in the
+ * log.  The cud_cui_id field ties this completion item to the
+ * corresponding cui intent item.
+ */
+struct xfs_cud_log_format {
+ __uint16_t cud_type; /* cud log item type */
+ __uint16_t cud_size; /* size of this item */
+ __uint32_t __pad;
+ __uint64_t cud_cui_id; /* id of corresponding cui */
+};
+
+/*
+ * BUI/BUD (inode block mapping) log format definitions
+ */
+
+/* bmbt me_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_bmap_intent_type. */
+#define XFS_BMAP_EXTENT_TYPE_MASK 0xFF
+
+#define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31)
+#define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30)
+
+#define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \
+ XFS_BMAP_EXTENT_ATTR_FORK | \
+ XFS_BMAP_EXTENT_UNWRITTEN)
+
+/*
+ * This is the structure used to lay out a bui log item in the
+ * log. The bui_extents field is a variable size array whose
+ * size is given by bui_nextents.
+ */
+struct xfs_bui_log_format {
+ __uint16_t bui_type; /* bui log item type */
+ __uint16_t bui_size; /* size of this item */
+	__uint32_t		bui_nextents;	/* # extents to map */
+ __uint64_t bui_id; /* bui identifier */
+ struct xfs_map_extent bui_extents[]; /* array of extents to bmap */
+};
+
+static inline size_t
+xfs_bui_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_bui_log_format) +
+ nr * sizeof(struct xfs_map_extent);
+}
+
+/*
+ * This is the structure used to lay out a bud log item in the
+ * log.  The bud_bui_id field ties this completion item to the
+ * corresponding bui intent item.
+ */
+struct xfs_bud_log_format {
+ __uint16_t bud_type; /* bud log item type */
+ __uint16_t bud_size; /* size of this item */
+ __uint32_t __pad;
+ __uint64_t bud_bui_id; /* id of corresponding bui */
+};
+
+/*
* Dquot Log format definitions.
*
* The first two fields must be the type and size fitting into
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
new file mode 100644
index 000000000000..b177ef33cd4c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -0,0 +1,1698 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_refcount.h"
+#include "xfs_rmap.h"
+
+/* Allowable refcount adjustment amounts. */
+enum xfs_refc_adjust_op {
+ XFS_REFCOUNT_ADJUST_INCREASE = 1,
+ XFS_REFCOUNT_ADJUST_DECREASE = -1,
+ XFS_REFCOUNT_ADJUST_COW_ALLOC = 0,
+ XFS_REFCOUNT_ADJUST_COW_FREE = -1,
+};
+
+STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno, xfs_extlen_t aglen,
+ struct xfs_defer_ops *dfops);
+STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno, xfs_extlen_t aglen,
+ struct xfs_defer_ops *dfops);
+
+/*
+ * Look up the first record less than or equal to bno in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_le(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ int *stat)
+{
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ XFS_LOOKUP_LE);
+ cur->bc_rec.rc.rc_startblock = bno;
+ cur->bc_rec.rc.rc_blockcount = 0;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Look up the first record greater than or equal to bno in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_ge(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ int *stat)
+{
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ XFS_LOOKUP_GE);
+ cur->bc_rec.rc.rc_startblock = bno;
+ cur->bc_rec.rc.rc_blockcount = 0;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/* Convert on-disk record to in-core format. */
+static inline void
+xfs_refcount_btrec_to_irec(
+ union xfs_btree_rec *rec,
+ struct xfs_refcount_irec *irec)
+{
+ irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock);
+ irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount);
+ irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_refcount_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec,
+ int *stat)
+{
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (!error && *stat == 1) {
+ xfs_refcount_btrec_to_irec(rec, irec);
+ trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno,
+ irec);
+ }
+ return error;
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len, refcount].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_refcount_update(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec)
+{
+ union xfs_btree_rec rec;
+ int error;
+
+ trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec);
+ rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock);
+ rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount);
+ rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount);
+ error = xfs_btree_update(cur, &rec);
+ if (error)
+ trace_xfs_refcount_update_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Insert a new record with the value given by [bno, len, refcount]
+ * at the cursor's current position.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_refcount_insert(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec,
+ int *i)
+{
+ int error;
+
+ trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec);
+ cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
+ cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
+ cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
+	error = xfs_btree_insert(cur, i);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+out_error:
+ if (error)
+ trace_xfs_refcount_insert_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Remove the record referred to by cur, then set the pointer to the spot
+ * where the record could be re-inserted, in case we want to increment or
+ * decrement the cursor.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_refcount_delete(
+ struct xfs_btree_cur *cur,
+ int *i)
+{
+ struct xfs_refcount_irec irec;
+ int found_rec;
+ int error;
+
+ error = xfs_refcount_get_rec(cur, &irec, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec);
+	error = xfs_btree_delete(cur, i);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+ error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec);
+out_error:
+ if (error)
+ trace_xfs_refcount_delete_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Adjusting the Reference Count
+ *
+ * As stated elsewhere, the reference count btree (refcbt) stores
+ * reference counts greater than 1 for extents of physical blocks.  In this
+ * operation, we're either raising or lowering the reference count of
+ * some subrange stored in the tree:
+ *
+ * <------ adjustment range ------>
+ * ----+ +---+-----+ +--+--------+---------
+ * 2 | | 3 | 4 | |17| 55 | 10
+ * ----+ +---+-----+ +--+--------+---------
+ * X axis is the physical block number;
+ * reference counts are the numbers inside the rectangles
+ *
+ * The first thing we need to do is to ensure that there are no
+ * refcount extents crossing either boundary of the range to be
+ * adjusted. For any extent that does cross a boundary, split it into
+ * two extents so that we can increment the refcount of one of the
+ * pieces later:
+ *
+ * <------ adjustment range ------>
+ * ----+ +---+-----+ +--+--------+----+----
+ * 2 | | 3 | 2 | |17| 55 | 10 | 10
+ * ----+ +---+-----+ +--+--------+----+----
+ *
+ * For this next step, let's assume that all the physical blocks in
+ * the adjustment range are mapped to a file and are therefore in use
+ * at least once. Therefore, we can infer that any gap in the
+ * refcount tree within the adjustment range represents a physical
+ * extent with refcount == 1:
+ *
+ * <------ adjustment range ------>
+ * ----+---+---+-----+-+--+--------+----+----
+ * 2 |"1"| 3 | 2 |1|17| 55 | 10 | 10
+ * ----+---+---+-----+-+--+--------+----+----
+ * ^
+ *
+ * For each extent that falls within the adjustment range, figure out
+ * which extent is to the left or the right of that extent. Now we
+ * have a left, current, and right extent. If the new reference count
+ * of the center extent enables us to merge left, center, and right
+ * into one record covering all three, do so. If the center extent is
+ * at the left end of the range, abuts the left extent, and its new
+ * reference count matches the left extent's record, then merge them.
+ * If the center extent is at the right end of the range, abuts the
+ * right extent, and the reference counts match, merge those. In the
+ * example, we can left merge (assuming an increment operation):
+ *
+ * <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ * 2 | 3 | 2 |1|17| 55 | 10 | 10
+ * --------+---+-----+-+--+--------+----+----
+ * ^
+ *
+ * For all other extents within the range, adjust the reference count
+ * or delete it if the refcount falls below 2. If we were
+ * incrementing, the end result looks like this:
+ *
+ * <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ * 2 | 4 | 3 |2|18| 56 | 11 | 10
+ * --------+---+-----+-+--+--------+----+----
+ *
+ * The result of a decrement operation looks like this:
+ *
+ * <------ adjustment range ------>
+ * ----+ +---+ +--+--------+----+----
+ * 2 | | 2 | |16| 54 | 9 | 10
+ * ----+ +---+ +--+--------+----+----
+ * DDDD 111111DD
+ *
+ * The blocks marked "D" are freed; the blocks marked "1" are only
+ * referenced once and therefore the record is removed from the
+ * refcount btree.
+ */
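
The narrative above corresponds directly to the helpers that follow; a simplified sketch of how xfs_refcount_adjust() below strings them together (error handling elided):

	/* 1: split any extent crossing either boundary */
	error = xfs_refcount_split_extent(cur, agbno, &shape_changed);
	error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed);

	/* 2: merge with the neighbors of the range where possible */
	error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj,
			XFS_FIND_RCEXT_SHARED, &shape_changed);

	/* 3: adjust whatever is left inside [agbno, agbno + aglen) */
	error = xfs_refcount_adjust_extents(cur, &agbno, &aglen, adj,
			dfops, oinfo);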
+
+/* Next block after this extent. */
+static inline xfs_agblock_t
+xfs_refc_next(
+ struct xfs_refcount_irec *rc)
+{
+ return rc->rc_startblock + rc->rc_blockcount;
+}
+
+/*
+ * Split a refcount extent that crosses agbno.
+ */
+STATIC int
+xfs_refcount_split_extent(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ bool *shape_changed)
+{
+ struct xfs_refcount_irec rcext, tmp;
+ int found_rec;
+ int error;
+
+ *shape_changed = false;
+ error = xfs_refcount_lookup_le(cur, agbno, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec)
+ return 0;
+
+ error = xfs_refcount_get_rec(cur, &rcext, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno)
+ return 0;
+
+ *shape_changed = true;
+ trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno,
+ &rcext, agbno);
+
+ /* Establish the right extent. */
+ tmp = rcext;
+ tmp.rc_startblock = agbno;
+ tmp.rc_blockcount -= (agbno - rcext.rc_startblock);
+ error = xfs_refcount_update(cur, &tmp);
+ if (error)
+ goto out_error;
+
+ /* Insert the left extent. */
+ tmp = rcext;
+ tmp.rc_blockcount = agbno - rcext.rc_startblock;
+ error = xfs_refcount_insert(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ return error;
+
+out_error:
+ trace_xfs_refcount_split_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
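
A concrete example of the split: given the record [startblock 10, blockcount 20, refcount 3] and agbno == 18, the update above establishes the right piece and the insert adds the left piece:

	/* before:  [10, 20, refcount 3]                 */
	/* update:  [18, 12, refcount 3]  (right piece)  */
	/* insert:  [10,  8, refcount 3]  (left piece)   */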
+
+/*
+ * Merge the left, center, and right extents.
+ */
+STATIC int
+xfs_refcount_merge_center_extents(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *left,
+ struct xfs_refcount_irec *center,
+ struct xfs_refcount_irec *right,
+ unsigned long long extlen,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen)
+{
+ int error;
+ int found_rec;
+
+ trace_xfs_refcount_merge_center_extents(cur->bc_mp,
+ cur->bc_private.a.agno, left, center, right);
+
+ /*
+ * Make sure the center and right extents are not in the btree.
+ * If the center extent was synthesized, the first delete call
+ * removes the right extent and we skip the second deletion.
+ * If center and right were in the btree, then the first delete
+ * call removes the center and the second one removes the right
+ * extent.
+ */
+ error = xfs_refcount_lookup_ge(cur, center->rc_startblock,
+ &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ if (center->rc_refcount > 1) {
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+ }
+
+ /* Enlarge the left extent. */
+ error = xfs_refcount_lookup_le(cur, left->rc_startblock,
+ &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ left->rc_blockcount = extlen;
+ error = xfs_refcount_update(cur, left);
+ if (error)
+ goto out_error;
+
+ *aglen = 0;
+ return error;
+
+out_error:
+ trace_xfs_refcount_merge_center_extents_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Merge with the left extent.
+ */
+STATIC int
+xfs_refcount_merge_left_extent(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *left,
+ struct xfs_refcount_irec *cleft,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen)
+{
+ int error;
+ int found_rec;
+
+ trace_xfs_refcount_merge_left_extent(cur->bc_mp,
+ cur->bc_private.a.agno, left, cleft);
+
+ /* If the extent at agbno (cleft) wasn't synthesized, remove it. */
+ if (cleft->rc_refcount > 1) {
+ error = xfs_refcount_lookup_le(cur, cleft->rc_startblock,
+ &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+ }
+
+ /* Enlarge the left extent. */
+ error = xfs_refcount_lookup_le(cur, left->rc_startblock,
+ &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ left->rc_blockcount += cleft->rc_blockcount;
+ error = xfs_refcount_update(cur, left);
+ if (error)
+ goto out_error;
+
+ *agbno += cleft->rc_blockcount;
+ *aglen -= cleft->rc_blockcount;
+ return error;
+
+out_error:
+ trace_xfs_refcount_merge_left_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Merge with the right extent.
+ */
+STATIC int
+xfs_refcount_merge_right_extent(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *right,
+ struct xfs_refcount_irec *cright,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen)
+{
+ int error;
+ int found_rec;
+
+ trace_xfs_refcount_merge_right_extent(cur->bc_mp,
+ cur->bc_private.a.agno, cright, right);
+
+ /*
+ * If the extent ending at agbno+aglen (cright) wasn't synthesized,
+ * remove it.
+ */
+ if (cright->rc_refcount > 1) {
+ error = xfs_refcount_lookup_le(cur, cright->rc_startblock,
+ &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+ }
+
+ /* Enlarge the right extent. */
+ error = xfs_refcount_lookup_le(cur, right->rc_startblock,
+ &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ right->rc_startblock -= cright->rc_blockcount;
+ right->rc_blockcount += cright->rc_blockcount;
+ error = xfs_refcount_update(cur, right);
+ if (error)
+ goto out_error;
+
+ *aglen -= cright->rc_blockcount;
+ return error;
+
+out_error:
+ trace_xfs_refcount_merge_right_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+#define XFS_FIND_RCEXT_SHARED 1
+#define XFS_FIND_RCEXT_COW 2
+/*
+ * Find the left extent and the one after it (cleft). This function assumes
+ * that we've already split any extent crossing agbno.
+ */
+STATIC int
+xfs_refcount_find_left_extents(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *left,
+ struct xfs_refcount_irec *cleft,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ int flags)
+{
+ struct xfs_refcount_irec tmp;
+ int error;
+ int found_rec;
+
+ left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK;
+ error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec)
+ return 0;
+
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ if (xfs_refc_next(&tmp) != agbno)
+ return 0;
+ if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
+ return 0;
+ if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+ return 0;
+ /* We have a left extent; retrieve (or invent) the next right one */
+ *left = tmp;
+
+ error = xfs_btree_increment(cur, 0, &found_rec);
+ if (error)
+ goto out_error;
+ if (found_rec) {
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+
+		/* if tmp starts at the start of our range, just use that */
+ if (tmp.rc_startblock == agbno)
+ *cleft = tmp;
+ else {
+ /*
+ * There's a gap in the refcntbt at the start of the
+ * range we're interested in (refcount == 1) so
+ * synthesize the implied extent and pass it back.
+ * We assume here that the agbno/aglen range was
+ * passed in from a data fork extent mapping and
+ * therefore is allocated to exactly one owner.
+ */
+ cleft->rc_startblock = agbno;
+ cleft->rc_blockcount = min(aglen,
+ tmp.rc_startblock - agbno);
+ cleft->rc_refcount = 1;
+ }
+ } else {
+ /*
+ * No extents, so pretend that there's one covering the whole
+ * range.
+ */
+ cleft->rc_startblock = agbno;
+ cleft->rc_blockcount = aglen;
+ cleft->rc_refcount = 1;
+ }
+ trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno,
+ left, cleft, agbno);
+ return error;
+
+out_error:
+ trace_xfs_refcount_find_left_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Find the right extent and the one before it (cright). This function
+ * assumes that we've already split any extents crossing agbno + aglen.
+ */
+STATIC int
+xfs_refcount_find_right_extents(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *right,
+ struct xfs_refcount_irec *cright,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ int flags)
+{
+ struct xfs_refcount_irec tmp;
+ int error;
+ int found_rec;
+
+ right->rc_startblock = cright->rc_startblock = NULLAGBLOCK;
+ error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec)
+ return 0;
+
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ if (tmp.rc_startblock != agbno + aglen)
+ return 0;
+ if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
+ return 0;
+ if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+ return 0;
+ /* We have a right extent; retrieve (or invent) the next left one */
+ *right = tmp;
+
+ error = xfs_btree_decrement(cur, 0, &found_rec);
+ if (error)
+ goto out_error;
+ if (found_rec) {
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+
+ /* if tmp ends at the end of our range, just use that */
+ if (xfs_refc_next(&tmp) == agbno + aglen)
+ *cright = tmp;
+ else {
+ /*
+ * There's a gap in the refcntbt at the end of the
+ * range we're interested in (refcount == 1) so
+ * create the implied extent and pass it back.
+ * We assume here that the agbno/aglen range was
+ * passed in from a data fork extent mapping and
+ * therefore is allocated to exactly one owner.
+ */
+ cright->rc_startblock = max(agbno, xfs_refc_next(&tmp));
+ cright->rc_blockcount = right->rc_startblock -
+ cright->rc_startblock;
+ cright->rc_refcount = 1;
+ }
+ } else {
+ /*
+ * No extents, so pretend that there's one covering the whole
+ * range.
+ */
+ cright->rc_startblock = agbno;
+ cright->rc_blockcount = aglen;
+ cright->rc_refcount = 1;
+ }
+ trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno,
+ cright, right, agbno + aglen);
+ return error;
+
+out_error:
+ trace_xfs_refcount_find_right_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/* Is this extent valid? */
+static inline bool
+xfs_refc_valid(
+ struct xfs_refcount_irec *rc)
+{
+ return rc->rc_startblock != NULLAGBLOCK;
+}
+
+/*
+ * Try to merge with any extents on the boundaries of the adjustment range.
+ */
+STATIC int
+xfs_refcount_merge_extents(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen,
+ enum xfs_refc_adjust_op adjust,
+ int flags,
+ bool *shape_changed)
+{
+ struct xfs_refcount_irec left = {0}, cleft = {0};
+ struct xfs_refcount_irec cright = {0}, right = {0};
+ int error;
+ unsigned long long ulen;
+ bool cequal;
+
+ *shape_changed = false;
+ /*
+ * Find the extent just below agbno [left], just above agbno [cleft],
+ * just below (agbno + aglen) [cright], and just above (agbno + aglen)
+ * [right].
+ */
+ error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno,
+ *aglen, flags);
+ if (error)
+ return error;
+ error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno,
+ *aglen, flags);
+ if (error)
+ return error;
+
+ /* No left or right extent to merge; exit. */
+ if (!xfs_refc_valid(&left) && !xfs_refc_valid(&right))
+ return 0;
+
+ cequal = (cleft.rc_startblock == cright.rc_startblock) &&
+ (cleft.rc_blockcount == cright.rc_blockcount);
+
+ /* Try to merge left, cleft, and right. cleft must == cright. */
+ ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount +
+ right.rc_blockcount;
+ if (xfs_refc_valid(&left) && xfs_refc_valid(&right) &&
+ xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal &&
+ left.rc_refcount == cleft.rc_refcount + adjust &&
+ right.rc_refcount == cleft.rc_refcount + adjust &&
+ ulen < MAXREFCEXTLEN) {
+ *shape_changed = true;
+ return xfs_refcount_merge_center_extents(cur, &left, &cleft,
+ &right, ulen, agbno, aglen);
+ }
+
+ /* Try to merge left and cleft. */
+ ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount;
+ if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) &&
+ left.rc_refcount == cleft.rc_refcount + adjust &&
+ ulen < MAXREFCEXTLEN) {
+ *shape_changed = true;
+ error = xfs_refcount_merge_left_extent(cur, &left, &cleft,
+ agbno, aglen);
+ if (error)
+ return error;
+
+ /*
+ * If we just merged left + cleft and cleft == cright,
+ * we no longer have a cright to merge with right. We're done.
+ */
+ if (cequal)
+ return 0;
+ }
+
+ /* Try to merge cright and right. */
+ ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount;
+ if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) &&
+ right.rc_refcount == cright.rc_refcount + adjust &&
+ ulen < MAXREFCEXTLEN) {
+ *shape_changed = true;
+ return xfs_refcount_merge_right_extent(cur, &right, &cright,
+ agbno, aglen);
+ }
+
+ return error;
+}
+
+/*
+ * While we're adjusting the refcounts records of an extent, we have
+ * to keep an eye on the number of extents we're dirtying -- run too
+ * many in a single transaction and we'll exceed the transaction's
+ * reservation and crash the fs. Each record adds 12 bytes to the
+ * log (plus any key updates) so we'll conservatively assume 32 bytes
+ * per record. We must also leave space for btree splits on both ends
+ * of the range and space for the CUD and a new CUI.
+ *
+ * XXX: This is a pretty hand-wavy estimate. The penalty for guessing
+ * true incorrectly is a shutdown FS; the penalty for guessing false
+ * incorrectly is more transaction rolls than might be necessary.
+ * Be conservative here.
+ */
+static bool
+xfs_refcount_still_have_space(
+ struct xfs_btree_cur *cur)
+{
+ unsigned long overhead;
+
+ overhead = cur->bc_private.a.priv.refc.shape_changes *
+ xfs_allocfree_log_count(cur->bc_mp, 1);
+ overhead *= cur->bc_mp->m_sb.sb_blocksize;
+
+ /*
+ * Only allow 2 refcount extent updates per transaction if the
+ * refcount continue update "error" has been injected.
+ */
+ if (cur->bc_private.a.priv.refc.nr_ops > 2 &&
+ XFS_TEST_ERROR(false, cur->bc_mp,
+ XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE,
+ XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE))
+ return false;
+
+ if (cur->bc_private.a.priv.refc.nr_ops == 0)
+ return true;
+ else if (overhead > cur->bc_tp->t_log_res)
+ return false;
+ return cur->bc_tp->t_log_res - overhead >
+ cur->bc_private.a.priv.refc.nr_ops * 32;
+}
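
To make the estimate concrete with purely illustrative numbers: on a filesystem with 4096-byte blocks where xfs_allocfree_log_count(mp, 1) works out to, say, 10 blocks, each recorded shape change charges 10 * 4096 = 40960 bytes of overhead against t_log_res, and each completed update is assumed to consume another 32 bytes:

	/* continue only while t_log_res - overhead > nr_ops * 32 */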
+
+/*
+ * Adjust the refcounts of middle extents. At this point we should have
+ * split extents that crossed the adjustment range; merged with adjacent
+ * extents; and updated agbno/aglen to reflect the merges. Therefore,
+ * all we have to do is update the extents inside [agbno, agbno + aglen].
+ */
+STATIC int
+xfs_refcount_adjust_extents(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen,
+ enum xfs_refc_adjust_op adj,
+ struct xfs_defer_ops *dfops,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_refcount_irec ext, tmp;
+ int error;
+ int found_rec, found_tmp;
+ xfs_fsblock_t fsbno;
+
+ /* Merging did all the work already. */
+ if (*aglen == 0)
+ return 0;
+
+ error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec);
+ if (error)
+ goto out_error;
+
+ while (*aglen > 0 && xfs_refcount_still_have_space(cur)) {
+ error = xfs_refcount_get_rec(cur, &ext, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec) {
+ ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+ ext.rc_blockcount = 0;
+ ext.rc_refcount = 0;
+ }
+
+ /*
+ * Deal with a hole in the refcount tree; if a file maps to
+ * these blocks and there's no refcountbt record, pretend that
+ * there is one with refcount == 1.
+ */
+ if (ext.rc_startblock != *agbno) {
+ tmp.rc_startblock = *agbno;
+ tmp.rc_blockcount = min(*aglen,
+ ext.rc_startblock - *agbno);
+ tmp.rc_refcount = 1 + adj;
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_private.a.agno, &tmp);
+
+ /*
+ * Either cover the hole (increment) or
+ * delete the range (decrement).
+ */
+ if (tmp.rc_refcount) {
+ error = xfs_refcount_insert(cur, &tmp,
+ &found_tmp);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ found_tmp == 1, out_error);
+ cur->bc_private.a.priv.refc.nr_ops++;
+ } else {
+ fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
+ cur->bc_private.a.agno,
+ tmp.rc_startblock);
+ xfs_bmap_add_free(cur->bc_mp, dfops, fsbno,
+ tmp.rc_blockcount, oinfo);
+ }
+
+ (*agbno) += tmp.rc_blockcount;
+ (*aglen) -= tmp.rc_blockcount;
+
+ error = xfs_refcount_lookup_ge(cur, *agbno,
+ &found_rec);
+ if (error)
+ goto out_error;
+ }
+
+ /* Stop if there's nothing left to modify */
+ if (*aglen == 0 || !xfs_refcount_still_have_space(cur))
+ break;
+
+ /*
+ * Adjust the reference count and either update the tree
+ * (incr) or free the blocks (decr).
+ */
+ if (ext.rc_refcount == MAXREFCOUNT)
+ goto skip;
+ ext.rc_refcount += adj;
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_private.a.agno, &ext);
+ if (ext.rc_refcount > 1) {
+ error = xfs_refcount_update(cur, &ext);
+ if (error)
+ goto out_error;
+ cur->bc_private.a.priv.refc.nr_ops++;
+ } else if (ext.rc_refcount == 1) {
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ found_rec == 1, out_error);
+ cur->bc_private.a.priv.refc.nr_ops++;
+ goto advloop;
+ } else {
+ fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
+ cur->bc_private.a.agno,
+ ext.rc_startblock);
+ xfs_bmap_add_free(cur->bc_mp, dfops, fsbno,
+ ext.rc_blockcount, oinfo);
+ }
+
+skip:
+ error = xfs_btree_increment(cur, 0, &found_rec);
+ if (error)
+ goto out_error;
+
+advloop:
+ (*agbno) += ext.rc_blockcount;
+ (*aglen) -= ext.rc_blockcount;
+ }
+
+ return error;
+out_error:
+ trace_xfs_refcount_modify_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/* Adjust the reference count of a range of AG blocks. */
+STATIC int
+xfs_refcount_adjust(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ xfs_agblock_t *new_agbno,
+ xfs_extlen_t *new_aglen,
+ enum xfs_refc_adjust_op adj,
+ struct xfs_defer_ops *dfops,
+ struct xfs_owner_info *oinfo)
+{
+ bool shape_changed;
+ int shape_changes = 0;
+ int error;
+
+ *new_agbno = agbno;
+ *new_aglen = aglen;
+ if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
+ trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno,
+ agbno, aglen);
+ else
+ trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno,
+ agbno, aglen);
+
+ /*
+ * Ensure that no rcextents cross the boundary of the adjustment range.
+ */
+ error = xfs_refcount_split_extent(cur, agbno, &shape_changed);
+ if (error)
+ goto out_error;
+ if (shape_changed)
+ shape_changes++;
+
+ error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed);
+ if (error)
+ goto out_error;
+ if (shape_changed)
+ shape_changes++;
+
+ /*
+ * Try to merge with the left or right extents of the range.
+ */
+ error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj,
+ XFS_FIND_RCEXT_SHARED, &shape_changed);
+ if (error)
+ goto out_error;
+ if (shape_changed)
+ shape_changes++;
+ if (shape_changes)
+ cur->bc_private.a.priv.refc.shape_changes++;
+
+ /* Now that we've taken care of the ends, adjust the middle extents */
+ error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen,
+ adj, dfops, oinfo);
+ if (error)
+ goto out_error;
+
+ return 0;
+
+out_error:
+ trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/* Clean up after calling xfs_refcount_finish_one. */
+void
+xfs_refcount_finish_one_cleanup(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur,
+ int error)
+{
+ struct xfs_buf *agbp;
+
+ if (rcur == NULL)
+ return;
+ agbp = rcur->bc_private.a.agbp;
+ xfs_btree_del_cursor(rcur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ if (error)
+ xfs_trans_brelse(tp, agbp);
+}
+
+/*
+ * Process one of the deferred refcount operations. We pass back the
+ * btree cursor to maintain our lock on the btree between calls.
+ * This saves time and eliminates a buffer deadlock between the
+ * superblock and the AGF because we'll always grab them in the same
+ * order.
+ */
+int
+xfs_refcount_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dfops,
+ enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount,
+ xfs_fsblock_t *new_fsb,
+ xfs_extlen_t *new_len,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *rcur;
+ struct xfs_buf *agbp = NULL;
+ int error = 0;
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+ xfs_agblock_t new_agbno;
+ unsigned long nr_ops = 0;
+ int shape_changes = 0;
+
+ agno = XFS_FSB_TO_AGNO(mp, startblock);
+ ASSERT(agno != NULLAGNUMBER);
+ bno = XFS_FSB_TO_AGBNO(mp, startblock);
+
+ trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock),
+ type, XFS_FSB_TO_AGBNO(mp, startblock),
+ blockcount);
+
+ if (XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_REFCOUNT_FINISH_ONE,
+ XFS_RANDOM_REFCOUNT_FINISH_ONE))
+ return -EIO;
+
+ /*
+ * If we haven't gotten a cursor or the cursor AG doesn't match
+ * the startblock, get one now.
+ */
+ rcur = *pcur;
+ if (rcur != NULL && rcur->bc_private.a.agno != agno) {
+ nr_ops = rcur->bc_private.a.priv.refc.nr_ops;
+ shape_changes = rcur->bc_private.a.priv.refc.shape_changes;
+ xfs_refcount_finish_one_cleanup(tp, rcur, 0);
+ rcur = NULL;
+ *pcur = NULL;
+ }
+ if (rcur == NULL) {
+ error = xfs_alloc_read_agf(tp->t_mountp, tp, agno,
+ XFS_ALLOC_FLAG_FREEING, &agbp);
+ if (error)
+ return error;
+ if (!agbp)
+ return -EFSCORRUPTED;
+
+ rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, dfops);
+ if (!rcur) {
+ error = -ENOMEM;
+ goto out_cur;
+ }
+ rcur->bc_private.a.priv.refc.nr_ops = nr_ops;
+ rcur->bc_private.a.priv.refc.shape_changes = shape_changes;
+ }
+ *pcur = rcur;
+
+ switch (type) {
+ case XFS_REFCOUNT_INCREASE:
+ error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
+ new_len, XFS_REFCOUNT_ADJUST_INCREASE, dfops, NULL);
+ *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno);
+ break;
+ case XFS_REFCOUNT_DECREASE:
+ error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
+ new_len, XFS_REFCOUNT_ADJUST_DECREASE, dfops, NULL);
+ *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno);
+ break;
+ case XFS_REFCOUNT_ALLOC_COW:
+ *new_fsb = startblock + blockcount;
+ *new_len = 0;
+ error = __xfs_refcount_cow_alloc(rcur, bno, blockcount, dfops);
+ break;
+ case XFS_REFCOUNT_FREE_COW:
+ *new_fsb = startblock + blockcount;
+ *new_len = 0;
+ error = __xfs_refcount_cow_free(rcur, bno, blockcount, dfops);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+ if (!error && *new_len > 0)
+ trace_xfs_refcount_finish_one_leftover(mp, agno, type,
+ bno, blockcount, new_agbno, *new_len);
+ return error;
+
+out_cur:
+ xfs_trans_brelse(tp, agbp);
+
+ return error;
+}
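
A hedged sketch of the caller pattern this enables (the intent list and loop variables here are illustrative; the real loop lives in the deferred-ops processing code):

	struct xfs_btree_cur	*rcur = NULL;

	list_for_each_entry(ri, &intents, ri_list) {
		error = xfs_refcount_finish_one(tp, dfops, ri->ri_type,
				ri->ri_startblock, ri->ri_blockcount,
				&new_fsb, &new_len, &rcur);
		if (error)
			break;
	}
	xfs_refcount_finish_one_cleanup(tp, rcur, error);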
+
+/*
+ * Record a refcount intent for later processing.
+ */
+static int
+__xfs_refcount_add(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount)
+{
+ struct xfs_refcount_intent *ri;
+
+ trace_xfs_refcount_defer(mp, XFS_FSB_TO_AGNO(mp, startblock),
+ type, XFS_FSB_TO_AGBNO(mp, startblock),
+ blockcount);
+
+ ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
+ KM_SLEEP | KM_NOFS);
+ INIT_LIST_HEAD(&ri->ri_list);
+ ri->ri_type = type;
+ ri->ri_startblock = startblock;
+ ri->ri_blockcount = blockcount;
+
+ xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
+ return 0;
+}
+
+/*
+ * Increase the reference count of the blocks backing a file's extent.
+ */
+int
+xfs_refcount_increase_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return 0;
+
+ return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_INCREASE,
+ PREV->br_startblock, PREV->br_blockcount);
+}
+
+/*
+ * Decrease the reference count of the blocks backing a file's extent.
+ */
+int
+xfs_refcount_decrease_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return 0;
+
+ return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_DECREASE,
+ PREV->br_startblock, PREV->br_blockcount);
+}
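
A hypothetical sketch of how these two hooks are meant to be used by remapping code (the surrounding transaction setup is elided):

	/* Mapping an extent into a second file: bump its refcount... */
	error = xfs_refcount_increase_extent(mp, dfops, &irec);

	/* ...and xfs_defer_finish() later replays the logged intent
	 * via xfs_refcount_finish_one().
	 */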
+
+/*
+ * Given an AG extent, find the lowest-numbered run of shared blocks
+ * within that range and return the range in fbno/flen. If
+ * find_end_of_shared is set, return the longest contiguous extent of
+ * shared blocks; if not, just return the first extent we find. If no
+ * shared blocks are found, fbno and flen will be set to NULLAGBLOCK
+ * and 0, respectively.
+ */
+int
+xfs_refcount_find_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ xfs_agblock_t *fbno,
+ xfs_extlen_t *flen,
+ bool find_end_of_shared)
+{
+ struct xfs_refcount_irec tmp;
+ int i;
+ int have;
+ int error;
+
+ trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno,
+ agbno, aglen);
+
+ /* By default, skip the whole range */
+ *fbno = NULLAGBLOCK;
+ *flen = 0;
+
+ /* Try to find a refcount extent that crosses the start */
+ error = xfs_refcount_lookup_le(cur, agbno, &have);
+ if (error)
+ goto out_error;
+ if (!have) {
+ /* No left extent, look at the next one */
+ error = xfs_btree_increment(cur, 0, &have);
+ if (error)
+ goto out_error;
+ if (!have)
+ goto done;
+ }
+ error = xfs_refcount_get_rec(cur, &tmp, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+
+ /* If the extent ends before the start, look at the next one */
+ if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) {
+ error = xfs_btree_increment(cur, 0, &have);
+ if (error)
+ goto out_error;
+ if (!have)
+ goto done;
+ error = xfs_refcount_get_rec(cur, &tmp, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+ }
+
+ /* If the extent starts after the range we want, bail out */
+ if (tmp.rc_startblock >= agbno + aglen)
+ goto done;
+
+ /* We found the start of a shared extent! */
+ if (tmp.rc_startblock < agbno) {
+ tmp.rc_blockcount -= (agbno - tmp.rc_startblock);
+ tmp.rc_startblock = agbno;
+ }
+
+ *fbno = tmp.rc_startblock;
+ *flen = min(tmp.rc_blockcount, agbno + aglen - *fbno);
+ if (!find_end_of_shared)
+ goto done;
+
+ /* Otherwise, find the end of this shared extent */
+ while (*fbno + *flen < agbno + aglen) {
+ error = xfs_btree_increment(cur, 0, &have);
+ if (error)
+ goto out_error;
+ if (!have)
+ break;
+ error = xfs_refcount_get_rec(cur, &tmp, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+ if (tmp.rc_startblock >= agbno + aglen ||
+ tmp.rc_startblock != *fbno + *flen)
+ break;
+ *flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno);
+ }
+
+done:
+ trace_xfs_refcount_find_shared_result(cur->bc_mp,
+ cur->bc_private.a.agno, *fbno, *flen);
+
+out_error:
+ if (error)
+ trace_xfs_refcount_find_shared_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
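
A minimal usage sketch (hypothetical caller): to learn whether any block of an AG extent is shared, ask for the first shared run and test the result:

	error = xfs_refcount_find_shared(cur, agbno, aglen, &fbno, &flen,
			false);
	if (!error && fbno != NULLAGBLOCK) {
		/* at least [fbno, flen] within the range is shared */
	}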
+
+/*
+ * Recovering CoW Blocks After a Crash
+ *
+ * Due to the way that the copy on write mechanism works, there's a window of
+ * opportunity in which we can lose track of allocated blocks during a crash.
+ * Because CoW uses delayed allocation in the in-core CoW fork, writeback
+ * causes blocks to be allocated and stored in the CoW fork. The blocks are
+ * no longer in the free space btree but are not otherwise recorded anywhere
+ * until the write completes and the blocks are mapped into the file. A crash
+ * in between allocation and remapping results in the replacement blocks being
+ * lost. This situation is exacerbated by the CoW extent size hint because
+ * allocations can hang around for a long time.
+ *
+ * However, there is a place where we can record these allocations before they
+ * become mappings -- the reference count btree. The btree does not record
+ * extents with refcount == 1, so we can record allocations with a refcount of
+ * 1. Blocks being used for CoW writeout cannot be shared, so there should be
+ * no conflict with shared block records. These mappings should be created
+ * when we allocate blocks to the CoW fork and deleted when they're removed
+ * from the CoW fork.
+ *
+ * Minor nit: records for in-progress CoW allocations and records for shared
+ * extents must never be merged, to preserve the property that (except for CoW
+ * allocations) there are no refcount btree entries with refcount == 1. The
+ * only time this could potentially happen is when unsharing a block that's
+ * adjacent to CoW allocations, so we must be careful to avoid this.
+ *
+ * At mount time we recover lost CoW allocations by searching the refcount
+ * btree for these refcount == 1 mappings. These represent CoW allocations
+ * that were in progress at the time the filesystem went down, so we can free
+ * them to get the space back.
+ *
+ * This mechanism is superior to creating EFIs for unmapped CoW extents for
+ * several reasons -- first, EFIs pin the tail of the log and would have to be
+ * periodically relogged to avoid filling up the log. Second, CoW completions
+ * will have to file an EFD and create new EFIs for whatever remains in the
+ * CoW fork; this partially takes care of the first problem, but extent-size
+ * will have to periodically relog even if there's no writeout in progress.
+ * This can happen if the CoW extent size hint is set, which you really want.
+ * Third, EFIs cannot currently be automatically relogged into newer
+ * transactions to advance the log tail. Fourth, stuffing the log full of
+ * EFIs places an upper bound on the number of CoW allocations that can be
+ * held filesystem-wide at any given time. Recording them in the refcount
+ * btree doesn't require us to maintain any state in memory and doesn't pin
+ * the log.
+ */
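
The code below implements this by biasing CoW records into a reserved upper portion of the block-number keyspace: xfs_refcount_adjust_cow() adds XFS_REFC_COW_START to the AG block number before touching the tree, and recovery queries everything at or above that bias; a compressed sketch:

	/* staging: the record lands at agbno + XFS_REFC_COW_START */
	agbno += XFS_REFC_COW_START;

	/* recovery: sweep the reserved upper keyspace */
	low.rc.rc_startblock = XFS_REFC_COW_START;
	high.rc.rc_startblock = -1U;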
+/*
+ * Adjust the refcounts of CoW allocations. These allocations are "magic"
+ * in that they're not referenced anywhere else in the filesystem, so we
+ * stash them in the refcount btree with a refcount of 1 until either file
+ * remapping (or CoW cancellation) happens.
+ */
+STATIC int
+xfs_refcount_adjust_cow_extents(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ enum xfs_refc_adjust_op adj,
+ struct xfs_defer_ops *dfops,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_refcount_irec ext, tmp;
+ int error;
+ int found_rec, found_tmp;
+
+ if (aglen == 0)
+ return 0;
+
+ /* Find any overlapping refcount records */
+ error = xfs_refcount_lookup_ge(cur, agbno, &found_rec);
+ if (error)
+ goto out_error;
+ error = xfs_refcount_get_rec(cur, &ext, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec) {
+ ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks +
+ XFS_REFC_COW_START;
+ ext.rc_blockcount = 0;
+ ext.rc_refcount = 0;
+ }
+
+ switch (adj) {
+ case XFS_REFCOUNT_ADJUST_COW_ALLOC:
+ /* Adding a CoW reservation, there should be nothing here. */
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ ext.rc_startblock >= agbno + aglen, out_error);
+
+ tmp.rc_startblock = agbno;
+ tmp.rc_blockcount = aglen;
+ tmp.rc_refcount = 1;
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_private.a.agno, &tmp);
+
+ error = xfs_refcount_insert(cur, &tmp,
+ &found_tmp);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ found_tmp == 1, out_error);
+ break;
+ case XFS_REFCOUNT_ADJUST_COW_FREE:
+ /* Removing a CoW reservation, there should be one extent. */
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ ext.rc_startblock == agbno, out_error);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ ext.rc_blockcount == aglen, out_error);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ ext.rc_refcount == 1, out_error);
+
+ ext.rc_refcount = 0;
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_private.a.agno, &ext);
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ found_rec == 1, out_error);
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ return error;
+out_error:
+ trace_xfs_refcount_modify_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Add or remove refcount btree entries for CoW reservations.
+ */
+STATIC int
+xfs_refcount_adjust_cow(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ enum xfs_refc_adjust_op adj,
+ struct xfs_defer_ops *dfops)
+{
+ bool shape_changed;
+ int error;
+
+ agbno += XFS_REFC_COW_START;
+
+ /*
+ * Ensure that no rcextents cross the boundary of the adjustment range.
+ */
+ error = xfs_refcount_split_extent(cur, agbno, &shape_changed);
+ if (error)
+ goto out_error;
+
+ error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed);
+ if (error)
+ goto out_error;
+
+ /*
+ * Try to merge with the left or right extents of the range.
+ */
+ error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj,
+ XFS_FIND_RCEXT_COW, &shape_changed);
+ if (error)
+ goto out_error;
+
+ /* Now that we've taken care of the ends, adjust the middle extents */
+ error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj,
+ dfops, NULL);
+ if (error)
+ goto out_error;
+
+ return 0;
+
+out_error:
+ trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Record a CoW allocation in the refcount btree.
+ */
+STATIC int
+__xfs_refcount_cow_alloc(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ struct xfs_defer_ops *dfops)
+{
+ int error;
+
+ trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno,
+ agbno, aglen);
+
+ /* Add refcount btree reservation */
+ error = xfs_refcount_adjust_cow(rcur, agbno, aglen,
+ XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops);
+ if (error)
+ return error;
+
+ /* Add rmap entry */
+ if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) {
+ error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops,
+ rcur->bc_private.a.agno,
+ agbno, aglen, XFS_RMAP_OWN_COW);
+ if (error)
+ return error;
+ }
+
+ return error;
+}
+
+/*
+ * Remove a CoW allocation from the refcount btree.
+ */
+STATIC int
+__xfs_refcount_cow_free(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ struct xfs_defer_ops *dfops)
+{
+ int error;
+
+ trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno,
+ agbno, aglen);
+
+ /* Remove refcount btree reservation */
+ error = xfs_refcount_adjust_cow(rcur, agbno, aglen,
+ XFS_REFCOUNT_ADJUST_COW_FREE, dfops);
+ if (error)
+ return error;
+
+ /* Remove rmap entry */
+ if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) {
+ error = xfs_rmap_free_extent(rcur->bc_mp, dfops,
+ rcur->bc_private.a.agno,
+ agbno, aglen, XFS_RMAP_OWN_COW);
+ if (error)
+ return error;
+ }
+
+ return error;
+}
+
+/* Record a CoW staging extent in the refcount btree. */
+int
+xfs_refcount_alloc_cow_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ xfs_fsblock_t fsb,
+ xfs_extlen_t len)
+{
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return 0;
+
+ return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW,
+ fsb, len);
+}
+
+/* Forget a CoW staging event in the refcount btree. */
+int
+xfs_refcount_free_cow_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ xfs_fsblock_t fsb,
+ xfs_extlen_t len)
+{
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return 0;
+
+ return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW,
+ fsb, len);
+}
+
+struct xfs_refcount_recovery {
+ struct list_head rr_list;
+ struct xfs_refcount_irec rr_rrec;
+};
+
+/* Stuff an extent on the recovery list. */
+STATIC int
+xfs_refcount_recover_extent(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct list_head *debris = priv;
+ struct xfs_refcount_recovery *rr;
+
+ if (be32_to_cpu(rec->refc.rc_refcount) != 1)
+ return -EFSCORRUPTED;
+
+ rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP);
+ xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
+ list_add_tail(&rr->rr_list, debris);
+
+ return 0;
+}
+
+/* Find and remove leftover CoW reservations. */
+int
+xfs_refcount_recover_cow_leftovers(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_trans *tp;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ struct xfs_refcount_recovery *rr, *n;
+ struct list_head debris;
+ union xfs_btree_irec low;
+ union xfs_btree_irec high;
+ struct xfs_defer_ops dfops;
+ xfs_fsblock_t fsb;
+ xfs_agblock_t agbno;
+ int error;
+
+ if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START)
+ return -EOPNOTSUPP;
+
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ if (error)
+ return error;
+ cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
+
+ /* Find all the leftover CoW staging extents. */
+ INIT_LIST_HEAD(&debris);
+ memset(&low, 0, sizeof(low));
+ memset(&high, 0, sizeof(high));
+ low.rc.rc_startblock = XFS_REFC_COW_START;
+ high.rc.rc_startblock = -1U;
+ error = xfs_btree_query_range(cur, &low, &high,
+ xfs_refcount_recover_extent, &debris);
+ if (error)
+ goto out_cursor;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_buf_relse(agbp);
+
+ /* Now iterate the list to free the leftovers */
+ list_for_each_entry(rr, &debris, rr_list) {
+ /* Set up transaction. */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+ if (error)
+ goto out_free;
+
+ trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec);
+
+ /* Free the orphan record */
+ xfs_defer_init(&dfops, &fsb);
+ agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
+ fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
+ error = xfs_refcount_free_cow_extent(mp, &dfops, fsb,
+ rr->rr_rrec.rc_blockcount);
+ if (error)
+ goto out_defer;
+
+ /* Free the block. */
+ xfs_bmap_add_free(mp, &dfops, fsb,
+ rr->rr_rrec.rc_blockcount, NULL);
+
+ error = xfs_defer_finish(&tp, &dfops, NULL);
+ if (error)
+ goto out_defer;
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_free;
+ }
+
+out_free:
+ /* Free the leftover list */
+ list_for_each_entry_safe(rr, n, &debris, rr_list) {
+ list_del(&rr->rr_list);
+ kmem_free(rr);
+ }
+ return error;
+
+out_cursor:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ xfs_buf_relse(agbp);
+ goto out_free;
+
+out_defer:
+ xfs_defer_cancel(&dfops);
+ xfs_trans_cancel(tp);
+ goto out_free;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
new file mode 100644
index 000000000000..098dc668ab2c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_REFCOUNT_H__
+#define __XFS_REFCOUNT_H__
+
+extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
+ xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
+ xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec, int *stat);
+
+enum xfs_refcount_intent_type {
+ XFS_REFCOUNT_INCREASE = 1,
+ XFS_REFCOUNT_DECREASE,
+ XFS_REFCOUNT_ALLOC_COW,
+ XFS_REFCOUNT_FREE_COW,
+};
+
+struct xfs_refcount_intent {
+ struct list_head ri_list;
+ enum xfs_refcount_intent_type ri_type;
+ xfs_fsblock_t ri_startblock;
+ xfs_extlen_t ri_blockcount;
+};
+
+extern int xfs_refcount_increase_extent(struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec);
+extern int xfs_refcount_decrease_extent(struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec);
+
+extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur, int error);
+extern int xfs_refcount_finish_one(struct xfs_trans *tp,
+ struct xfs_defer_ops *dfops, enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock, xfs_extlen_t blockcount,
+ xfs_fsblock_t *new_fsb, xfs_extlen_t *new_len,
+ struct xfs_btree_cur **pcur);
+
+extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
+ xfs_extlen_t *flen, bool find_end_of_shared);
+
+extern int xfs_refcount_alloc_cow_extent(struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops, xfs_fsblock_t fsb,
+ xfs_extlen_t len);
+extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops, xfs_fsblock_t fsb,
+ xfs_extlen_t len);
+extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
+ xfs_agnumber_t agno);
+
+#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
new file mode 100644
index 000000000000..453bb2757ec2
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -0,0 +1,451 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_rmap.h"
+
+static struct xfs_btree_cur *
+xfs_refcountbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_private.a.agbp, cur->bc_private.a.agno,
+ cur->bc_private.a.dfops);
+}
+
+STATIC void
+xfs_refcountbt_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
+
+ ASSERT(ptr->s != 0);
+
+ agf->agf_refcount_root = ptr->s;
+ be32_add_cpu(&agf->agf_refcount_level, inc);
+ pag->pagf_refcount_level += inc;
+ xfs_perag_put(pag);
+
+ xfs_alloc_log_agf(cur->bc_tp, agbp,
+ XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL);
+}
+
+STATIC int
+xfs_refcountbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_alloc_arg args; /* block allocation args */
+ int error; /* error return value */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+ xfs_refc_block(args.mp));
+ args.firstblock = args.fsbno;
+ xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC);
+ args.minlen = args.maxlen = args.prod = 1;
+ args.resv = XFS_AG_RESV_METADATA;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto out_error;
+ trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
+ args.agbno, 1);
+ if (args.fsbno == NULLFSBLOCK) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+ ASSERT(args.agno == cur->bc_private.a.agno);
+ ASSERT(args.len == 1);
+
+ new->s = cpu_to_be32(args.agbno);
+ be32_add_cpu(&agf->agf_refcount_blocks, 1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out_error:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+STATIC int
+xfs_refcountbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+ struct xfs_owner_info oinfo;
+ int error;
+
+ trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
+ XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
+ be32_add_cpu(&agf->agf_refcount_blocks, -1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
+	return xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo,
+			XFS_AG_RESV_METADATA);
+}
+
+STATIC int
+xfs_refcountbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_refc_mnr[level != 0];
+}
+
+STATIC int
+xfs_refcountbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_refc_mxr[level != 0];
+}
+
+STATIC void
+xfs_refcountbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ key->refc.rc_startblock = rec->refc.rc_startblock;
+}
+
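+/*
+ * The high key of a record is the last block it covers: for example, a
+ * record with startblock 100 and blockcount 8 has high key 107.
+ */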
+STATIC void
+xfs_refcountbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->refc.rc_startblock);
+ x += be32_to_cpu(rec->refc.rc_blockcount) - 1;
+ key->refc.rc_startblock = cpu_to_be32(x);
+}
+
+STATIC void
+xfs_refcountbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock);
+ rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount);
+ rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount);
+}
+
+STATIC void
+xfs_refcountbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(agf->agf_refcount_root != 0);
+
+ ptr->s = agf->agf_refcount_root;
+}
+
+STATIC __int64_t
+xfs_refcountbt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ struct xfs_refcount_irec *rec = &cur->bc_rec.rc;
+ struct xfs_refcount_key *kp = &key->refc;
+
+ return (__int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock;
+}
+
+STATIC __int64_t
+xfs_refcountbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return (__int64_t)be32_to_cpu(k1->refc.rc_startblock) -
+ be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC bool
+xfs_refcountbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ unsigned int level;
+
+ if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC))
+ return false;
+
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return false;
+ if (!xfs_btree_sblock_v5hdr_verify(bp))
+ return false;
+
+ level = be16_to_cpu(block->bb_level);
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_refcount_level)
+ return false;
+ } else if (level >= mp->m_refc_maxlevels)
+ return false;
+
+ return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]);
+}
+
+STATIC void
+xfs_refcountbt_read_verify(
+ struct xfs_buf *bp)
+{
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_buf_ioerror(bp, -EFSBADCRC);
+ else if (!xfs_refcountbt_verify(bp))
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+ if (bp->b_error) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp);
+ }
+}
+
+STATIC void
+xfs_refcountbt_write_verify(
+ struct xfs_buf *bp)
+{
+ if (!xfs_refcountbt_verify(bp)) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+	xfs_btree_sblock_calc_crc(bp);
+}
+
+const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
+ .name = "xfs_refcountbt",
+ .verify_read = xfs_refcountbt_read_verify,
+ .verify_write = xfs_refcountbt_write_verify,
+};
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_refcountbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->refc.rc_startblock) <
+ be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC int
+xfs_refcountbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->refc.rc_startblock) +
+ be32_to_cpu(r1->refc.rc_blockcount) <=
+ be32_to_cpu(r2->refc.rc_startblock);
+}
+#endif
+
+static const struct xfs_btree_ops xfs_refcountbt_ops = {
+ .rec_len = sizeof(struct xfs_refcount_rec),
+ .key_len = sizeof(struct xfs_refcount_key),
+
+ .dup_cursor = xfs_refcountbt_dup_cursor,
+ .set_root = xfs_refcountbt_set_root,
+ .alloc_block = xfs_refcountbt_alloc_block,
+ .free_block = xfs_refcountbt_free_block,
+ .get_minrecs = xfs_refcountbt_get_minrecs,
+ .get_maxrecs = xfs_refcountbt_get_maxrecs,
+ .init_key_from_rec = xfs_refcountbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_refcountbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur,
+ .key_diff = xfs_refcountbt_key_diff,
+ .buf_ops = &xfs_refcountbt_buf_ops,
+ .diff_two_keys = xfs_refcountbt_diff_two_keys,
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_refcountbt_keys_inorder,
+ .recs_inorder = xfs_refcountbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new refcount btree cursor.
+ */
+struct xfs_btree_cur *
+xfs_refcountbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ struct xfs_defer_ops *dfops)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_btree_cur *cur;
+
+ ASSERT(agno != NULLAGNUMBER);
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_btnum = XFS_BTNUM_REFC;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur->bc_ops = &xfs_refcountbt_ops;
+
+ cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+
+ cur->bc_private.a.agbp = agbp;
+ cur->bc_private.a.agno = agno;
+ cur->bc_private.a.dfops = dfops;
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ cur->bc_private.a.priv.refc.nr_ops = 0;
+ cur->bc_private.a.priv.refc.shape_changes = 0;
+
+ return cur;
+}
+
+/*
+ * Calculate the number of records in a refcount btree block.
+ */
+int
+xfs_refcountbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ bool leaf)
+{
+ blocklen -= XFS_REFCOUNT_BLOCK_LEN;
+
+ if (leaf)
+ return blocklen / sizeof(struct xfs_refcount_rec);
+ return blocklen / (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_refcount_ptr_t));
+}
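+
+/*
+ * Worked example (illustrative numbers, assuming a v5 filesystem with
+ * 4096-byte blocks): blocklen = 4096 - 56 = 4040, so a leaf holds
+ * 4040 / 12 = 336 records and a node holds 4040 / (4 + 4) = 505
+ * key/pointer pairs.
+ */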
+
+/* Compute the maximum height of a refcount btree. */
+void
+xfs_refcountbt_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp,
+ mp->m_refc_mnr, mp->m_sb.sb_agblocks);
+}
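+
+/*
+ * Worked example (assumed geometry, continuing the numbers above):
+ * minrecs are half of maxrecs, i.e. 168 (leaf) and 252 (node). An AG
+ * of 1048576 blocks then needs at most ceil(1048576/168) = 6242 leaves
+ * and ceil(6242/252) = 25 nodes under a single root, for a maximum
+ * height of 3.
+ */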
+
+/* Calculate the refcount btree size for some records. */
+xfs_extlen_t
+xfs_refcountbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp, mp->m_refc_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size.
+ */
+xfs_extlen_t
+xfs_refcountbt_max_size(
+ struct xfs_mount *mp)
+{
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (mp->m_refc_mxr[0] == 0)
+ return 0;
+
+ return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_refcountbt_calc_reserves(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_extlen_t *ask,
+ xfs_extlen_t *used)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agf *agf;
+ xfs_extlen_t tree_len;
+ int error;
+
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return 0;
+
+ *ask += xfs_refcountbt_max_size(mp);
+
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ if (error)
+ return error;
+
+ agf = XFS_BUF_TO_AGF(agbp);
+ tree_len = be32_to_cpu(agf->agf_refcount_blocks);
+ xfs_buf_relse(agbp);
+
+ *used += tree_len;
+
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
new file mode 100644
index 000000000000..3be7768bd51a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_REFCOUNT_BTREE_H__
+#define __XFS_REFCOUNT_BTREE_H__
+
+/*
+ * Reference Count Btree on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+/*
+ * Btree block header size
+ */
+#define XFS_REFCOUNT_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_REFCOUNT_REC_ADDR(block, index) \
+ ((struct xfs_refcount_rec *) \
+ ((char *)(block) + \
+ XFS_REFCOUNT_BLOCK_LEN + \
+ (((index) - 1) * sizeof(struct xfs_refcount_rec))))
+
+#define XFS_REFCOUNT_KEY_ADDR(block, index) \
+ ((struct xfs_refcount_key *) \
+ ((char *)(block) + \
+ XFS_REFCOUNT_BLOCK_LEN + \
+ ((index) - 1) * sizeof(struct xfs_refcount_key)))
+
+#define XFS_REFCOUNT_PTR_ADDR(block, index, maxrecs) \
+ ((xfs_refcount_ptr_t *) \
+ ((char *)(block) + \
+ XFS_REFCOUNT_BLOCK_LEN + \
+ (maxrecs) * sizeof(struct xfs_refcount_key) + \
+ ((index) - 1) * sizeof(xfs_refcount_ptr_t)))
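+
+/*
+ * Example (illustrative): record indices within a block are 1-based,
+ * as in the other XFS btrees, so a walk of a leaf might look like
+ *
+ *	for (i = 1; i <= be16_to_cpu(block->bb_numrecs); i++) {
+ *		rec = XFS_REFCOUNT_REC_ADDR(block, i);
+ *		...
+ *	}
+ */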
+
+extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno,
+ struct xfs_defer_ops *dfops);
+extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen,
+ bool leaf);
+extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
+
+extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp);
+
+extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
+ xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+
+#endif /* __XFS_REFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 73d05407d663..3a8cc7139912 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -148,6 +148,37 @@ done:
return error;
}
+STATIC int
+xfs_rmap_delete(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ int i;
+ int error;
+
+ trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno,
+ len, owner, offset, flags);
+
+ error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+
+ error = xfs_btree_delete(rcur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+done:
+ if (error)
+ trace_xfs_rmap_delete_error(rcur->bc_mp,
+ rcur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
static int
xfs_rmap_btrec_to_irec(
union xfs_btree_rec *rec,
@@ -180,6 +211,160 @@ xfs_rmap_get_rec(
return xfs_rmap_btrec_to_irec(rec, irec);
}
+struct xfs_find_left_neighbor_info {
+ struct xfs_rmap_irec high;
+ struct xfs_rmap_irec *irec;
+ int *stat;
+};
+
+/* For each rmap given, figure out if it matches the key we want. */
+STATIC int
+xfs_rmap_find_left_neighbor_helper(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_find_left_neighbor_info *info = priv;
+
+ trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp,
+ cur->bc_private.a.agno, rec->rm_startblock,
+ rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
+ rec->rm_flags);
+
+ if (rec->rm_owner != info->high.rm_owner)
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+ rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+
+ *info->irec = *rec;
+ *info->stat = 1;
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+}
+
+/*
+ * Find the record to the left of the given extent, being careful only to
+ * return a match with the same owner and adjacent physical and logical
+ * block ranges.
+ */
+int
+xfs_rmap_find_left_neighbor(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ struct xfs_find_left_neighbor_info info;
+ int error;
+
+ *stat = 0;
+ if (bno == 0)
+ return 0;
+ info.high.rm_startblock = bno - 1;
+ info.high.rm_owner = owner;
+ if (!XFS_RMAP_NON_INODE_OWNER(owner) &&
+ !(flags & XFS_RMAP_BMBT_BLOCK)) {
+ if (offset == 0)
+ return 0;
+ info.high.rm_offset = offset - 1;
+ } else
+ info.high.rm_offset = 0;
+ info.high.rm_flags = flags;
+ info.high.rm_blockcount = 0;
+ info.irec = irec;
+ info.stat = stat;
+
+ trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
+ cur->bc_private.a.agno, bno, 0, owner, offset, flags);
+
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_find_left_neighbor_helper, &info);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ error = 0;
+ if (*stat)
+ trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+ cur->bc_private.a.agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner,
+ irec->rm_offset, irec->rm_flags);
+ return error;
+}
+
+/* For each rmap given, figure out if it matches the key we want. */
+STATIC int
+xfs_rmap_lookup_le_range_helper(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_find_left_neighbor_info *info = priv;
+
+ trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp,
+ cur->bc_private.a.agno, rec->rm_startblock,
+ rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
+ rec->rm_flags);
+
+ if (rec->rm_owner != info->high.rm_owner)
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+ (rec->rm_offset > info->high.rm_offset ||
+ rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+
+ *info->irec = *rec;
+ *info->stat = 1;
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+}
+
+/*
+ * Find the record to the left of the given extent, being careful only to
+ * return a match with the same owner and overlapping physical and logical
+ * block ranges. This is the overlapping-interval version of
+ * xfs_rmap_lookup_le.
+ */
+int
+xfs_rmap_lookup_le_range(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ struct xfs_find_left_neighbor_info info;
+ int error;
+
+ info.high.rm_startblock = bno;
+ info.high.rm_owner = owner;
+ if (!XFS_RMAP_NON_INODE_OWNER(owner) && !(flags & XFS_RMAP_BMBT_BLOCK))
+ info.high.rm_offset = offset;
+ else
+ info.high.rm_offset = 0;
+ info.high.rm_flags = flags;
+ info.high.rm_blockcount = 0;
+ *stat = 0;
+ info.irec = irec;
+ info.stat = stat;
+
+ trace_xfs_rmap_lookup_le_range(cur->bc_mp,
+ cur->bc_private.a.agno, bno, 0, owner, offset, flags);
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_lookup_le_range_helper, &info);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ error = 0;
+ if (*stat)
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_private.a.agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner,
+ irec->rm_offset, irec->rm_flags);
+ return error;
+}
+
/*
* Find the extent in the rmap btree and remove it.
*
@@ -1093,11 +1278,704 @@ done:
return error;
}
+/*
+ * Convert an unwritten extent to a real extent or vice versa. If there is no
+ * possibility of overlapping extents, delegate to the simpler convert
+ * function.
+ */
+STATIC int
+xfs_rmap_convert_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec r[4]; /* neighbor extent entries */
+ /* left is 0, right is 1, prev is 2 */
+ /* new is 3 */
+ uint64_t owner;
+ uint64_t offset;
+ uint64_t new_endoff;
+ unsigned int oldext;
+ unsigned int newext;
+ unsigned int flags = 0;
+ int i;
+ int state = 0;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
+ oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
+ new_endoff = offset + len;
+ trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+	 * For the initial lookup, look for an exact match or the left-adjacent
+ * record for our insertion point. This will also give us the record for
+ * start block contiguity tests.
+ */
+	error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
+			&PREV, &i);
+	if (error)
+		goto done;
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+
+ ASSERT(PREV.rm_offset <= offset);
+ ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
+ ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext);
+ newext = ~oldext & XFS_RMAP_UNWRITTEN;
+
+ /*
+ * Set flags determining what part of the previous oldext allocation
+ * extent is being replaced by a newext allocation.
+ */
+ if (PREV.rm_offset == offset)
+ state |= RMAP_LEFT_FILLING;
+ if (PREV.rm_offset + PREV.rm_blockcount == new_endoff)
+ state |= RMAP_RIGHT_FILLING;
+
+ /* Is there a left record that abuts our range? */
+ error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, newext,
+ &LEFT, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_LEFT_VALID;
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ LEFT.rm_startblock + LEFT.rm_blockcount <= bno,
+ done);
+ if (xfs_rmap_is_mergeable(&LEFT, owner, newext))
+ state |= RMAP_LEFT_CONTIG;
+ }
+
+ /* Is there a right record that abuts our range? */
+ error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len,
+ newext, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_RIGHT_VALID;
+ error = xfs_rmap_get_rec(cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock,
+ done);
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_private.a.agno, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (xfs_rmap_is_mergeable(&RIGHT, owner, newext))
+ state |= RMAP_RIGHT_CONTIG;
+ }
+
+ /* check that left + prev + right is not too long */
+ if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) ==
+ (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) &&
+ (unsigned long)LEFT.rm_blockcount + len +
+ RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
+ state &= ~RMAP_RIGHT_CONTIG;
+
+ trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state,
+ _RET_IP_);
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
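+	/*
+	 * For example, RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG means the
+	 * converted range starts exactly at PREV's offset and can merge
+	 * left: that case deletes PREV, re-inserts its unconverted
+	 * remainder, and extends LEFT by len blocks.
+	 */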
+ switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) {
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left and right neighbors are both contiguous with new.
+ */
+ error = xfs_rmap_delete(cur, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (error)
+ goto done;
+ error = xfs_rmap_delete(cur, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left neighbor is contiguous, the right is not.
+ */
+ error = xfs_rmap_delete(cur, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount += PREV.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The right neighbor is contiguous, the left is not.
+ */
+ error = xfs_rmap_delete(cur, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (error)
+ goto done;
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount += RIGHT.rm_blockcount;
+ NEW.rm_flags = RIGHT.rm_flags;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_flags = newext;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is contiguous.
+ */
+ NEW = PREV;
+ error = xfs_rmap_delete(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW.rm_offset += len;
+ NEW.rm_startblock += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is not contiguous.
+ */
+ NEW = PREV;
+ error = xfs_rmap_delete(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW.rm_offset += len;
+ NEW.rm_startblock += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ error = xfs_rmap_insert(cur, bno, len, owner, offset, newext);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount = offset - NEW.rm_offset;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ NEW = RIGHT;
+ error = xfs_rmap_delete(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW.rm_offset = offset;
+ NEW.rm_startblock = bno;
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_RIGHT_FILLING:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is not contiguous.
+ */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ error = xfs_rmap_insert(cur, bno, len, owner, offset, newext);
+ if (error)
+ goto done;
+ break;
+
+ case 0:
+ /*
+ * Setting the middle part of a previous oldext extent to
+ * newext. Contiguity is impossible here.
+ * One extent becomes three extents.
+ */
+ /* new right extent - oldext */
+ NEW.rm_startblock = bno + len;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = new_endoff;
+ NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount -
+ new_endoff;
+ NEW.rm_flags = PREV.rm_flags;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
+ NEW.rm_flags);
+ if (error)
+ goto done;
+ /* new left extent - oldext */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount = offset - NEW.rm_offset;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ /* new middle extent - newext */
+ NEW.rm_startblock = bno;
+ NEW.rm_blockcount = len;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = offset;
+ NEW.rm_flags = newext;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
+ NEW.rm_flags);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_CONTIG:
+ case RMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+
+ trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+done:
+ if (error)
+ trace_xfs_rmap_convert_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
#undef NEW
#undef LEFT
#undef RIGHT
#undef PREV
+/*
+ * Find an extent in the rmap btree and unmap it. For rmap extent types that
+ * can overlap (data fork rmaps on reflink filesystems) we must be careful
+ * that the prev/next records in the btree might belong to another owner.
+ * Therefore we must use delete+insert to alter any of the key fields.
+ *
+ * For every other situation there can only be one owner for a given extent,
+ * so we can call the regular _free function.
+ */
+STATIC int
+xfs_rmap_unmap_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ uint64_t ltoff;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * We should always have a left record because there's a static record
+ * for the AG headers at rm_startblock == 0 created by mkfs/growfs that
+ * will not ever be removed from the tree.
+ */
+ error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
+ &ltrec, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ ltoff = ltrec.rm_offset;
+
+ /* Make sure the extent we found covers the entire freeing range. */
+ XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno &&
+ ltrec.rm_startblock + ltrec.rm_blockcount >=
+ bno + len, out_error);
+
+ /* Make sure the owner matches what we expect to find in the tree. */
+ XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner, out_error);
+
+ /* Make sure the unwritten flag matches. */
+ XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
+ (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error);
+
+ /* Check the offset. */
+ XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_offset <= offset, out_error);
+ XFS_WANT_CORRUPTED_GOTO(mp, offset <= ltoff + ltrec.rm_blockcount,
+ out_error);
+
+ if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
+ /* Exact match, simply remove the record from rmap tree. */
+ error = xfs_rmap_delete(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ if (error)
+ goto out_error;
+ } else if (ltrec.rm_startblock == bno) {
+ /*
+ * Overlap left hand side of extent: move the start, trim the
+ * length and update the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+
+ /* Delete prev rmap. */
+ error = xfs_rmap_delete(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ if (error)
+ goto out_error;
+
+ /* Add an rmap at the new offset. */
+ ltrec.rm_startblock += len;
+ ltrec.rm_blockcount -= len;
+ ltrec.rm_offset += len;
+ error = xfs_rmap_insert(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ if (error)
+ goto out_error;
+ } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) {
+ /*
+ * Overlap right hand side of extent: trim the length and
+ * update the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+ error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ ltrec.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else {
+ /*
+ * Overlap middle of extent: trim the length of the existing
+ * record to the length of the new left-extent size, increment
+ * the insertion position so we can insert a new record
+ * containing the remaining right-extent space.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrr| |rrrr|
+ * bno len
+ */
+ xfs_extlen_t orig_len = ltrec.rm_blockcount;
+
+ /* Shrink the left side of the rmap */
+ error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ ltrec.rm_blockcount = bno - ltrec.rm_startblock;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+
+ /* Add an rmap at the new offset */
+ error = xfs_rmap_insert(cur, bno + len,
+ orig_len - len - ltrec.rm_blockcount,
+ ltrec.rm_owner, offset + len,
+ ltrec.rm_flags);
+ if (error)
+ goto out_error;
+ }
+
+ trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_unmap_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Find an extent in the rmap btree and map it. For rmap extent types that
+ * can overlap (data fork rmaps on reflink filesystems) we must be careful
+ * that the prev/next records in the btree might belong to another owner.
+ * Therefore we must use delete+insert to alter any of the key fields.
+ *
+ * For every other situation there can only be one owner for a given extent,
+ * so we can call the regular _alloc function.
+ */
+STATIC int
+xfs_rmap_map_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ struct xfs_rmap_irec gtrec;
+ int have_gt;
+ int have_lt;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags = 0;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+
+ /* Is there a left record that abuts our range? */
+ error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags,
+ &ltrec, &have_lt);
+ if (error)
+ goto out_error;
+ if (have_lt &&
+ !xfs_rmap_is_mergeable(&ltrec, owner, flags))
+ have_lt = 0;
+
+ /* Is there a right record that abuts our range? */
+ error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len,
+ flags, &have_gt);
+ if (error)
+ goto out_error;
+ if (have_gt) {
+ error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error);
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_private.a.agno, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+
+ if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
+ have_gt = 0;
+ }
+
+ if (have_lt &&
+ ltrec.rm_startblock + ltrec.rm_blockcount == bno &&
+ ltrec.rm_offset + ltrec.rm_blockcount == offset) {
+ /*
+ * Left edge contiguous, merge into left record.
+ *
+ * ltbno ltlen
+ * orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_blockcount += len;
+ if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ offset + len == gtrec.rm_offset) {
+ /*
+ * Right edge also contiguous, delete right record
+ * and merge into left record.
+ *
+ * ltbno ltlen gtbno gtlen
+ * orig: |ooooooooo| |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
+ */
+ ltrec.rm_blockcount += gtrec.rm_blockcount;
+ error = xfs_rmap_delete(cur, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (error)
+ goto out_error;
+ }
+
+ /* Point the cursor back to the left record and update. */
+ error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ offset + len == gtrec.rm_offset) {
+ /*
+ * Right edge contiguous, merge into right record.
+ *
+ * gtbno gtlen
+ * Orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * Result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ /* Delete the old record. */
+ error = xfs_rmap_delete(cur, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (error)
+ goto out_error;
+
+ /* Move the start and re-add it. */
+ gtrec.rm_startblock = bno;
+ gtrec.rm_blockcount += len;
+ gtrec.rm_offset = offset;
+ error = xfs_rmap_insert(cur, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (error)
+ goto out_error;
+ } else {
+ /*
+ * No contiguous edge with identical owner, insert
+ * new record at current cursor position.
+ */
+ error = xfs_rmap_insert(cur, bno, len, owner, offset, flags);
+ if (error)
+ goto out_error;
+ }
+
+ trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_map_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
struct xfs_rmap_query_range_info {
xfs_rmap_query_range_fn fn;
void *priv;
@@ -1237,15 +2115,27 @@ xfs_rmap_finish_one(
case XFS_RMAP_MAP:
error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo);
break;
+ case XFS_RMAP_MAP_SHARED:
+ error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten,
+ &oinfo);
+ break;
case XFS_RMAP_FREE:
case XFS_RMAP_UNMAP:
error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten,
&oinfo);
break;
+ case XFS_RMAP_UNMAP_SHARED:
+ error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten,
+ &oinfo);
+ break;
case XFS_RMAP_CONVERT:
error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten,
&oinfo);
break;
+ case XFS_RMAP_CONVERT_SHARED:
+ error = xfs_rmap_convert_shared(rcur, bno, blockcount,
+ !unwritten, &oinfo);
+ break;
default:
ASSERT(0);
error = -EFSCORRUPTED;
@@ -1263,9 +2153,10 @@ out_cur:
*/
static bool
xfs_rmap_update_is_needed(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ int whichfork)
{
- return xfs_sb_version_hasrmapbt(&mp->m_sb);
+ return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK;
}
/*
@@ -1311,10 +2202,11 @@ xfs_rmap_map_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_rmap_update_is_needed(mp))
+ if (!xfs_rmap_update_is_needed(mp, whichfork))
return 0;
- return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino,
+ return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ?
+ XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
whichfork, PREV);
}
@@ -1327,10 +2219,11 @@ xfs_rmap_unmap_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_rmap_update_is_needed(mp))
+ if (!xfs_rmap_update_is_needed(mp, whichfork))
return 0;
- return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino,
+ return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ?
+ XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
whichfork, PREV);
}
@@ -1343,10 +2236,11 @@ xfs_rmap_convert_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_rmap_update_is_needed(mp))
+ if (!xfs_rmap_update_is_needed(mp, whichfork))
return 0;
- return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino,
+ return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ?
+ XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
whichfork, PREV);
}
@@ -1362,7 +2256,7 @@ xfs_rmap_alloc_extent(
{
struct xfs_bmbt_irec bmap;
- if (!xfs_rmap_update_is_needed(mp))
+ if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK))
return 0;
bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
@@ -1386,7 +2280,7 @@ xfs_rmap_free_extent(
{
struct xfs_bmbt_irec bmap;
- if (!xfs_rmap_update_is_needed(mp))
+ if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK))
return 0;
bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 71cf99a4acba..789930599339 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -206,4 +206,11 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
xfs_fsblock_t startblock, xfs_filblks_t blockcount,
xfs_exntst_t state, struct xfs_btree_cur **pcur);
+int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ uint64_t owner, uint64_t offset, unsigned int flags,
+ struct xfs_rmap_irec *irec, int *stat);
+int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ uint64_t owner, uint64_t offset, unsigned int flags,
+ struct xfs_rmap_irec *irec, int *stat);
+
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 17b8eeb34ac8..83e672ff7577 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -35,6 +35,7 @@
#include "xfs_cksum.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
+#include "xfs_ag_resv.h"
/*
* Reverse map btree.
@@ -512,6 +513,83 @@ void
xfs_rmapbt_compute_maxlevels(
struct xfs_mount *mp)
{
- mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
- mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+ /*
+ * On a non-reflink filesystem, the maximum number of rmap
+ * records is the number of blocks in the AG, hence the max
+ * rmapbt height is log_$maxrecs($agblocks). However, with
+ * reflink each AG block can have up to 2^32 (per the refcount
+ * record format) owners, which means that theoretically we
+ * could face up to 2^64 rmap records.
+ *
+ * That effectively means that the max rmapbt height must be
+ * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
+ * blocks to feed the rmapbt long before the rmapbt reaches
+ * maximum height. The reflink code uses ag_resv_critical to
+	 * disallow reflinking when less than 10% of the per-AG metadata
+	 * block reservation is free, since the fallback is a regular
+	 * file copy.
+ */
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
+ else
+ mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
+ mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+}
+
+/* Calculate the rmap btree size for some records. */
+xfs_extlen_t
+xfs_rmapbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len);
+}
+
+/*
+ * Calculate the maximum rmap btree size.
+ */
+xfs_extlen_t
+xfs_rmapbt_max_size(
+ struct xfs_mount *mp)
+{
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (mp->m_rmap_mxr[0] == 0)
+ return 0;
+
+ return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_rmapbt_calc_reserves(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_extlen_t *ask,
+ xfs_extlen_t *used)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agf *agf;
+ xfs_extlen_t pool_len;
+ xfs_extlen_t tree_len;
+ int error;
+
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return 0;
+
+ /* Reserve 1% of the AG or enough for 1 block per record. */
+ pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp));
+ *ask += pool_len;
+
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ if (error)
+ return error;
+
+ agf = XFS_BUF_TO_AGF(agbp);
+ tree_len = be32_to_cpu(agf->agf_rmap_blocks);
+ xfs_buf_relse(agbp);
+
+ *used += tree_len;
+
+ return error;
}
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index e73a55357dab..2a9ac472fb15 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -58,4 +58,11 @@ struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf);
extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
+extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp);
+
+extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp,
+ xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+
#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 4aecc5fefe96..a70aec910626 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -38,6 +38,8 @@
#include "xfs_ialloc_btree.h"
#include "xfs_log.h"
#include "xfs_rmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -737,6 +739,13 @@ xfs_sb_mount_common(
mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
+ mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize,
+ true);
+ mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize,
+ false);
+ mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
+ mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
+
mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
sbp->sb_inopblock);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 0c5b30bd884c..c6f4eb46fe26 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -39,6 +39,7 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agfl_buf_ops;
extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
+extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
@@ -122,6 +123,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *);
#define XFS_INO_REF 2
#define XFS_ATTR_BTREE_REF 1
#define XFS_DQUOT_REF 1
+#define XFS_REFC_BTREE_REF 1
/*
* Flags for xfs_trans_ichgtime().
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 301ef2f4dbd6..b456cca1bfb2 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -67,13 +67,14 @@ xfs_calc_buf_res(
* Per-extent log reservation for the btree changes involved in freeing or
* allocating an extent. In classic XFS there are two trees that will be
* modified (bnobt + cntbt). With rmap enabled, there are three trees
- * (rmapbt). The number of blocks reserved is based on the formula:
+ * (rmapbt). With reflink, there are four trees (refcountbt). The number of
+ * blocks reserved is based on the formula:
*
* num trees * ((2 blocks/level * max depth) - 1)
*
* Keep in mind that max depth is calculated separately for each type of tree.
*/
-static uint
+uint
xfs_allocfree_log_count(
struct xfs_mount *mp,
uint num_ops)
@@ -83,6 +84,8 @@ xfs_allocfree_log_count(
blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1);
if (xfs_sb_version_hasrmapbt(&mp->m_sb))
blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ blocks += num_ops * (2 * mp->m_refc_maxlevels - 1);
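+	/*
+	 * Example (assumed geometry): with free space btrees of max
+	 * depth 3, an rmapbt of max depth 5, and a refcountbt of max
+	 * depth 3, one op reserves 2*(2*3-1) + (2*5-1) + (2*3-1) = 24
+	 * blocks.
+	 */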
return blocks;
}
@@ -809,11 +812,18 @@ xfs_trans_resv_calc(
* require a permanent reservation on space.
*/
resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ else
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
- resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ resp->tr_itruncate.tr_logcount =
+ XFS_ITRUNCATE_LOG_COUNT_REFLINK;
+ else
+ resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
@@ -870,7 +880,10 @@ xfs_trans_resv_calc(
resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
- resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ else
+ resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
/*
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 0eb46ed6d404..b7e5357d060a 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -87,6 +87,7 @@ struct xfs_trans_resv {
#define XFS_DEFAULT_LOG_COUNT 1
#define XFS_DEFAULT_PERM_LOG_COUNT 2
#define XFS_ITRUNCATE_LOG_COUNT 2
+#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
#define XFS_INACTIVE_LOG_COUNT 2
#define XFS_CREATE_LOG_COUNT 2
#define XFS_CREATE_TMPFILE_LOG_COUNT 2
@@ -96,11 +97,13 @@ struct xfs_trans_resv {
#define XFS_LINK_LOG_COUNT 2
#define XFS_RENAME_LOG_COUNT 2
#define XFS_WRITE_LOG_COUNT 2
+#define XFS_WRITE_LOG_COUNT_REFLINK 8
#define XFS_ADDAFORK_LOG_COUNT 2
#define XFS_ATTRINVAL_LOG_COUNT 1
#define XFS_ATTRSET_LOG_COUNT 3
#define XFS_ATTRRM_LOG_COUNT 3
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
+uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops);
#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 41e0428d8175..7917f6e44286 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -21,6 +21,8 @@
/*
* Components of space reservations.
*/
+#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \
+ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \
(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1)
@@ -28,6 +30,13 @@
(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
XFS_EXTENTADD_SPACE_RES(mp,w))
+#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\
+ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+ XFS_EXTENTADD_SPACE_RES(mp,w) + \
+ ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
+ XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
+ (mp)->m_rmap_maxlevels)
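+/*
+ * Worked example for the above (assumed geometry): swapping b = 1008
+ * blocks with 126 contiguous extents per block, an extent-add res of
+ * 4, 168 contiguous rmaps per block, and m_rmap_maxlevels = 5 needs
+ * (1008/126) * 4 + (1008/168) * 5 = 32 + 30 = 62 reserved blocks.
+ */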
#define XFS_DAENTER_1B(mp,w) \
((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
#define XFS_DAENTER_DBS(mp,w) \
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 3d503647f26b..8d74870468c2 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -90,6 +90,7 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
*/
#define XFS_DATA_FORK 0
#define XFS_ATTR_FORK 1
+#define XFS_COW_FORK 2
/*
* Min numbers of data/attr fork btree root pointers.
@@ -109,7 +110,7 @@ typedef enum {
typedef enum {
XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
- XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX
+ XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
} xfs_btnum_t;
struct xfs_name {