summary | refs | log | tree | commit | diff
path: root/fs/xfs/libxfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/libxfs')
-rw-r--r-- fs/xfs/libxfs/xfs_ag_resv.c 325
-rw-r--r-- fs/xfs/libxfs/xfs_ag_resv.h 35
-rw-r--r-- fs/xfs/libxfs/xfs_alloc.c 135
-rw-r--r-- fs/xfs/libxfs/xfs_alloc.h 25
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.c 136
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.h 12
-rw-r--r-- fs/xfs/libxfs/xfs_btree.c 59
-rw-r--r-- fs/xfs/libxfs/xfs_btree.h 28
-rw-r--r-- fs/xfs/libxfs/xfs_defer.c 79
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc_btree.c 2
-rw-r--r-- fs/xfs/libxfs/xfs_log_format.h 10
11 files changed, 655 insertions, 191 deletions
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
new file mode 100644
index 000000000000..e3ae0f2b4294
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ag_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_btree.h"
+
+/*
+ * Per-AG Block Reservations
+ *
+ * For some kinds of allocation group metadata structures, it is advantageous
+ * to reserve a small number of blocks in each AG so that future expansions of
+ * that data structure do not encounter ENOSPC because errors during a btree
+ * split cause the filesystem to go offline.
+ *
+ * Prior to the introduction of reflink, this wasn't an issue because the free
+ * space btrees maintain a reserve of space (the AGFL) to handle any expansion
+ * that may be necessary; and allocations of other metadata (inodes, BMBT,
+ * dir/attr) aren't restricted to a single AG. However, with reflink it is
+ * possible to allocate all the space in an AG, have subsequent reflink/CoW
+ * activity expand the refcount btree, and discover that there's no space left
+ * to handle that expansion. Since we can calculate the maximum size of the
+ * refcount btree, we can reserve space for it and avoid ENOSPC.
+ *
+ * Handling per-AG reservations consists of four changes to the allocator's
+ * behavior: First, because these reservations are always needed, we decrease
+ * the ag_max_usable counter to reflect the size of the AG after the reserved
+ * blocks are taken. Second, the reservations must be reflected in the
+ * fdblocks count to maintain proper accounting. Third, each AG must maintain
+ * its own reserved block counter so that we can calculate the amount of space
+ * that must remain free to maintain the reservations. Fourth, the "remaining
+ * reserved blocks" count must be used when calculating the length of the
+ * longest free extent in an AG and to clamp maxlen in the per-AG allocation
+ * functions. In other words, we maintain a virtual allocation via in-core
+ * accounting tricks so that we don't have to clean up after a crash. :)
+ *
+ * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
+ * values via struct xfs_alloc_arg or directly to the xfs_free_extent
+ * function. It might seem a little funny to maintain a reservoir of blocks
+ * to feed another reservoir, but the AGFL only holds enough blocks to get
+ * through the next transaction. The per-AG reservation is to ensure (we
+ * hope) that each AG never runs out of blocks. Each data structure wanting
+ * to use the reservation system should update ask/used in xfs_ag_resv_init.
+ */
+
+/*
+ * Are we critically low on blocks? For now we'll define that as the number
+ * of blocks we can get our hands on being less than 10% of what we reserved
+ * or less than some arbitrary number (maximum btree height).
+ *
+ * For XFS_AG_RESV_METADATA the available count is pagf_freeblks minus the
+ * blocks still held by the AGFL reservation; for XFS_AG_RESV_AGFL it is
+ * pagf_freeblks plus pagf_flcount minus the blocks still held by the
+ * metadata reservation. In both cases the result is compared against the
+ * ar_asked count of the reservation being queried. Any other type trips an
+ * ASSERT and returns false without emitting the tracepoint.
+ *
+ * NOTE(review): avail is an xfs_extlen_t; if that type is unsigned and the
+ * other reservation ever exceeds the free block count, the subtraction
+ * would wrap and the AG would not be reported as critical -- confirm.
+ */
+bool
+xfs_ag_resv_critical(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ xfs_extlen_t avail; /* blocks this reservation type could still obtain */
+ xfs_extlen_t orig; /* size originally asked for by this reservation */
+
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved;
+ orig = pag->pag_meta_resv.ar_asked;
+ break;
+ case XFS_AG_RESV_AGFL:
+ avail = pag->pagf_freeblks + pag->pagf_flcount -
+ pag->pag_meta_resv.ar_reserved;
+ orig = pag->pag_agfl_resv.ar_asked;
+ break;
+ default:
+ ASSERT(0);
+ return false;
+ }
+
+ trace_xfs_ag_resv_critical(pag, type, avail);
+
+ /* Critically low if less than 10% or max btree height remains. */
+ return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS;
+}
+
+/*
+ * How many blocks are reserved but not used, and therefore must not be
+ * allocated away?
+ *
+ * The starting point is the sum of the ar_reserved counts of the metadata
+ * and AGFL reservations. When type is XFS_AG_RESV_METADATA or
+ * XFS_AG_RESV_AGFL, the ar_reserved count of the reservation returned by
+ * xfs_perag_resv() for that type is subtracted again (presumably leaving
+ * only the other reservation's blocks -- confirm against xfs_perag_resv).
+ * For XFS_AG_RESV_NONE the full sum is returned. Any other type trips an
+ * ASSERT and, because the default case does not return, also yields the
+ * full sum.
+ */
+xfs_extlen_t
+xfs_ag_resv_needed(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ xfs_extlen_t len; /* reserved blocks the caller must leave alone */
+
+ len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved;
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_AGFL:
+ len -= xfs_perag_resv(pag, type)->ar_reserved;
+ break;
+ case XFS_AG_RESV_NONE:
+ /* nothing to subtract: the caller owns neither reservation */
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ trace_xfs_ag_resv_needed(pag, type, len);
+
+ return len;
+}
+
+/*
+ * Clean out a single reservation.
+ *
+ * The reservation's ar_asked count is added back to the mount's
+ * m_ag_max_usable, a block count is handed to xfs_mod_fdblocks(), and
+ * ar_reserved and ar_asked are zeroed whether or not that call succeeded.
+ * ar_orig_reserved is left untouched. A failure from xfs_mod_fdblocks() is
+ * traced and returned to the caller; otherwise zero is returned.
+ */
+static int
+__xfs_ag_resv_free(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ struct xfs_ag_resv *resv;
+ xfs_extlen_t oldresv; /* block count passed to xfs_mod_fdblocks() */
+ int error;
+
+ trace_xfs_ag_resv_free(pag, type, 0);
+
+ resv = xfs_perag_resv(pag, type);
+ pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+ /*
+ * AGFL blocks are always considered "free", so whatever
+ * was reserved at mount time must be given back at umount.
+ */
+ if (type == XFS_AG_RESV_AGFL)
+ oldresv = resv->ar_orig_reserved;
+ else
+ oldresv = resv->ar_reserved;
+ error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+ resv->ar_reserved = 0;
+ resv->ar_asked = 0;
+
+ if (error)
+ trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Free a per-AG reservation.
+ *
+ * Both the AGFL and the metadata reservations are always torn down, even
+ * if releasing the first one fails. Returns the first error encountered,
+ * or zero if both calls succeeded.
+ */
+int
+xfs_ag_resv_free(
+ struct xfs_perag *pag)
+{
+ int error; /* result of freeing the AGFL reservation */
+ int err2; /* result of freeing the metadata reservation */
+
+ error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL);
+ err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
+ if (err2 && !error)
+ error = err2;
+ return error;
+}
+/*
+ * Set up one reservation of the given type.
+ *
+ * If used exceeds ask, ask is raised to used. The reservation then records
+ * ask in ar_asked and (ask - used) in both ar_reserved and
+ * ar_orig_reserved; ask is subtracted from the mount's m_ag_max_usable and
+ * ar_reserved is subtracted via xfs_mod_fdblocks(). An error from that
+ * call is traced and returned; the in-core fields updated above are not
+ * rolled back here.
+ *
+ * NOTE(review): ask/used are presumably "blocks wanted" and "blocks already
+ * consumed by the owning structure" -- inferred from the names, confirm.
+ */
+static int
+__xfs_ag_resv_init(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ xfs_extlen_t ask,
+ xfs_extlen_t used)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_ag_resv *resv;
+ int error;
+
+ resv = xfs_perag_resv(pag, type);
+ if (used > ask)
+ ask = used;
+ resv->ar_asked = ask;
+ resv->ar_reserved = resv->ar_orig_reserved = ask - used;
+ mp->m_ag_max_usable -= ask;
+
+ trace_xfs_ag_resv_init(pag, type, ask);
+
+ error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true);
+ if (error)
+ trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
+ error, _RET_IP_);
+
+ return error;
+}
+
+/*
+ * Create a per-AG block reservation.
+ *
+ * A reservation is only initialised when its ar_asked count is still zero.
+ * Both ask and used are currently passed as zero for the metadata and the
+ * AGFL reservation. Returns zero, or the first error returned by
+ * __xfs_ag_resv_init(); if setting up the metadata reservation fails, the
+ * AGFL reservation is not attempted.
+ */
+int
+xfs_ag_resv_init(
+ struct xfs_perag *pag)
+{
+ xfs_extlen_t ask;
+ xfs_extlen_t used;
+ int error = 0;
+
+ /* Create the metadata reservation. */
+ if (pag->pag_meta_resv.ar_asked == 0) {
+ ask = used = 0;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+ ask, used);
+ if (error)
+ goto out;
+ }
+
+ /* Create the AGFL metadata reservation */
+ if (pag->pag_agfl_resv.ar_asked == 0) {
+ ask = used = 0;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used);
+ if (error)
+ goto out;
+ }
+
+out:
+ return error;
+}
+
+/*
+ * Allocate a block from the reservation.
+ *
+ * Updates the reservation and the transaction's superblock deltas for the
+ * args->len blocks described by args:
+ *
+ * - XFS_AG_RESV_NONE (or an unknown type, after tripping an ASSERT): the
+ *   whole length is charged to XFS_TRANS_SB_RES_FDBLOCKS when args->wasdel
+ *   is set, otherwise to XFS_TRANS_SB_FDBLOCKS.
+ * - XFS_AG_RESV_AGFL: ar_reserved drops by min(args->len, ar_reserved) and
+ *   no superblock delta is recorded.
+ * - XFS_AG_RESV_METADATA: ar_reserved drops by the same minimum; that part
+ *   is charged to XFS_TRANS_SB_RES_FDBLOCKS and any remainder of args->len
+ *   to XFS_TRANS_SB_FDBLOCKS.
+ */
+void
+xfs_ag_resv_alloc_extent(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ struct xfs_alloc_arg *args)
+{
+ struct xfs_ag_resv *resv;
+ xfs_extlen_t len; /* portion of args->len covered by the reservation */
+ uint field;
+
+ trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
+
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_AGFL:
+ resv = xfs_perag_resv(pag, type);
+ break;
+ default:
+ ASSERT(0);
+ /* fall through */
+ case XFS_AG_RESV_NONE:
+ field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
+ XFS_TRANS_SB_FDBLOCKS;
+ xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
+ return;
+ }
+
+ len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
+ resv->ar_reserved -= len;
+ if (type == XFS_AG_RESV_AGFL)
+ return;
+ /* Allocations of reserved blocks only need on-disk sb updates... */
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
+ /* ...but non-reserved blocks need in-core and on-disk updates. */
+ if (args->len > len)
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
+ -((int64_t)args->len - len));
+}
+
+/*
+ * Free a block to the reservation.
+ *
+ * Updates the reservation and the transaction's superblock deltas for len
+ * freed blocks:
+ *
+ * - XFS_AG_RESV_NONE (or an unknown type, after tripping an ASSERT): the
+ *   whole length is credited to XFS_TRANS_SB_FDBLOCKS.
+ * - XFS_AG_RESV_AGFL: ar_reserved grows by min(len, ar_asked - ar_reserved)
+ *   and no superblock delta is recorded.
+ * - XFS_AG_RESV_METADATA: ar_reserved grows by the same minimum; len is
+ *   credited to XFS_TRANS_SB_RES_FDBLOCKS and the part of len that did not
+ *   fit into the reservation is credited to XFS_TRANS_SB_FDBLOCKS.
+ *
+ * NOTE(review): the RES_FDBLOCKS credit here uses the full len, whereas
+ * xfs_ag_resv_alloc_extent() charges only the portion taken from the
+ * reservation -- confirm that this asymmetry is intentional.
+ */
+void
+xfs_ag_resv_free_extent(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ struct xfs_trans *tp,
+ xfs_extlen_t len)
+{
+ xfs_extlen_t leftover; /* blocks absorbed back into the reservation */
+ struct xfs_ag_resv *resv;
+
+ trace_xfs_ag_resv_free_extent(pag, type, len);
+
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_AGFL:
+ resv = xfs_perag_resv(pag, type);
+ break;
+ default:
+ ASSERT(0);
+ /* fall through */
+ case XFS_AG_RESV_NONE:
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
+ return;
+ }
+
+ leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
+ resv->ar_reserved += leftover;
+ if (type == XFS_AG_RESV_AGFL)
+ return;
+ /* Freeing into the reserved pool only requires on-disk update... */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
+ /* ...but freeing beyond that requires in-core and on-disk update. */
+ if (len > leftover)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
+}
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
new file mode 100644
index 000000000000..8d6c687deef3
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_AG_RESV_H__
+#define __XFS_AG_RESV_H__
+
+int xfs_ag_resv_free(struct xfs_perag *pag);
+int xfs_ag_resv_init(struct xfs_perag *pag);
+
+bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag,
+ enum xfs_ag_resv_type type);
+
+void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+ struct xfs_alloc_arg *args);
+void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+ struct xfs_trans *tp, xfs_extlen_t len);
+
+#endif /* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 05b5243d89f6..ca75dc90ebe0 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -37,6 +37,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_ag_resv.h"
struct workqueue_struct *xfs_alloc_wq;
@@ -74,14 +75,8 @@ xfs_prealloc_blocks(
* extents need to be actually allocated. To get around this, we explicitly set
* aside a few blocks which will not be reserved in delayed allocation.
*
- * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist
- * and 4 more to handle a potential split of the file's bmap btree.
- *
- * When rmap is enabled, we must also be able to handle two rmap btree inserts
- * to record both the file data extent and a new bmbt block. The bmbt block
- * might not be in the same AG as the file data extent. In the worst case
- * the bmap btree splits multiple levels and all the new blocks come from
- * different AGs, so set aside enough to handle rmap btree splits in all AGs.
+ * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
+ * potential split of the file's bmap btree.
*/
unsigned int
xfs_alloc_set_aside(
@@ -90,8 +85,6 @@ xfs_alloc_set_aside(
unsigned int blocks;
blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
- blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels;
return blocks;
}
@@ -265,7 +258,7 @@ xfs_alloc_compute_diff(
xfs_agblock_t wantbno, /* target starting block */
xfs_extlen_t wantlen, /* target length */
xfs_extlen_t alignment, /* target alignment */
- char userdata, /* are we allocating data? */
+ int datatype, /* are we allocating data? */
xfs_agblock_t freebno, /* freespace's starting block */
xfs_extlen_t freelen, /* freespace's length */
xfs_agblock_t *newbnop) /* result: best start block from free */
@@ -276,6 +269,7 @@ xfs_alloc_compute_diff(
xfs_extlen_t newlen1=0; /* length with newbno1 */
xfs_extlen_t newlen2=0; /* length with newbno2 */
xfs_agblock_t wantend; /* end of target extent */
+ bool userdata = xfs_alloc_is_userdata(datatype);
ASSERT(freelen >= wantlen);
freeend = freebno + freelen;
@@ -680,12 +674,29 @@ xfs_alloc_ag_vextent(
xfs_alloc_arg_t *args) /* argument structure for allocation */
{
int error=0;
+ xfs_extlen_t reservation;
+ xfs_extlen_t oldmax;
ASSERT(args->minlen > 0);
ASSERT(args->maxlen > 0);
ASSERT(args->minlen <= args->maxlen);
ASSERT(args->mod < args->prod);
ASSERT(args->alignment > 0);
+
+ /*
+ * Clamp maxlen to the amount of free space minus any reservations
+ * that have been made.
+ */
+ oldmax = args->maxlen;
+ reservation = xfs_ag_resv_needed(args->pag, args->resv);
+ if (args->maxlen > args->pag->pagf_freeblks - reservation)
+ args->maxlen = args->pag->pagf_freeblks - reservation;
+ if (args->maxlen == 0) {
+ args->agbno = NULLAGBLOCK;
+ args->maxlen = oldmax;
+ return 0;
+ }
+
/*
* Branch to correct routine based on the type.
*/
@@ -705,12 +716,14 @@ xfs_alloc_ag_vextent(
/* NOTREACHED */
}
+ args->maxlen = oldmax;
+
if (error || args->agbno == NULLAGBLOCK)
return error;
ASSERT(args->len >= args->minlen);
ASSERT(args->len <= args->maxlen);
- ASSERT(!args->wasfromfl || !args->isfl);
+ ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
ASSERT(args->agbno % args->alignment == 0);
/* if not file data, insert new block into the reverse map btree */
@@ -732,12 +745,7 @@ xfs_alloc_ag_vextent(
args->agbno, args->len));
}
- if (!args->isfl) {
- xfs_trans_mod_sb(args->tp, args->wasdel ?
- XFS_TRANS_SB_RES_FDBLOCKS :
- XFS_TRANS_SB_FDBLOCKS,
- -((long)(args->len)));
- }
+ xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
XFS_STATS_INC(args->mp, xs_allocx);
XFS_STATS_ADD(args->mp, xs_allocb, args->len);
@@ -917,7 +925,7 @@ xfs_alloc_find_best_extent(
sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
args->alignment,
- args->userdata, *sbnoa,
+ args->datatype, *sbnoa,
*slena, &new);
/*
@@ -1101,7 +1109,7 @@ restart:
if (args->len < blen)
continue;
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->userdata, ltbnoa,
+ args->alignment, args->datatype, ltbnoa,
ltlena, &ltnew);
if (ltnew != NULLAGBLOCK &&
(args->len > blen || ltdiff < bdiff)) {
@@ -1254,7 +1262,7 @@ restart:
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->userdata, ltbnoa,
+ args->alignment, args->datatype, ltbnoa,
ltlena, &ltnew);
error = xfs_alloc_find_best_extent(args,
@@ -1271,7 +1279,7 @@ restart:
args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
xfs_alloc_fix_len(args);
gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->userdata, gtbnoa,
+ args->alignment, args->datatype, gtbnoa,
gtlena, &gtnew);
error = xfs_alloc_find_best_extent(args,
@@ -1331,7 +1339,7 @@ restart:
}
rlen = args->len;
(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
- args->userdata, ltbnoa, ltlena, &ltnew);
+ args->datatype, ltbnoa, ltlena, &ltnew);
ASSERT(ltnew >= ltbno);
ASSERT(ltnew + rlen <= ltbnoa + ltlena);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
@@ -1583,6 +1591,7 @@ xfs_alloc_ag_vextent_small(
int *stat) /* status: 0-freelist, 1-normal/none */
{
struct xfs_owner_info oinfo;
+ struct xfs_perag *pag;
int error;
xfs_agblock_t fbno;
xfs_extlen_t flen;
@@ -1600,7 +1609,8 @@ xfs_alloc_ag_vextent_small(
* to respect minleft even when pulling from the
* freelist.
*/
- else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
+ else if (args->minlen == 1 && args->alignment == 1 &&
+ args->resv != XFS_AG_RESV_AGFL &&
(be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
> args->minleft)) {
error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
@@ -1608,9 +1618,9 @@ xfs_alloc_ag_vextent_small(
goto error0;
if (fbno != NULLAGBLOCK) {
xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
- args->userdata);
+ xfs_alloc_allow_busy_reuse(args->datatype));
- if (args->userdata) {
+ if (xfs_alloc_is_userdata(args->datatype)) {
xfs_buf_t *bp;
bp = xfs_btree_get_bufs(args->mp, args->tp,
@@ -1629,13 +1639,18 @@ xfs_alloc_ag_vextent_small(
/*
* If we're feeding an AGFL block to something that
* doesn't live in the free space, we need to clear
- * out the OWN_AG rmap.
+ * out the OWN_AG rmap and add the block back to
+ * the AGFL per-AG reservation.
*/
xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
error = xfs_rmap_free(args->tp, args->agbp, args->agno,
fbno, 1, &oinfo);
if (error)
goto error0;
+ pag = xfs_perag_get(args->mp, args->agno);
+ xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL,
+ args->tp, 1);
+ xfs_perag_put(pag);
*stat = 0;
return 0;
@@ -1683,7 +1698,7 @@ xfs_free_ag_extent(
xfs_agblock_t bno,
xfs_extlen_t len,
struct xfs_owner_info *oinfo,
- int isfl)
+ enum xfs_ag_resv_type type)
{
xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
@@ -1911,21 +1926,22 @@ xfs_free_ag_extent(
*/
pag = xfs_perag_get(mp, agno);
error = xfs_alloc_update_counters(tp, pag, agbp, len);
+ xfs_ag_resv_free_extent(pag, type, tp, len);
xfs_perag_put(pag);
if (error)
goto error0;
- if (!isfl)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
XFS_STATS_INC(mp, xs_freex);
XFS_STATS_ADD(mp, xs_freeb, len);
- trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
+ trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
+ haveleft, haveright);
return 0;
error0:
- trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
+ trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
+ -1, -1);
if (bno_cur)
xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
if (cnt_cur)
@@ -1950,21 +1966,43 @@ xfs_alloc_compute_maxlevels(
}
/*
- * Find the length of the longest extent in an AG.
+ * Find the length of the longest extent in an AG. The 'need' parameter
+ * specifies how much space we're going to need for the AGFL and the
+ * 'reserved' parameter tells us how many blocks in this AG are reserved for
+ * other callers.
*/
xfs_extlen_t
xfs_alloc_longest_free_extent(
struct xfs_mount *mp,
struct xfs_perag *pag,
- xfs_extlen_t need)
+ xfs_extlen_t need,
+ xfs_extlen_t reserved)
{
xfs_extlen_t delta = 0;
+ /*
+ * If the AGFL needs a recharge, we'll have to subtract that from the
+ * longest extent.
+ */
if (need > pag->pagf_flcount)
delta = need - pag->pagf_flcount;
+ /*
+ * If we cannot maintain others' reservations with space from the
+ * not-longest freesp extents, we'll have to subtract /that/ from
+ * the longest extent too.
+ */
+ if (pag->pagf_freeblks - pag->pagf_longest < reserved)
+ delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
+
+ /*
+ * If the longest extent is long enough to satisfy all the
+ * reservations and AGFL rules in place, we can return this extent.
+ */
if (pag->pagf_longest > delta)
return pag->pagf_longest - delta;
+
+ /* Otherwise, let the caller try for 1 block if there's space. */
return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
}
@@ -2004,20 +2042,24 @@ xfs_alloc_space_available(
{
struct xfs_perag *pag = args->pag;
xfs_extlen_t longest;
+ xfs_extlen_t reservation; /* blocks that are still reserved */
int available;
if (flags & XFS_ALLOC_FLAG_FREEING)
return true;
+ reservation = xfs_ag_resv_needed(pag, args->resv);
+
/* do we have enough contiguous free space for the allocation? */
- longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free);
+ longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
+ reservation);
if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
return false;
- /* do have enough free space remaining for the allocation? */
+ /* do we have enough free space remaining for the allocation? */
available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
- min_free - args->total);
- if (available < (int)args->minleft)
+ reservation - min_free - args->total);
+ if (available < (int)args->minleft || available <= 0)
return false;
return true;
@@ -2058,7 +2100,7 @@ xfs_alloc_fix_freelist(
* somewhere else if we are not being asked to try harder at this
* point
*/
- if (pag->pagf_metadata && args->userdata &&
+ if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) &&
(flags & XFS_ALLOC_FLAG_TRYLOCK)) {
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
goto out_agbp_relse;
@@ -2124,7 +2166,7 @@ xfs_alloc_fix_freelist(
if (error)
goto out_agbp_relse;
error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
- &targs.oinfo, 1);
+ &targs.oinfo, XFS_AG_RESV_AGFL);
if (error)
goto out_agbp_relse;
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
@@ -2135,7 +2177,7 @@ xfs_alloc_fix_freelist(
targs.mp = mp;
targs.agbp = agbp;
targs.agno = args->agno;
- targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
+ targs.alignment = targs.minlen = targs.prod = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
@@ -2146,6 +2188,7 @@ xfs_alloc_fix_freelist(
while (pag->pagf_flcount < need) {
targs.agbno = 0;
targs.maxlen = need - pag->pagf_flcount;
+ targs.resv = XFS_AG_RESV_AGFL;
/* Allocate as many blocks as possible at once. */
error = xfs_alloc_ag_vextent(&targs);
@@ -2633,7 +2676,7 @@ xfs_alloc_vextent(
* Try near allocation first, then anywhere-in-ag after
* the first a.g. fails.
*/
- if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) &&
+ if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
(mp->m_flags & XFS_MOUNT_32BITINODES)) {
args->fsbno = XFS_AGB_TO_FSB(mp,
((mp->m_agfrotor / rotorstep) %
@@ -2766,7 +2809,7 @@ xfs_alloc_vextent(
#endif
/* Zero the extent if we were asked to do so */
- if (args->userdata & XFS_ALLOC_USERDATA_ZERO) {
+ if (args->datatype & XFS_ALLOC_USERDATA_ZERO) {
error = xfs_zero_extent(args->ip, args->fsbno, args->len);
if (error)
goto error0;
@@ -2825,7 +2868,8 @@ xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
- struct xfs_owner_info *oinfo) /* extent owner */
+ struct xfs_owner_info *oinfo, /* extent owner */
+ enum xfs_ag_resv_type type) /* block reservation type */
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *agbp;
@@ -2834,6 +2878,7 @@ xfs_free_extent(
int error;
ASSERT(len != 0);
+ ASSERT(type != XFS_AG_RESV_AGFL);
if (XFS_TEST_ERROR(false, mp,
XFS_ERRTAG_FREE_EXTENT,
@@ -2851,7 +2896,7 @@ xfs_free_extent(
agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
err);
- error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0);
+ error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
if (error)
goto err;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 6fe2d6b7cfe9..7c404a6b0ae3 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -85,20 +85,33 @@ typedef struct xfs_alloc_arg {
xfs_extlen_t len; /* output: actual size of extent */
xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
xfs_alloctype_t otype; /* original allocation type */
+ int datatype; /* mask defining data type treatment */
char wasdel; /* set if allocation was prev delayed */
char wasfromfl; /* set if allocation is from freelist */
- char isfl; /* set if is freelist blocks - !acctg */
- char userdata; /* mask defining userdata treatment */
xfs_fsblock_t firstblock; /* io first block allocated */
struct xfs_owner_info oinfo; /* owner of blocks being allocated */
+ enum xfs_ag_resv_type resv; /* block reservation to use */
} xfs_alloc_arg_t;
/*
- * Defines for userdata
+ * Defines for datatype
*/
#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
+#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */
+/* Returns true if datatype has any bit set other than XFS_ALLOC_NOBUSY. */
+static inline bool
+xfs_alloc_is_userdata(int datatype)
+{
+ return (datatype & ~XFS_ALLOC_NOBUSY) != 0;
+}
+/* Returns true when XFS_ALLOC_NOBUSY is clear in the datatype mask. */
+static inline bool
+xfs_alloc_allow_busy_reuse(int datatype)
+{
+ return (datatype & XFS_ALLOC_NOBUSY) == 0;
+}
/* freespace limit calculations */
#define XFS_ALLOC_AGFL_RESERVE 4
@@ -106,7 +119,8 @@ unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
- struct xfs_perag *pag, xfs_extlen_t need);
+ struct xfs_perag *pag, xfs_extlen_t need,
+ xfs_extlen_t reserved);
unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
struct xfs_perag *pag);
@@ -184,7 +198,8 @@ xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
- struct xfs_owner_info *oinfo);/* extent owner */
+ struct xfs_owner_info *oinfo, /* extent owner */
+ enum xfs_ag_resv_type type); /* block reservation type */
int /* error */
xfs_alloc_lookup_ge(
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b060bca93402..9d7f61d36645 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -47,6 +47,7 @@
#include "xfs_attr_leaf.h"
#include "xfs_filestream.h"
#include "xfs_rmap.h"
+#include "xfs_ag_resv.h"
kmem_zone_t *xfs_bmap_free_item_zone;
@@ -1388,7 +1389,7 @@ xfs_bmap_search_multi_extents(
* Else, *lastxp will be set to the index of the found
* entry; *gotp will contain the entry.
*/
-STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+xfs_bmbt_rec_host_t * /* pointer to found extent entry */
xfs_bmap_search_extents(
xfs_inode_t *ip, /* incore inode pointer */
xfs_fileoff_t bno, /* block number searched for */
@@ -3347,7 +3348,8 @@ xfs_bmap_adjacent(
mp = ap->ip->i_mount;
nullfb = *ap->firstblock == NULLFSBLOCK;
- rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
+ rt = XFS_IS_REALTIME_INODE(ap->ip) &&
+ xfs_alloc_is_userdata(ap->datatype);
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
/*
* If allocating at eof, and there's a previous real block,
@@ -3501,7 +3503,8 @@ xfs_bmap_longest_free_extent(
}
longest = xfs_alloc_longest_free_extent(mp, pag,
- xfs_alloc_min_freelist(mp, pag));
+ xfs_alloc_min_freelist(mp, pag),
+ xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
if (*blen < longest)
*blen = longest;
@@ -3622,7 +3625,7 @@ xfs_bmap_btalloc(
{
xfs_mount_t *mp; /* mount point structure */
xfs_alloctype_t atype = 0; /* type for allocation routines */
- xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_extlen_t align = 0; /* minimum allocation alignment */
xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
xfs_agnumber_t ag;
xfs_alloc_arg_t args;
@@ -3645,7 +3648,8 @@ xfs_bmap_btalloc(
else if (mp->m_dalign)
stripe_align = mp->m_dalign;
- align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
+ if (xfs_alloc_is_userdata(ap->datatype))
+ align = xfs_get_extsz_hint(ap->ip);
if (unlikely(align)) {
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 0, ap->eof, 0, ap->conv,
@@ -3658,7 +3662,8 @@ xfs_bmap_btalloc(
nullfb = *ap->firstblock == NULLFSBLOCK;
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
if (nullfb) {
- if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
+ if (xfs_alloc_is_userdata(ap->datatype) &&
+ xfs_inode_is_filestream(ap->ip)) {
ag = xfs_filestream_lookup_ag(ap->ip);
ag = (ag != NULLAGNUMBER) ? ag : 0;
ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
@@ -3698,7 +3703,8 @@ xfs_bmap_btalloc(
* enough for the request. If one isn't found, then adjust
* the minimum allocation size to the largest space found.
*/
- if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+ if (xfs_alloc_is_userdata(ap->datatype) &&
+ xfs_inode_is_filestream(ap->ip))
error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
else
error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
@@ -3781,9 +3787,9 @@ xfs_bmap_btalloc(
}
args.minleft = ap->minleft;
args.wasdel = ap->wasdel;
- args.isfl = 0;
- args.userdata = ap->userdata;
- if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
+ args.resv = XFS_AG_RESV_NONE;
+ args.datatype = ap->datatype;
+ if (ap->datatype & XFS_ALLOC_USERDATA_ZERO)
args.ip = ap->ip;
error = xfs_alloc_vextent(&args);
@@ -3877,7 +3883,8 @@ STATIC int
xfs_bmap_alloc(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
- if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
+ if (XFS_IS_REALTIME_INODE(ap->ip) &&
+ xfs_alloc_is_userdata(ap->datatype))
return xfs_bmap_rtalloc(ap);
return xfs_bmap_btalloc(ap);
}
@@ -4074,7 +4081,7 @@ xfs_bmapi_read(
return 0;
}
-STATIC int
+int
xfs_bmapi_reserve_delalloc(
struct xfs_inode *ip,
xfs_fileoff_t aoff,
@@ -4170,91 +4177,6 @@ out_unreserve_quota:
return error;
}
-/*
- * Map file blocks to filesystem blocks, adding delayed allocations as needed.
- */
-int
-xfs_bmapi_delay(
- struct xfs_inode *ip, /* incore inode */
- xfs_fileoff_t bno, /* starting file offs. mapped */
- xfs_filblks_t len, /* length to map in file */
- struct xfs_bmbt_irec *mval, /* output: map values */
- int *nmap, /* i/o: mval size/count */
- int flags) /* XFS_BMAPI_... */
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- struct xfs_bmbt_irec got; /* current file extent record */
- struct xfs_bmbt_irec prev; /* previous file extent record */
- xfs_fileoff_t obno; /* old block number (offset) */
- xfs_fileoff_t end; /* end of mapped file region */
- xfs_extnum_t lastx; /* last useful extent number */
- int eof; /* we've hit the end of extents */
- int n = 0; /* current extent index */
- int error = 0;
-
- ASSERT(*nmap >= 1);
- ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
- ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
- }
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- XFS_STATS_INC(mp, xs_blk_mapw);
-
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
- if (error)
- return error;
- }
-
- xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
- end = bno + len;
- obno = bno;
-
- while (bno < end && n < *nmap) {
- if (eof || got.br_startoff > bno) {
- error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
- &prev, &lastx, eof);
- if (error) {
- if (n == 0) {
- *nmap = 0;
- return error;
- }
- break;
- }
- }
-
- /* set up the extent map to return. */
- xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
- xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
-
- /* If we're done, stop now. */
- if (bno >= end || n >= *nmap)
- break;
-
- /* Else go on to the next record. */
- prev = got;
- if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
- else
- eof = 1;
- }
-
- *nmap = n;
- return 0;
-}
-
-
static int
xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
@@ -4287,15 +4209,21 @@ xfs_bmapi_allocate(
}
/*
- * Indicate if this is the first user data in the file, or just any
- * user data. And if it is userdata, indicate whether it needs to
- * be initialised to zero during allocation.
+ * Set the data type being allocated. For the data fork, the first data
+ * in the file is treated differently to all other allocations. For the
+ * attribute fork, we only need to ensure the allocated range is not on
+ * the busy list.
*/
if (!(bma->flags & XFS_BMAPI_METADATA)) {
- bma->userdata = (bma->offset == 0) ?
- XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+ bma->datatype = XFS_ALLOC_NOBUSY;
+ if (whichfork == XFS_DATA_FORK) {
+ if (bma->offset == 0)
+ bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
+ else
+ bma->datatype |= XFS_ALLOC_USERDATA;
+ }
if (bma->flags & XFS_BMAPI_ZERO)
- bma->userdata |= XFS_ALLOC_USERDATA_ZERO;
+ bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
}
bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
@@ -4565,7 +4493,7 @@ xfs_bmapi_write(
bma.tp = tp;
bma.ip = ip;
bma.total = total;
- bma.userdata = 0;
+ bma.datatype = 0;
bma.dfops = dfops;
bma.firstblock = firstblock;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 254034f96941..8395f6e8cf7d 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -54,7 +54,7 @@ struct xfs_bmalloca {
bool wasdel; /* replacing a delayed allocation */
bool aeof; /* allocated space at eof */
bool conv; /* overwriting unwritten extents */
- char userdata;/* userdata mask */
+ int datatype;/* data type being allocated */
int flags;
};
@@ -181,9 +181,6 @@ int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
xfs_filblks_t len, struct xfs_bmbt_irec *mval,
int *nmap, int flags);
-int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
- xfs_filblks_t len, struct xfs_bmbt_irec *mval,
- int *nmap, int flags);
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, int flags,
xfs_fsblock_t *firstblock, xfs_extlen_t total,
@@ -202,5 +199,12 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_defer_ops *dfops, enum shift_direction direction,
int num_exts);
int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
+struct xfs_bmbt_rec_host *
+ xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno,
+ int fork, int *eofp, xfs_extnum_t *lastxp,
+ struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp);
+int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, xfs_fileoff_t aoff,
+ xfs_filblks_t len, struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof);
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 08569792fe20..aa1752f918b8 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2070,7 +2070,7 @@ __xfs_btree_updkeys(
struct xfs_buf *bp0,
bool force_all)
{
- union xfs_btree_bigkey key; /* keys from current level */
+ union xfs_btree_key key; /* keys from current level */
union xfs_btree_key *lkey; /* keys from the next level up */
union xfs_btree_key *hkey;
union xfs_btree_key *nlkey; /* keys from the next level up */
@@ -2086,7 +2086,7 @@ __xfs_btree_updkeys(
trace_xfs_btree_updkeys(cur, level, bp0);
- lkey = (union xfs_btree_key *)&key;
+ lkey = &key;
hkey = xfs_btree_high_key_from_key(cur, lkey);
xfs_btree_get_keys(cur, block, lkey);
for (level++; level < cur->bc_nlevels; level++) {
@@ -3226,7 +3226,7 @@ xfs_btree_insrec(
struct xfs_buf *bp; /* buffer for block */
union xfs_btree_ptr nptr; /* new block ptr */
struct xfs_btree_cur *ncur; /* new btree cursor */
- union xfs_btree_bigkey nkey; /* new block key */
+ union xfs_btree_key nkey; /* new block key */
union xfs_btree_key *lkey;
int optr; /* old key/record index */
int ptr; /* key/record index */
@@ -3241,7 +3241,7 @@ xfs_btree_insrec(
XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);
ncur = NULL;
- lkey = (union xfs_btree_key *)&nkey;
+ lkey = &nkey;
/*
* If we have an external root pointer, and we've made it to the
@@ -3444,14 +3444,14 @@ xfs_btree_insert(
union xfs_btree_ptr nptr; /* new block number (split result) */
struct xfs_btree_cur *ncur; /* new cursor (split result) */
struct xfs_btree_cur *pcur; /* previous level's cursor */
- union xfs_btree_bigkey bkey; /* key of block to insert */
+ union xfs_btree_key bkey; /* key of block to insert */
union xfs_btree_key *key;
union xfs_btree_rec rec; /* record to insert */
level = 0;
ncur = NULL;
pcur = cur;
- key = (union xfs_btree_key *)&bkey;
+ key = &bkey;
xfs_btree_set_ptr_null(cur, &nptr);
@@ -4797,3 +4797,50 @@ xfs_btree_query_range(
return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
fn, priv);
}
+
+/*
+ * Calculate the number of blocks needed to store a given number of records
+ * in a short-format (per-AG metadata) btree.
+ */
+xfs_extlen_t
+xfs_btree_calc_size(
+ struct xfs_mount *mp,
+ uint *limits,
+ unsigned long long len)
+{
+ int level;
+ int maxrecs;
+ xfs_extlen_t rval;
+
+ maxrecs = limits[0];
+ for (level = 0, rval = 0; len > 1; level++) {
+ len += maxrecs - 1;
+ do_div(len, maxrecs);
+ maxrecs = limits[1];
+ rval += len;
+ }
+ return rval;
+}
+
+int
+xfs_btree_count_blocks_helper(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *data)
+{
+ xfs_extlen_t *blocks = data;
+ (*blocks)++;
+
+ return 0;
+}
+
+/* Count the blocks in a btree and return the result in *blocks. */
+int
+xfs_btree_count_blocks(
+ struct xfs_btree_cur *cur,
+ xfs_extlen_t *blocks)
+{
+ *blocks = 0;
+ return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
+ blocks);
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 04d0865e5e6d..3f8556a5c2ad 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -37,30 +37,18 @@ union xfs_btree_ptr {
__be64 l; /* long form ptr */
};
-union xfs_btree_key {
- struct xfs_bmbt_key bmbt;
- xfs_bmdr_key_t bmbr; /* bmbt root block */
- xfs_alloc_key_t alloc;
- struct xfs_inobt_key inobt;
- struct xfs_rmap_key rmap;
-};
-
/*
- * In-core key that holds both low and high keys for overlapped btrees.
- * The two keys are packed next to each other on disk, so do the same
- * in memory. Preserve the existing xfs_btree_key as a single key to
- * avoid the mental model breakage that would happen if we passed a
- * bigkey into a function that operates on a single key.
+ * The in-core btree key. Overlapping btrees actually store two keys
+ * per pointer, so we reserve enough memory to hold both. The __*bigkey
+ * items should never be accessed directly.
*/
-union xfs_btree_bigkey {
+union xfs_btree_key {
struct xfs_bmbt_key bmbt;
xfs_bmdr_key_t bmbr; /* bmbt root block */
xfs_alloc_key_t alloc;
struct xfs_inobt_key inobt;
- struct {
- struct xfs_rmap_key rmap;
- struct xfs_rmap_key rmap_hi;
- };
+ struct xfs_rmap_key rmap;
+ struct xfs_rmap_key __rmap_bigkey[2];
};
union xfs_btree_rec {
@@ -513,6 +501,8 @@ bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
unsigned long len);
+xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits,
+ unsigned long long len);
/* return codes */
#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */
@@ -529,4 +519,6 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
xfs_btree_visit_blocks_fn fn, void *data);
+int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index c221d0ecd52e..613c5cf19436 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -81,6 +81,10 @@
* - For each work item attached to the log intent item,
* * Perform the described action.
* * Attach the work item to the log done item.
+ * * If the result of doing the work was -EAGAIN, ->finish_item
+ * wants a new transaction. See the "Requesting a Fresh
+ * Transaction while Finishing Deferred Work" section below for
+ * details.
*
* The key here is that we must log an intent item for all pending
* work items every time we roll the transaction, and that we must log
@@ -88,6 +92,34 @@
* we can perform complex remapping operations, chaining intent items
* as needed.
*
+ * Requesting a Fresh Transaction while Finishing Deferred Work
+ *
+ * If ->finish_item decides that it needs a fresh transaction to
+ * finish the work, it must ask its caller (xfs_defer_finish) for a
+ * continuation. The most likely cause of this circumstance is the
+ * refcount adjust functions deciding that they've logged enough items
+ * to be at risk of exceeding the transaction reservation.
+ *
+ * To get a fresh transaction, we want to log the existing log done
+ * item to prevent the log intent item from replaying, immediately log
+ * a new log intent item with the unfinished work items, roll the
+ * transaction, and re-call ->finish_item wherever it left off. The
+ * log done item and the new log intent item must be in the same
+ * transaction or atomicity cannot be guaranteed; defer_finish ensures
+ * that this happens.
+ *
+ * This requires some coordination between ->finish_item and
+ * defer_finish. Upon deciding to request a new transaction,
+ * ->finish_item should update the current work item to reflect the
+ * unfinished work. Next, it should reset the log done item's list
+ * count to the number of items finished, and return -EAGAIN.
+ * defer_finish sees the -EAGAIN, logs the new log intent item
+ * with the remaining work items, and leaves the xfs_defer_pending
+ * item at the head of the dop_work queue. Then it rolls the
+ * transaction and picks up processing where it left off. Note that
+ * ->finish_item must be careful to leave enough transaction
+ * reservation to fit the new log intent item.
+ *
* This is an example of remapping the extent (E, E+B) into file X at
* offset A and dealing with the extent (C, C+B) already being mapped
* there:
@@ -104,21 +136,26 @@
* | Intent to add rmap (X, E, A, B) |
* +-------------------------------------------------+
* | Reduce refcount for extent (C, B) | t2
- * | Done reducing refcount for extent (C, B) |
+ * | Done reducing refcount for extent (C, 9) |
+ * | Intent to reduce refcount for extent (C+9, B-9) |
+ * | (ran out of space after 9 refcount updates) |
+ * +-------------------------------------------------+
+ * | Reduce refcount for extent (C+9, B-9) | t3
+ * | Done reducing refcount for extent (C+9, B-9) |
* | Increase refcount for extent (E, B) |
* | Done increasing refcount for extent (E, B) |
* | Intent to free extent (C, B) |
* | Intent to free extent (F, 1) (refcountbt block) |
* | Intent to remove rmap (F, 1, REFC) |
* +-------------------------------------------------+
- * | Remove rmap (X, C, A, B) | t3
+ * | Remove rmap (X, C, A, B) | t4
* | Done removing rmap (X, C, A, B) |
* | Add rmap (X, E, A, B) |
* | Done adding rmap (X, E, A, B) |
* | Remove rmap (F, 1, REFC) |
* | Done removing rmap (F, 1, REFC) |
* +-------------------------------------------------+
- * | Free extent (C, B) | t4
+ * | Free extent (C, B) | t5
* | Done freeing extent (C, B) |
* | Free extent (D, 1) |
* | Done freeing extent (D, 1) |
@@ -141,6 +178,9 @@
* - Intent to free extent (C, B)
* - Intent to free extent (F, 1) (refcountbt block)
* - Intent to remove rmap (F, 1, REFC)
+ *
+ * Note that the continuation requested between t2 and t3 is likely to
+ * reoccur.
*/
static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
@@ -323,7 +363,16 @@ xfs_defer_finish(
dfp->dfp_count--;
error = dfp->dfp_type->finish_item(*tp, dop, li,
dfp->dfp_done, &state);
- if (error) {
+ if (error == -EAGAIN) {
+ /*
+ * ->finish_item wants a fresh
+ * transaction; put the work item back
+ * on the list and jump out.
+ */
+ list_add(li, &dfp->dfp_work);
+ dfp->dfp_count++;
+ break;
+ } else if (error) {
/*
* Clean up after ourselves and jump out.
* xfs_defer_cancel will take care of freeing
@@ -335,9 +384,25 @@ xfs_defer_finish(
goto out;
}
}
- /* Done with the dfp, free it. */
- list_del(&dfp->dfp_list);
- kmem_free(dfp);
+ if (error == -EAGAIN) {
+ /*
+ * ->finish_item wants a fresh transaction,
+ * so log a new log intent item to replace
+ * the old one and roll the transaction. See
+ * "Requesting a Fresh Transaction while
+ * Finishing Deferred Work" above.
+ */
+ dfp->dfp_intent = dfp->dfp_type->create_intent(*tp,
+ dfp->dfp_count);
+ dfp->dfp_done = NULL;
+ list_for_each(li, &dfp->dfp_work)
+ dfp->dfp_type->log_item(*tp, dfp->dfp_intent,
+ li);
+ } else {
+ /* Done with the dfp, free it. */
+ list_del(&dfp->dfp_list);
+ kmem_free(dfp);
+ }
if (cleanup_fn)
cleanup_fn(*tp, state, error);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 31ca2208c03d..eab68ae2e011 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -132,7 +132,7 @@ xfs_inobt_free_block(
xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
return xfs_free_extent(cur->bc_tp,
XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
- &oinfo);
+ &oinfo, XFS_AG_RESV_NONE);
}
STATIC int
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index a6eed43fa7cd..fc5eef85d61e 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -647,9 +647,17 @@ struct xfs_rui_log_format {
__uint16_t rui_size; /* size of this item */
__uint32_t rui_nextents; /* # extents to free */
__uint64_t rui_id; /* rui identifier */
- struct xfs_map_extent rui_extents[1]; /* array of extents to rmap */
+ struct xfs_map_extent rui_extents[]; /* array of extents to rmap */
};
+static inline size_t
+xfs_rui_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_rui_log_format) +
+ nr * sizeof(struct xfs_map_extent);
+}
+
/*
* This is the structure used to lay out an rud log item in the
* log. The rud_extents array is a variable size array whose