author		Jiri Kosina <jkosina@suse.cz>	2020-09-01 15:19:48 +0300
committer	Jiri Kosina <jkosina@suse.cz>	2020-09-01 15:19:48 +0300
commit		ead5d1f4d877e92c051e1a1ade623d0d30e71619 (patch)
tree		cb9db5698a546e7b96f7d5bef5ce544629dd37a2 /fs/xfs
parent		f53fa968a7344970b8f8a5707c39cdcf17a6f367 (diff)
parent		b51594df17d0ce80b9f9f35394a1f42d7ac94472 (diff)
download	linux-ead5d1f4d877e92c051e1a1ade623d0d30e71619.tar.xz
Merge branch 'master' into for-next
Sync with Linus' branch in order to be able to apply fixups
of more recent patches.
Diffstat (limited to 'fs/xfs')
166 files changed, 11117 insertions, 10226 deletions
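Most of the mechanical churn in the XFS portion of this merge comes from two conversions: the typed buffer-accessor macros (XFS_BUF_TO_AGF() and friends) give way to direct b_addr dereferences, and the XFS-private kmem_zone_* wrappers give way to the generic slab API with explicit gfp flags. A minimal before/after sketch of both patterns, condensed from the hunks below (kernel context assumed; not a standalone program):

	/* Before: macros cast the buffer's mapped address, and the XFS
	 * wrapper encodes "retry until success" implicitly. */
	struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);

	/* After: dereference b_addr directly and spell out the no-fail
	 * allocation semantics with __GFP_NOFAIL. */
	struct xfs_agf *agf = agbp->b_addr;
	cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);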
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index aceca2f9a3db..04611a1068b4 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -7,8 +7,6 @@ ccflags-y += -I $(srctree)/$(src)	# needed for trace events
 
 ccflags-y += -I $(srctree)/$(src)/libxfs
 
-ccflags-$(CONFIG_XFS_DEBUG) += -g
-
 obj-$(CONFIG_XFS_FS)		+= xfs.o
 
 # this one should be compiled first, as the tracing macros can easily blow up
@@ -26,6 +24,7 @@ xfs-y += $(addprefix libxfs/, \
 	xfs_bmap.o \
 	xfs_bmap_btree.o \
 	xfs_btree.o \
+	xfs_btree_staging.o \
 	xfs_da_btree.o \
 	xfs_defer.o \
 	xfs_dir2.o \
@@ -100,9 +99,12 @@ xfs-y += xfs_log.o \
 	xfs_log_cil.o \
 	xfs_bmap_item.o \
 	xfs_buf_item.o \
+	xfs_buf_item_recover.o \
+	xfs_dquot_item_recover.o \
 	xfs_extfree_item.o \
 	xfs_icreate_item.o \
 	xfs_inode_item.o \
+	xfs_inode_item_recover.o \
 	xfs_refcount_item.o \
 	xfs_rmap_item.o \
 	xfs_log_recover.o \
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 1da94237a8cf..e841ed781a25 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -48,7 +48,7 @@ __kmem_vmalloc(size_t size, xfs_km_flags_t flags)
 	if (flags & KM_NOFS)
 		nofs_flag = memalloc_nofs_save();
 
-	ptr = __vmalloc(size, lflags, PAGE_KERNEL);
+	ptr = __vmalloc(size, lflags);
 
 	if (flags & KM_NOFS)
 		memalloc_nofs_restore(nofs_flag);
@@ -115,24 +115,3 @@ kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
 }
-
-void *
-kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
-	int	retries = 0;
-	gfp_t	lflags = kmem_flags_convert(flags);
-	void	*ptr;
-
-	trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
-	do {
-		ptr = kmem_cache_alloc(zone, lflags);
-		if (ptr || (flags & KM_MAYFAIL))
-			return ptr;
-		if (!(++retries % 100))
-			xfs_err(NULL,
-	"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
-				current->comm, current->pid,
-				__func__, lflags);
-		congestion_wait(BLK_RW_ASYNC, HZ/50);
-	} while (1);
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 6143117770e9..8e8555817e6d 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  * All Rights Reserved.
@@ -19,6 +19,7 @@ typedef unsigned __bitwise xfs_km_flags_t;
 #define KM_NOFS		((__force xfs_km_flags_t)0x0004u)
 #define KM_MAYFAIL	((__force xfs_km_flags_t)0x0008u)
 #define KM_ZERO		((__force xfs_km_flags_t)0x0010u)
+#define KM_NOLOCKDEP	((__force xfs_km_flags_t)0x0020u)
 
 /*
  * We use a special process flag to avoid recursive callbacks into
@@ -30,7 +31,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
 {
 	gfp_t	lflags;
 
-	BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO));
+	BUG_ON(flags & ~(KM_NOFS | KM_MAYFAIL | KM_ZERO | KM_NOLOCKDEP));
 
 	lflags = GFP_KERNEL | __GFP_NOWARN;
 	if (flags & KM_NOFS)
@@ -49,6 +50,9 @@ kmem_flags_convert(xfs_km_flags_t flags)
 	if (flags & KM_ZERO)
 		lflags |= __GFP_ZERO;
 
+	if (flags & KM_NOLOCKDEP)
+		lflags |= __GFP_NOLOCKDEP;
+
 	return lflags;
 }
 
@@ -81,14 +85,6 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 #define kmem_zone	kmem_cache
 #define kmem_zone_t	struct kmem_cache
 
-extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
-
-static inline void *
-kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
-	return kmem_zone_alloc(zone, flags | KM_ZERO);
-}
-
 static inline struct page *
 kmem_to_page(void *addr)
 {
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 08d6beb54f8c..8cf73fe4338e 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -231,7 +231,7 @@ xfs_sbblock_init(
 	struct xfs_buf		*bp,
 	struct aghdr_init_data	*id)
 {
-	struct xfs_dsb		*dsb = XFS_BUF_TO_SBP(bp);
+	struct xfs_dsb		*dsb = bp->b_addr;
 
 	xfs_sb_to_disk(dsb, &mp->m_sb);
 	dsb->sb_inprogress = 1;
@@ -243,7 +243,7 @@ xfs_agfblock_init(
 	struct xfs_buf		*bp,
 	struct aghdr_init_data	*id)
 {
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(bp);
+	struct xfs_agf		*agf = bp->b_addr;
 	xfs_extlen_t		tmpsize;
 
 	agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -301,7 +301,7 @@ xfs_agflblock_init(
 		uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
 	}
 
-	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
+	agfl_bno = xfs_buf_to_agfl_bno(bp);
 	for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
 		agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
 }
@@ -312,7 +312,7 @@ xfs_agiblock_init(
 	struct xfs_buf		*bp,
 	struct aghdr_init_data	*id)
 {
-	struct xfs_agi		*agi = XFS_BUF_TO_AGI(bp);
+	struct xfs_agi		*agi = bp->b_addr;
 	int			bucket;
 
 	agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -502,7 +502,7 @@ xfs_ag_extend_space(
 	if (error)
 		return error;
 
-	agi = XFS_BUF_TO_AGI(bp);
+	agi = bp->b_addr;
 	be32_add_cpu(&agi->agi_length, len);
 	ASSERT(id->agno == mp->m_sb.sb_agcount - 1 ||
 	       be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
@@ -515,7 +515,7 @@ xfs_ag_extend_space(
 	if (error)
 		return error;
 
-	agf = XFS_BUF_TO_AGF(bp);
+	agf = bp->b_addr;
 	be32_add_cpu(&agf->agf_length, len);
 	ASSERT(agf->agf_length == agi->agi_length);
 	xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
@@ -563,17 +563,18 @@ xfs_ag_get_geometry(
 	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
 	if (error)
 		goto out_agi;
-	pag = xfs_perag_get(mp, agno);
+
+	pag = agi_bp->b_pag;
 
 	/* Fill out form. */
 	memset(ageo, 0, sizeof(*ageo));
 	ageo->ag_number = agno;
 
-	agi = XFS_BUF_TO_AGI(agi_bp);
+	agi = agi_bp->b_addr;
 	ageo->ag_icount = be32_to_cpu(agi->agi_count);
 	ageo->ag_ifree = be32_to_cpu(agi->agi_freecount);
 
-	agf = XFS_BUF_TO_AGF(agf_bp);
+	agf = agf_bp->b_addr;
 	ageo->ag_length = be32_to_cpu(agf->agf_length);
 	freeblks = pag->pagf_freeblks +
 		   pag->pagf_flcount +
@@ -583,7 +584,6 @@ xfs_ag_get_geometry(
 	xfs_ag_geom_health(pag, ageo);
 
 	/* Release resources. */
-	xfs_perag_put(pag);
 	xfs_buf_relse(agf_bp);
 out_agi:
 	xfs_buf_relse(agi_bp);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
index c0352edc8e41..8a8eb4bc48bb 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.h
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0+
+/* SPDX-License-Identifier: GPL-2.0+ */
 /*
  * Copyright (C) 2016 Oracle.  All Rights Reserved.
  * Author: Darrick J. Wong <darrick.wong@oracle.com>
@@ -37,16 +37,4 @@ xfs_ag_resv_rmapbt_alloc(
 	xfs_perag_put(pag);
 }
 
-static inline void
-xfs_ag_resv_rmapbt_free(
-	struct xfs_mount	*mp,
-	xfs_agnumber_t		agno)
-{
-	struct xfs_perag	*pag;
-
-	pag = xfs_perag_get(mp, agno);
-	xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
-	xfs_perag_put(pag);
-}
-
 #endif	/* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index d8053bc96c4d..852b536551b5 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -151,7 +151,7 @@ xfs_alloc_lookup_eq(
 	cur->bc_rec.a.ar_startblock = bno;
 	cur->bc_rec.a.ar_blockcount = len;
 	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
-	cur->bc_private.a.priv.abt.active = (*stat == 1);
+	cur->bc_ag.abt.active = (*stat == 1);
 	return error;
 }
 
@@ -171,7 +171,7 @@ xfs_alloc_lookup_ge(
 	cur->bc_rec.a.ar_startblock = bno;
 	cur->bc_rec.a.ar_blockcount = len;
 	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
-	cur->bc_private.a.priv.abt.active = (*stat == 1);
+	cur->bc_ag.abt.active = (*stat == 1);
 	return error;
 }
 
@@ -190,7 +190,7 @@ xfs_alloc_lookup_le(
 	cur->bc_rec.a.ar_startblock = bno;
 	cur->bc_rec.a.ar_blockcount = len;
 	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
-	cur->bc_private.a.priv.abt.active = (*stat == 1);
+	cur->bc_ag.abt.active = (*stat == 1);
 	return error;
 }
 
@@ -198,7 +198,7 @@ static inline bool
 xfs_alloc_cur_active(
 	struct xfs_btree_cur	*cur)
 {
-	return cur && cur->bc_private.a.priv.abt.active;
+	return cur && cur->bc_ag.abt.active;
 }
 
 /*
@@ -230,7 +230,7 @@ xfs_alloc_get_rec(
 	int			*stat)	/* output: success/failure */
 {
 	struct xfs_mount	*mp = cur->bc_mp;
-	xfs_agnumber_t		agno = cur->bc_private.a.agno;
+	xfs_agnumber_t		agno = cur->bc_ag.agno;
 	union xfs_btree_rec	*rec;
 	int			error;
 
@@ -589,6 +589,7 @@ xfs_agfl_verify(
 {
 	struct xfs_mount *mp = bp->b_mount;
 	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
+	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
 	int		i;
 
 	/*
@@ -614,8 +615,8 @@ xfs_agfl_verify(
 		return __this_address;
 
 	for (i = 0; i < xfs_agfl_size(mp); i++) {
-		if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
-		    be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
+		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
 			return __this_address;
 	}
 
@@ -709,19 +710,18 @@ xfs_alloc_read_agfl(
 STATIC int
 xfs_alloc_update_counters(
 	struct xfs_trans	*tp,
-	struct xfs_perag	*pag,
 	struct xfs_buf		*agbp,
 	long			len)
 {
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xfs_agf		*agf = agbp->b_addr;
 
-	pag->pagf_freeblks += len;
+	agbp->b_pag->pagf_freeblks += len;
 	be32_add_cpu(&agf->agf_freeblks, len);
 
 	xfs_trans_agblocks_delta(tp, len);
 	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
 		     be32_to_cpu(agf->agf_length))) {
-		xfs_buf_corruption_error(agbp);
+		xfs_buf_mark_corrupt(agbp);
 		return -EFSCORRUPTED;
 	}
 
@@ -907,7 +907,7 @@ xfs_alloc_cur_check(
 		deactivate = true;
 out:
 	if (deactivate)
-		cur->bc_private.a.priv.abt.active = false;
+		cur->bc_ag.abt.active = false;
 	trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff, *new);
 	return 0;
@@ -922,13 +922,13 @@ xfs_alloc_cur_finish(
 	struct xfs_alloc_arg	*args,
 	struct xfs_alloc_cur	*acur)
 {
+	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
 	int			error;
 
 	ASSERT(acur->cnt && acur->bnolt);
 	ASSERT(acur->bno >= acur->rec_bno);
 	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
-	ASSERT(acur->rec_bno + acur->rec_len <=
-	       be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+	ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
 
 	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
 				      acur->rec_len, acur->bno, acur->len, 0);
@@ -1026,6 +1026,7 @@ xfs_alloc_ag_vextent_small(
 	xfs_extlen_t		*flenp,	/* result length */
 	int			*stat)	/* status: 0-freelist, 1-normal/none */
 {
+	struct xfs_agf		*agf = args->agbp->b_addr;
 	int			error = 0;
 	xfs_agblock_t		fbno = NULLAGBLOCK;
 	xfs_extlen_t		flen = 0;
@@ -1054,8 +1055,7 @@ xfs_alloc_ag_vextent_small(
 	if (args->minlen != 1 || args->alignment != 1 ||
 	    args->resv == XFS_AG_RESV_AGFL ||
-	    (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
-	     args->minleft))
+	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
 		goto out;
 
 	error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
@@ -1079,9 +1079,7 @@ xfs_alloc_ag_vextent_small(
 	}
 	*fbnop = args->agbno = fbno;
 	*flenp = args->len = 1;
-	if (XFS_IS_CORRUPT(args->mp,
-			   fbno >= be32_to_cpu(
-				XFS_BUF_TO_AGF(args->agbp)->agf_length))) {
+	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
 		error = -EFSCORRUPTED;
 		goto error;
 	}
@@ -1176,8 +1174,7 @@ xfs_alloc_ag_vextent(
 	}
 
 	if (!args->wasfromfl) {
-		error = xfs_alloc_update_counters(args->tp, args->pag,
-						  args->agbp,
+		error = xfs_alloc_update_counters(args->tp, args->agbp,
 						  -((long)(args->len)));
 		if (error)
 			return error;
@@ -1203,6 +1200,7 @@ STATIC int			/* error */
 xfs_alloc_ag_vextent_exact(
 	xfs_alloc_arg_t	*args)	/* allocation argument structure */
 {
+	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
 	xfs_btree_cur_t	*bno_cur;/* by block-number btree cursor */
 	xfs_btree_cur_t	*cnt_cur;/* by count btree cursor */
 	int		error;
@@ -1281,8 +1279,7 @@ xfs_alloc_ag_vextent_exact(
 	 */
 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
 		args->agno, XFS_BTNUM_CNT);
-	ASSERT(args->agbno + args->len <=
-		be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+	ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
 	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
 				      args->len, XFSA_FIXUP_BNO_OK);
 	if (error) {
@@ -1353,7 +1350,7 @@ xfs_alloc_walk_iter(
 	if (error)
 		return error;
 	if (i == 0)
-		cur->bc_private.a.priv.abt.active = false;
+		cur->bc_ag.abt.active = false;
 
 	if (count > 0)
 		count--;
@@ -1468,7 +1465,7 @@ xfs_alloc_ag_vextent_locality(
 		if (error)
 			return error;
 		if (i) {
-			acur->cnt->bc_private.a.priv.abt.active = true;
+			acur->cnt->bc_ag.abt.active = true;
 			fbcur = acur->cnt;
 			fbinc = false;
 		}
@@ -1515,7 +1512,7 @@ xfs_alloc_ag_vextent_lastblock(
 	 * maxlen, go to the start of this block, and skip all those smaller
 	 * than minlen.
 	 */
-	if (len || args->alignment > 1) {
+	if (*len || args->alignment > 1) {
 		acur->cnt->bc_ptrs[0] = 1;
 		do {
 			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
@@ -1661,6 +1658,7 @@ STATIC int			/* error */
 xfs_alloc_ag_vextent_size(
 	xfs_alloc_arg_t	*args)		/* allocation argument structure */
 {
+	struct xfs_agf	*agf = args->agbp->b_addr;
 	xfs_btree_cur_t	*bno_cur;	/* cursor for bno btree */
 	xfs_btree_cur_t	*cnt_cur;	/* cursor for cnt btree */
 	int		error;		/* error result */
@@ -1851,8 +1849,7 @@ restart:
 	args->agbno = rbno;
 	if (XFS_IS_CORRUPT(args->mp,
 			   args->agbno + args->len >
-			   be32_to_cpu(
-				XFS_BUF_TO_AGF(args->agbp)->agf_length))) {
+			   be32_to_cpu(agf->agf_length))) {
 		error = -EFSCORRUPTED;
 		goto error0;
 	}
@@ -1888,7 +1885,6 @@ xfs_free_ag_extent(
 	enum xfs_ag_resv_type		type)
 {
 	struct xfs_mount		*mp;
-	struct xfs_perag		*pag;
 	struct xfs_btree_cur		*bno_cur;
 	struct xfs_btree_cur		*cnt_cur;
 	xfs_agblock_t			gtbno; /* start of right neighbor */
@@ -2168,10 +2164,8 @@ xfs_free_ag_extent(
 	/*
 	 * Update the freespace totals in the ag and superblock.
 	 */
-	pag = xfs_perag_get(mp, agno);
-	error = xfs_alloc_update_counters(tp, pag, agbp, len);
-	xfs_ag_resv_free_extent(pag, type, tp, len);
-	xfs_perag_put(pag);
+	error = xfs_alloc_update_counters(tp, agbp, len);
+	xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
 	if (error)
 		goto error0;
@@ -2424,7 +2418,7 @@ xfs_agfl_reset(
 	struct xfs_perag	*pag)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xfs_agf		*agf = agbp->b_addr;
 
 	ASSERT(pag->pagf_agflreset);
 	trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
@@ -2468,7 +2462,8 @@ xfs_defer_agfl_block(
 	ASSERT(xfs_bmap_free_item_zone != NULL);
 	ASSERT(oinfo != NULL);
 
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
+	new = kmem_cache_alloc(xfs_bmap_free_item_zone,
+			       GFP_KERNEL | __GFP_NOFAIL);
 	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
 	new->xefi_blockcount = 1;
 	new->xefi_oinfo = *oinfo;
@@ -2655,7 +2650,7 @@ xfs_alloc_get_freelist(
 	xfs_agblock_t	*bnop,	/* block address retrieved from freelist */
 	int		btreeblk) /* destination is a AGF btree */
 {
-	xfs_agf_t	*agf;	/* a.g. freespace structure */
+	struct xfs_agf	*agf = agbp->b_addr;
 	xfs_buf_t	*agflbp;/* buffer for a.g. freelist structure */
 	xfs_agblock_t	bno;	/* block number returned */
 	__be32		*agfl_bno;
@@ -2667,7 +2662,6 @@ xfs_alloc_get_freelist(
 	/*
 	 * Freelist is empty, give up.
 	 */
-	agf = XFS_BUF_TO_AGF(agbp);
 	if (!agf->agf_flcount) {
 		*bnop = NULLAGBLOCK;
 		return 0;
@@ -2684,14 +2678,14 @@
 	/*
 	 * Get the block number and update the data structures.
 	 */
-	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
 	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
 	be32_add_cpu(&agf->agf_flfirst, 1);
 	xfs_trans_brelse(tp, agflbp);
 	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
 		agf->agf_flfirst = 0;
 
-	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+	pag = agbp->b_pag;
 	ASSERT(!pag->pagf_agflreset);
 	be32_add_cpu(&agf->agf_flcount, -1);
 	xfs_trans_agflist_delta(tp, -1);
@@ -2703,7 +2697,6 @@ xfs_alloc_get_freelist(
 		pag->pagf_btreeblks++;
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
-	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(tp, agbp, logflags);
 	*bnop = bno;
@@ -2745,7 +2738,7 @@ xfs_alloc_log_agf(
 		sizeof(xfs_agf_t)
 	};
 
-	trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+	trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
 
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
 
@@ -2783,18 +2776,15 @@ xfs_alloc_put_freelist(
 	xfs_agblock_t		bno,	/* block being freed */
 	int			btreeblk) /* block came from a AGF btree */
 {
-	xfs_agf_t		*agf;	/* a.g. freespace structure */
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agf		*agf = agbp->b_addr;
 	__be32			*blockp;/* pointer to array entry */
 	int			error;
 	int			logflags;
-	xfs_mount_t		*mp;	/* mount structure */
 	xfs_perag_t		*pag;	/* per allocation group data */
 	__be32			*agfl_bno;
 	int			startoff;
 
-	agf = XFS_BUF_TO_AGF(agbp);
-	mp = tp->t_mountp;
-
 	if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
 			be32_to_cpu(agf->agf_seqno), &agflbp)))
 		return error;
@@ -2802,7 +2792,7 @@ xfs_alloc_put_freelist(
 	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
 		agf->agf_fllast = 0;
 
-	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+	pag = agbp->b_pag;
 	ASSERT(!pag->pagf_agflreset);
 	be32_add_cpu(&agf->agf_flcount, 1);
 	xfs_trans_agflist_delta(tp, 1);
@@ -2814,13 +2804,12 @@ xfs_alloc_put_freelist(
 		pag->pagf_btreeblks--;
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
-	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(tp, agbp, logflags);
 
 	ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
 
-	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
 	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
 	*blockp = cpu_to_be32(bno);
 	startoff = (char *)blockp - (char *)agflbp->b_addr;
@@ -2838,13 +2827,12 @@ xfs_agf_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_mount;
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(bp);
+	struct xfs_agf		*agf = bp->b_addr;
 
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
 			return __this_address;
-		if (!xfs_log_check_lsn(mp,
-				be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
+		if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
 			return __this_address;
 	}
 
@@ -2858,6 +2846,13 @@ xfs_agf_verify(
 	      be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
 		return __this_address;
 
+	if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
+		return __this_address;
+
+	if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
+	    be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
+		return __this_address;
+
 	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
 	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
 	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
@@ -2869,6 +2864,10 @@ xfs_agf_verify(
 	     be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
 		return __this_address;
 
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+	    be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
+		return __this_address;
+
 	/*
 	 * during growfs operations, the perag is not fully initialised,
 	 * so we can't use it for any useful checking. growfs ensures we can't
@@ -2883,6 +2882,11 @@ xfs_agf_verify(
 		return __this_address;
 
 	if (xfs_sb_version_hasreflink(&mp->m_sb) &&
+	    be32_to_cpu(agf->agf_refcount_blocks) >
+	    be32_to_cpu(agf->agf_length))
+		return __this_address;
+
+	if (xfs_sb_version_hasreflink(&mp->m_sb) &&
 	    (be32_to_cpu(agf->agf_refcount_level) < 1 ||
 	     be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
 		return __this_address;
@@ -2914,6 +2918,7 @@ xfs_agf_write_verify(
 {
 	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
+	struct xfs_agf		*agf = bp->b_addr;
 	xfs_failaddr_t		fa;
 
 	fa = xfs_agf_verify(bp);
@@ -2926,7 +2931,7 @@ xfs_agf_write_verify(
 		return;
 
 	if (bip)
-		XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+		agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 
 	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
 }
@@ -2994,8 +2999,8 @@ xfs_alloc_read_agf(
 		return error;
 	ASSERT(!(*bpp)->b_error);
 
-	agf = XFS_BUF_TO_AGF(*bpp);
-	pag = xfs_perag_get(mp, agno);
+	agf = (*bpp)->b_addr;
+	pag = (*bpp)->b_pag;
 	if (!pag->pagf_init) {
 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -3023,7 +3028,6 @@ xfs_alloc_read_agf(
 		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
 	}
 #endif
-	xfs_perag_put(pag);
 	return 0;
 }
@@ -3275,6 +3279,7 @@ __xfs_free_extent(
 	struct xfs_buf			*agbp;
 	xfs_agnumber_t			agno = XFS_FSB_TO_AGNO(mp, bno);
 	xfs_agblock_t			agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	struct xfs_agf			*agf;
 	int				error;
 	unsigned int			busy_flags = 0;
 
@@ -3288,6 +3293,7 @@ __xfs_free_extent(
 	error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
 	if (error)
 		return error;
+	agf = agbp->b_addr;
 
 	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
 		error = -EFSCORRUPTED;
@@ -3295,9 +3301,7 @@ __xfs_free_extent(
 	}
 
 	/* validate the extent size is legal now we have the agf locked */
-	if (XFS_IS_CORRUPT(mp,
-			   agbno + len >
-			   be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length))) {
+	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
 		error = -EFSCORRUPTED;
 		goto err;
 	}
@@ -3408,7 +3412,7 @@ xfs_agfl_walk(
 	unsigned int		i;
 	int			error;
 
-	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
 	i = be32_to_cpu(agf->agf_flfirst);
 
 	/* Nothing to walk in an empty AGFL. */
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 7380fbe4a3ff..6c22b12176b8 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
  * All Rights Reserved.
@@ -236,4 +236,13 @@ typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno,
 int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf,
 		struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv);
 
+static inline __be32 *
+xfs_buf_to_agfl_bno(
+	struct xfs_buf		*bp)
+{
+	if (xfs_sb_version_hascrc(&bp->b_mount->m_sb))
+		return bp->b_addr + sizeof(struct xfs_agfl);
+	return bp->b_addr;
+}
+
 #endif	/* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 279694d73e4e..8e01231b308e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -12,6 +12,7 @@
 #include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_btree.h"
+#include "xfs_btree_staging.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_extent_busy.h"
@@ -25,7 +26,7 @@ xfs_allocbt_dup_cursor(
 	struct xfs_btree_cur	*cur)
 {
 	return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
-			cur->bc_private.a.agbp, cur->bc_private.a.agno,
+			cur->bc_ag.agbp, cur->bc_ag.agno,
 			cur->bc_btnum);
 }
 
@@ -35,18 +36,16 @@ xfs_allocbt_set_root(
 	union xfs_btree_ptr	*ptr,
 	int			inc)
 {
-	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
-	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
+	struct xfs_buf		*agbp = cur->bc_ag.agbp;
+	struct xfs_agf		*agf = agbp->b_addr;
 	int			btnum = cur->bc_btnum;
-	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);
+	struct xfs_perag	*pag = agbp->b_pag;
 
 	ASSERT(ptr->s != 0);
 
 	agf->agf_roots[btnum] = ptr->s;
 	be32_add_cpu(&agf->agf_levels[btnum], inc);
 	pag->pagf_levels[btnum] += inc;
-	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 }
@@ -62,7 +61,7 @@ xfs_allocbt_alloc_block(
 	xfs_agblock_t		bno;
 
 	/* Allocate the new block from the freelist. If we can't, give up. */
-	error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+	error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp,
 				       &bno, 1);
 	if (error)
 		return error;
@@ -72,7 +71,7 @@ xfs_allocbt_alloc_block(
 		return 0;
 	}
 
-	xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+	xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false);
 	xfs_trans_agbtree_delta(cur->bc_tp, 1);
 	new->s = cpu_to_be32(bno);
 
@@ -86,8 +85,8 @@ xfs_allocbt_free_block(
 	struct xfs_btree_cur	*cur,
 	struct xfs_buf		*bp)
 {
-	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xfs_buf		*agbp = cur->bc_ag.agbp;
+	struct xfs_agf		*agf = agbp->b_addr;
 	xfs_agblock_t		bno;
 	int			error;
 
@@ -113,8 +112,7 @@ xfs_allocbt_update_lastrec(
 	int			ptr,
 	int			reason)
 {
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
+	struct xfs_agf		*agf = cur->bc_ag.agbp->b_addr;
 	struct xfs_perag	*pag;
 	__be32			len;
 	int			numrecs;
@@ -159,10 +157,9 @@ xfs_allocbt_update_lastrec(
 	}
 
 	agf->agf_longest = len;
-	pag = xfs_perag_get(cur->bc_mp, seqno);
+	pag = cur->bc_ag.agbp->b_pag;
 	pag->pagf_longest = be32_to_cpu(len);
-	xfs_perag_put(pag);
-	xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
+	xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST);
 }
 
 STATIC int
@@ -226,9 +223,9 @@ xfs_allocbt_init_ptr_from_cur(
 	struct xfs_btree_cur	*cur,
 	union xfs_btree_ptr	*ptr)
 {
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+	struct xfs_agf		*agf = cur->bc_ag.agbp->b_addr;
 
-	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+	ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno));
 
 	ptr->s = agf->agf_roots[cur->bc_btnum];
 }
@@ -471,23 +468,19 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
 	.recs_inorder		= xfs_cntbt_recs_inorder,
 };
 
-/*
- * Allocate a new allocation btree cursor.
- */
-struct xfs_btree_cur *			/* new alloc btree cursor */
-xfs_allocbt_init_cursor(
-	struct xfs_mount	*mp,		/* file system mount point */
-	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_buf		*agbp,		/* buffer for agf structure */
-	xfs_agnumber_t		agno,		/* allocation group number */
-	xfs_btnum_t		btnum)		/* btree identifier */
+/* Allocate most of a new allocation btree cursor. */
+STATIC struct xfs_btree_cur *
+xfs_allocbt_init_common(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_btnum_t		btnum)
 {
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
 	struct xfs_btree_cur	*cur;
 
 	ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
 
-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+	cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
 
 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
@@ -495,19 +488,16 @@ xfs_allocbt_init_cursor(
 	cur->bc_blocklog = mp->m_sb.sb_blocklog;
 
 	if (btnum == XFS_BTNUM_CNT) {
-		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
 		cur->bc_ops = &xfs_cntbt_ops;
-		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
 		cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
 	} else {
-		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
 		cur->bc_ops = &xfs_bnobt_ops;
-		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
 	}
 
-	cur->bc_private.a.agbp = agbp;
-	cur->bc_private.a.agno = agno;
-	cur->bc_private.a.priv.abt.active = false;
+	cur->bc_ag.agno = agno;
+	cur->bc_ag.abt.active = false;
 
 	if (xfs_sb_version_hascrc(&mp->m_sb))
 		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
@@ -516,6 +506,73 @@ xfs_allocbt_init_cursor(
 }
 
 /*
+ * Allocate a new allocation btree cursor.
+ */
+struct xfs_btree_cur *			/* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+	struct xfs_mount	*mp,		/* file system mount point */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_buf		*agbp,		/* buffer for agf structure */
+	xfs_agnumber_t		agno,		/* allocation group number */
+	xfs_btnum_t		btnum)		/* btree identifier */
+{
+	struct xfs_agf		*agf = agbp->b_addr;
+	struct xfs_btree_cur	*cur;
+
+	cur = xfs_allocbt_init_common(mp, tp, agno, btnum);
+	if (btnum == XFS_BTNUM_CNT)
+		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+	else
+		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+
+	cur->bc_ag.agbp = agbp;
+
+	return cur;
+}
+
+/* Create a free space btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_allocbt_stage_cursor(
+	struct xfs_mount	*mp,
+	struct xbtree_afakeroot	*afake,
+	xfs_agnumber_t		agno,
+	xfs_btnum_t		btnum)
+{
+	struct xfs_btree_cur	*cur;
+
+	cur = xfs_allocbt_init_common(mp, NULL, agno, btnum);
+	xfs_btree_stage_afakeroot(cur, afake);
+	return cur;
+}
+
+/*
+ * Install a new free space btree root.  Caller is responsible for invalidating
+ * and freeing the old btree blocks.
+ */
+void
+xfs_allocbt_commit_staged_btree(
+	struct xfs_btree_cur	*cur,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp)
+{
+	struct xfs_agf		*agf = agbp->b_addr;
+	struct xbtree_afakeroot	*afake = cur->bc_ag.afake;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
+	agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+	xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+
+	if (cur->bc_btnum == XFS_BTNUM_BNO) {
+		xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops);
+	} else {
+		cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE;
+		xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops);
+	}
+}
+
+/*
  * Calculate number of records in an alloc btree block.
 */
int
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index c9305ebb69f6..a5b998e950fe 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (c) 2000,2005 Silicon Graphics, Inc.
  * All Rights Reserved.
@@ -13,6 +13,7 @@
 struct xfs_buf;
 struct xfs_btree_cur;
 struct xfs_mount;
+struct xbtree_afakeroot;
 
 /*
  * Btree block header size depends on a superblock flag.
@@ -48,8 +49,14 @@ struct xfs_mount;
 extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_buf *,
 		xfs_agnumber_t, xfs_btnum_t);
+struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
+		struct xbtree_afakeroot *afake, xfs_agnumber_t agno,
+		xfs_btnum_t btnum);
 extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
 extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
 
+void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur,
+		struct xfs_trans *tp, struct xfs_buf *agbp);
+
 #endif	/* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index e6149720ce02..2e055c079f39 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -46,6 +46,7 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
 STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
 STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
 STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp);
 
 /*
  * Internal routines when attribute list is more than one block.
@@ -53,43 +54,18 @@ STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
 STATIC int xfs_attr_node_get(xfs_da_args_t *args);
 STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
 STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_node_hasname(xfs_da_args_t *args,
+				 struct xfs_da_state **state);
 STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
 STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
 
-
-STATIC int
-xfs_attr_args_init(
-	struct xfs_da_args	*args,
-	struct xfs_inode	*dp,
-	const unsigned char	*name,
-	size_t			namelen,
-	int			flags)
-{
-
-	if (!name)
-		return -EINVAL;
-
-	memset(args, 0, sizeof(*args));
-	args->geo = dp->i_mount->m_attr_geo;
-	args->whichfork = XFS_ATTR_FORK;
-	args->dp = dp;
-	args->flags = flags;
-	args->name = name;
-	args->namelen = namelen;
-	if (args->namelen >= MAXNAMELEN)
-		return -EFAULT;		/* match IRIX behaviour */
-
-	args->hashval = xfs_da_hashname(args->name, args->namelen);
-	return 0;
-}
-
 int
 xfs_inode_hasattr(
 	struct xfs_inode	*ip)
 {
 	if (!XFS_IFORK_Q(ip) ||
-	    (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-	     ip->i_d.di_anextents == 0))
+	    (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+	     ip->i_afp->if_nextents == 0))
 		return 0;
 	return 1;
 }
@@ -104,85 +80,60 @@ xfs_inode_hasattr(
 */
 int
 xfs_attr_get_ilocked(
-	struct xfs_inode	*ip,
 	struct xfs_da_args	*args)
 {
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 
-	if (!xfs_inode_hasattr(ip))
+	if (!xfs_inode_hasattr(args->dp))
 		return -ENOATTR;
-	else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+
+	if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
 		return xfs_attr_shortform_getvalue(args);
-	else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
+	if (xfs_bmap_one_block(args->dp, XFS_ATTR_FORK))
 		return xfs_attr_leaf_get(args);
-	else
-		return xfs_attr_node_get(args);
+	return xfs_attr_node_get(args);
 }
 
 /*
  * Retrieve an extended attribute by name, and its value if requested.
  *
- * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value,
- * just an indication whether the attribute exists and the size of the value if
- * it exists. The size is returned in @valuelenp,
+ * If args->valuelen is zero, then the caller does not want the value, just an
+ * indication whether the attribute exists and the size of the value if it
+ * exists. The size is returned in args.valuelen.
 *
- * If the attribute is found, but exceeds the size limit set by the caller in
- * @valuelenp, return -ERANGE with the size of the attribute that was found in
- * @valuelenp.
+ * If args->value is NULL but args->valuelen is non-zero, allocate the buffer
+ * for the value after existence of the attribute has been determined. The
+ * caller always has to free args->value if it is set, no matter if this
+ * function was successful or not.
 *
- * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after
- * existence of the attribute has been determined. On success, return that
- * buffer to the caller and leave them to free it. On failure, free any
- * allocated buffer and ensure the buffer pointer returned to the caller is
- * null.
+ * If the attribute is found, but exceeds the size limit set by the caller in
+ * args->valuelen, return -ERANGE with the size of the attribute that was found
+ * in args->valuelen.
 */
 int
 xfs_attr_get(
-	struct xfs_inode	*ip,
-	const unsigned char	*name,
-	size_t			namelen,
-	unsigned char		**value,
-	int			*valuelenp,
-	int			flags)
+	struct xfs_da_args	*args)
 {
-	struct xfs_da_args	args;
 	uint			lock_mode;
 	int			error;
 
-	ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value);
-
-	XFS_STATS_INC(ip->i_mount, xs_attr_get);
+	XFS_STATS_INC(args->dp->i_mount, xs_attr_get);
 
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+	if (XFS_FORCED_SHUTDOWN(args->dp->i_mount))
 		return -EIO;
 
-	error = xfs_attr_args_init(&args, ip, name, namelen, flags);
-	if (error)
-		return error;
+	args->geo = args->dp->i_mount->m_attr_geo;
+	args->whichfork = XFS_ATTR_FORK;
+	args->hashval = xfs_da_hashname(args->name, args->namelen);
 
 	/* Entirely possible to look up a name which doesn't exist */
-	args.op_flags = XFS_DA_OP_OKNOENT;
-	if (flags & ATTR_ALLOC)
-		args.op_flags |= XFS_DA_OP_ALLOCVAL;
-	else
-		args.value = *value;
-	args.valuelen = *valuelenp;
+	args->op_flags = XFS_DA_OP_OKNOENT;
 
-	lock_mode = xfs_ilock_attr_map_shared(ip);
-	error = xfs_attr_get_ilocked(ip, &args);
-	xfs_iunlock(ip, lock_mode);
-	*valuelenp = args.valuelen;
+	lock_mode = xfs_ilock_attr_map_shared(args->dp);
+	error = xfs_attr_get_ilocked(args);
+	xfs_iunlock(args->dp, lock_mode);
 
-	/* on error, we have to clean up allocated value buffers */
-	if (error) {
-		if (flags & ATTR_ALLOC) {
-			kmem_free(args.value);
-			*value = NULL;
-		}
-		return error;
-	}
-	*value = args.value;
-	return 0;
+	return error;
 }
 
 /*
@@ -227,8 +178,13 @@ xfs_attr_try_sf_addname(
 	struct xfs_da_args	*args)
 {
 
-	struct xfs_mount	*mp = dp->i_mount;
-	int			error, error2;
+	int			error;
+
+	/*
+	 * Build initial attribute list (if required).
+	 */
+	if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS)
+		xfs_attr_shortform_create(args);
 
 	error = xfs_attr_shortform_addname(args);
 	if (error == -ENOSPC)
@@ -238,15 +194,73 @@ xfs_attr_try_sf_addname(
 	 * Commit the shortform mods, and we're done.
 	 * NOTE: this is also the error path (EEXIST, etc).
 	 */
-	if (!error && (args->flags & ATTR_KERNOTIME) == 0)
+	if (!error && !(args->op_flags & XFS_DA_OP_NOTIME))
 		xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
 
-	if (mp->m_flags & XFS_MOUNT_WSYNC)
+	if (dp->i_mount->m_flags & XFS_MOUNT_WSYNC)
 		xfs_trans_set_sync(args->trans);
 
-	error2 = xfs_trans_commit(args->trans);
-	args->trans = NULL;
-	return error ? error : error2;
+	return error;
+}
+
+/*
+ * Check to see if the attr should be upgraded from non-existent or shortform to
+ * single-leaf-block attribute list.
+ */
+static inline bool
+xfs_attr_is_shortform(
+	struct xfs_inode	*ip)
+{
+	return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
+	       (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+		ip->i_afp->if_nextents == 0);
+}
+
+/*
+ * Attempts to set an attr in shortform, or converts short form to leaf form if
+ * there is not enough room.  If the attr is set, the transaction is committed
+ * and set to NULL.
+ */
+STATIC int
+xfs_attr_set_shortform(
+	struct xfs_da_args	*args,
+	struct xfs_buf		**leaf_bp)
+{
+	struct xfs_inode	*dp = args->dp;
+	int			error, error2 = 0;
+
+	/*
+	 * Try to add the attr to the attribute list in the inode.
+	 */
+	error = xfs_attr_try_sf_addname(dp, args);
+	if (error != -ENOSPC) {
+		error2 = xfs_trans_commit(args->trans);
+		args->trans = NULL;
+		return error ? error : error2;
+	}
+	/*
+	 * It won't fit in the shortform, transform to a leaf block.  GROT:
+	 * another possible req'mt for a double-split btree op.
+	 */
+	error = xfs_attr_shortform_to_leaf(args, leaf_bp);
+	if (error)
+		return error;
+
+	/*
+	 * Prevent the leaf buffer from being unlocked so that a concurrent AIL
+	 * push cannot grab the half-baked leaf buffer and run into problems
+	 * with the write verifier. Once we're done rolling the transaction we
+	 * can release the hold and add the attr to the leaf.
+	 */
+	xfs_trans_bhold(args->trans, *leaf_bp);
+	error = xfs_defer_finish(&args->trans);
+	xfs_trans_bhold_release(args->trans, *leaf_bp);
+	if (error) {
+		xfs_trans_brelse(args->trans, *leaf_bp);
+		return error;
+	}
+
+	return 0;
}
 
 /*
@@ -258,61 +272,94 @@ xfs_attr_set_args(
 {
 	struct xfs_inode	*dp = args->dp;
 	struct xfs_buf		*leaf_bp = NULL;
-	int			error;
+	int			error = 0;
 
 	/*
-	 * If the attribute list is non-existent or a shortform list,
-	 * upgrade it to a single-leaf-block attribute list.
+	 * If the attribute list is already in leaf format, jump straight to
+	 * leaf handling.  Otherwise, try to add the attribute to the shortform
+	 * list; if there's no room then convert the list to leaf format and try
+	 * again.
	 */
-	if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
-	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-	     dp->i_d.di_anextents == 0)) {
+	if (xfs_attr_is_shortform(dp)) {
 		/*
-		 * Build initial attribute list (if required).
+		 * If the attr was successfully set in shortform, the
+		 * transaction is committed and set to NULL.  Otherwise, is it
+		 * converted from shortform to leaf, and the transaction is
+		 * retained.
 		 */
-		if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
-			xfs_attr_shortform_create(args);
+		error = xfs_attr_set_shortform(args, &leaf_bp);
+		if (error || !args->trans)
+			return error;
+	}
 
-		/*
-		 * Try to add the attr to the attribute list in the inode.
-		 */
-		error = xfs_attr_try_sf_addname(dp, args);
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_addname(args);
 		if (error != -ENOSPC)
 			return error;
 
 		/*
-		 * It won't fit in the shortform, transform to a leaf block.
-		 * GROT: another possible req'mt for a double-split btree op.
+		 * Promote the attribute list to the Btree format.
 		 */
-		error = xfs_attr_shortform_to_leaf(args, &leaf_bp);
+		error = xfs_attr3_leaf_to_node(args);
 		if (error)
 			return error;
 
 		/*
-		 * Prevent the leaf buffer from being unlocked so that a
-		 * concurrent AIL push cannot grab the half-baked leaf
-		 * buffer and run into problems with the write verifier.
-		 * Once we're done rolling the transaction we can release
-		 * the hold and add the attr to the leaf.
+		 * Finish any deferred work items and roll the transaction once
+		 * more.  The goal here is to call node_addname with the inode
+		 * and transaction in the same state (inode locked and joined,
+		 * transaction clean) no matter how we got to this step.
		 */
-		xfs_trans_bhold(args->trans, leaf_bp);
 		error = xfs_defer_finish(&args->trans);
-		xfs_trans_bhold_release(args->trans, leaf_bp);
-		if (error) {
-			xfs_trans_brelse(args->trans, leaf_bp);
+		if (error)
+			return error;
+
+		/*
+		 * Commit the current trans (including the inode) and
+		 * start a new one.
+		 */
+		error = xfs_trans_roll_inode(&args->trans, dp);
+		if (error)
 			return error;
-		}
 	}
 
-	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
-		error = xfs_attr_leaf_addname(args);
-	else
-		error = xfs_attr_node_addname(args);
+	error = xfs_attr_node_addname(args);
 	return error;
 }
 
 /*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ */
+int
+xfs_has_attr(
+	struct xfs_da_args	*args)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_buf		*bp = NULL;
+	int			error;
+
+	if (!xfs_inode_hasattr(dp))
+		return -ENOATTR;
+
+	if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) {
+		ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+		return xfs_attr_sf_findname(args, NULL, NULL);
+	}
+
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_hasname(args, &bp);
+
+		if (bp)
+			xfs_trans_brelse(args->trans, bp);
+
+		return error;
+	}
+
+	return xfs_attr_node_hasname(args, NULL);
+}
+
+/*
 * Remove the attribute specified in @args.
 */
int
@@ -324,7 +371,7 @@ xfs_attr_remove_args(
 
 	if (!xfs_inode_hasattr(dp)) {
 		error = -ENOATTR;
-	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+	} else if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) {
 		ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
 		error = xfs_attr_shortform_remove(args);
 	} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
@@ -336,188 +383,140 @@ xfs_attr_remove_args(
 	return error;
 }
 
+/*
+ * Note: If args->value is NULL the attribute will be removed, just like the
+ * Linux ->setattr API.
+ */
int
xfs_attr_set(
-	struct xfs_inode	*dp,
-	const unsigned char	*name,
-	size_t			namelen,
-	unsigned char		*value,
-	int			valuelen,
-	int			flags)
+	struct xfs_da_args	*args)
{
+	struct xfs_inode	*dp = args->dp;
 	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_da_args	args;
 	struct xfs_trans_res	tres;
-	int			rsvd = (flags & ATTR_ROOT) != 0;
+	bool			rsvd = (args->attr_filter & XFS_ATTR_ROOT);
 	int			error, local;
-
-	XFS_STATS_INC(mp, xs_attr_set);
+	unsigned int		total;
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return -EIO;
 
-	error = xfs_attr_args_init(&args, dp, name, namelen, flags);
-	if (error)
-		return error;
-
-	args.value = value;
-	args.valuelen = valuelen;
-	args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
-	args.total = xfs_attr_calc_size(&args, &local);
-
 	error = xfs_qm_dqattach(dp);
 	if (error)
 		return error;
 
	/*
-	 * If the inode doesn't have an attribute fork, add one.
-	 * (inode must not be locked when we call this routine)
-	 */
-	if (XFS_IFORK_Q(dp) == 0) {
-		int sf_size = sizeof(xfs_attr_sf_hdr_t) +
-			XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
-
-		error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
-		if (error)
-			return error;
-	}
-
-	tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
-			 M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
-	tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
-	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-
-	/*
-	 * Root fork attributes can use reserved data blocks for this
-	 * operation if necessary
-	 */
-	error = xfs_trans_alloc(mp, &tres, args.total, 0,
-			rsvd ? XFS_TRANS_RESERVE : 0, &args.trans);
-	if (error)
-		return error;
-
-	xfs_ilock(dp, XFS_ILOCK_EXCL);
-	error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
-			rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
-			       XFS_QMOPT_RES_REGBLKS);
-	if (error)
-		goto out_trans_cancel;
-
-	xfs_trans_ijoin(args.trans, dp, 0);
-	error = xfs_attr_set_args(&args);
-	if (error)
-		goto out_trans_cancel;
-	if (!args.trans) {
-		/* shortform attribute has already been committed */
-		goto out_unlock;
-	}
-
-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * transaction goes to disk before returning to the user.
-	 */
-	if (mp->m_flags & XFS_MOUNT_WSYNC)
-		xfs_trans_set_sync(args.trans);
-
-	if ((flags & ATTR_KERNOTIME) == 0)
-		xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+	args->geo = mp->m_attr_geo;
+	args->whichfork = XFS_ATTR_FORK;
+	args->hashval = xfs_da_hashname(args->name, args->namelen);
 
 	/*
-	 * Commit the last in the sequence of transactions.
+	 * We have no control over the attribute names that userspace passes us
+	 * to remove, so we have to allow the name lookup prior to attribute
+	 * removal to fail as well.
	 */
-	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-	error = xfs_trans_commit(args.trans);
-out_unlock:
-	xfs_iunlock(dp, XFS_ILOCK_EXCL);
-	return error;
-
-out_trans_cancel:
-	if (args.trans)
-		xfs_trans_cancel(args.trans);
-	goto out_unlock;
-}
+	args->op_flags = XFS_DA_OP_OKNOENT;
 
-/*
- * Generic handler routine to remove a name from an attribute list.
- * Transitions attribute list from Btree to shortform as necessary.
- */
-int
-xfs_attr_remove(
-	struct xfs_inode	*dp,
-	const unsigned char	*name,
-	size_t			namelen,
-	int			flags)
-{
-	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_da_args	args;
-	int			error;
+	if (args->value) {
+		XFS_STATS_INC(mp, xs_attr_set);
 
-	XFS_STATS_INC(mp, xs_attr_remove);
+		args->op_flags |= XFS_DA_OP_ADDNAME;
+		args->total = xfs_attr_calc_size(args, &local);
 
-	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-		return -EIO;
+		/*
+		 * If the inode doesn't have an attribute fork, add one.
+		 * (inode must not be locked when we call this routine)
+		 */
+		if (XFS_IFORK_Q(dp) == 0) {
+			int sf_size = sizeof(struct xfs_attr_sf_hdr) +
+				XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen,
+						args->valuelen);
 
-	error = xfs_attr_args_init(&args, dp, name, namelen, flags);
-	if (error)
-		return error;
+			error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+			if (error)
+				return error;
+		}
 
-	/*
-	 * we have no control over the attribute names that userspace passes us
-	 * to remove, so we have to allow the name lookup prior to attribute
-	 * removal to fail.
-	 */
-	args.op_flags = XFS_DA_OP_OKNOENT;
+		tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+				 M_RES(mp)->tr_attrsetrt.tr_logres *
+					args->total;
+		tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+		tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+		total = args->total;
+	} else {
+		XFS_STATS_INC(mp, xs_attr_remove);
 
-	error = xfs_qm_dqattach(dp);
-	if (error)
-		return error;
+		tres = M_RES(mp)->tr_attrrm;
+		total = XFS_ATTRRM_SPACE_RES(mp);
+	}
 
 	/*
 	 * Root fork attributes can use reserved data blocks for this
 	 * operation if necessary
 	 */
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm,
-			XFS_ATTRRM_SPACE_RES(mp), 0,
-			(flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0,
-			&args.trans);
+	error = xfs_trans_alloc(mp, &tres, total, 0,
+			rsvd ? XFS_TRANS_RESERVE : 0, &args->trans);
 	if (error)
 		return error;
 
 	xfs_ilock(dp, XFS_ILOCK_EXCL);
-	/*
-	 * No need to make quota reservations here. We expect to release some
-	 * blocks not allocate in the common case.
-	 */
-	xfs_trans_ijoin(args.trans, dp, 0);
+	xfs_trans_ijoin(args->trans, dp, 0);
+	if (args->value) {
+		unsigned int	quota_flags = XFS_QMOPT_RES_REGBLKS;
+
+		if (rsvd)
+			quota_flags |= XFS_QMOPT_FORCE_RES;
+		error = xfs_trans_reserve_quota_nblks(args->trans, dp,
+				args->total, 0, quota_flags);
+		if (error)
+			goto out_trans_cancel;
 
-	error = xfs_attr_remove_args(&args);
-	if (error)
-		goto out;
+		error = xfs_has_attr(args);
+		if (error == -EEXIST && (args->attr_flags & XATTR_CREATE))
+			goto out_trans_cancel;
+		if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
+			goto out_trans_cancel;
+		if (error != -ENOATTR && error != -EEXIST)
+			goto out_trans_cancel;
+
+		error = xfs_attr_set_args(args);
+		if (error)
+			goto out_trans_cancel;
+		/* shortform attribute has already been committed */
+		if (!args->trans)
+			goto out_unlock;
+	} else {
+		error = xfs_has_attr(args);
+		if (error != -EEXIST)
+			goto out_trans_cancel;
+
+		error = xfs_attr_remove_args(args);
+		if (error)
+			goto out_trans_cancel;
+	}
 
 	/*
 	 * If this is a synchronous mount, make sure that the
 	 * transaction goes to disk before returning to the user.
	 */
 	if (mp->m_flags & XFS_MOUNT_WSYNC)
-		xfs_trans_set_sync(args.trans);
+		xfs_trans_set_sync(args->trans);
 
-	if ((flags & ATTR_KERNOTIME) == 0)
-		xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+	if (!(args->op_flags & XFS_DA_OP_NOTIME))
+		xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
 
 	/*
 	 * Commit the last in the sequence of transactions.
 	 */
-	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-	error = xfs_trans_commit(args.trans);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+	error = xfs_trans_commit(args->trans);
+out_unlock:
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 	return error;
 
-out:
-	if (args.trans)
-		xfs_trans_cancel(args.trans);
-	xfs_iunlock(dp, XFS_ILOCK_EXCL);
-	return error;
+out_trans_cancel:
+	if (args->trans)
+		xfs_trans_cancel(args->trans);
+	goto out_unlock;
 }
 
 /*========================================================================
@@ -536,10 +535,10 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
 	trace_xfs_attr_sf_addname(args);
 
 	retval = xfs_attr_shortform_lookup(args);
-	if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+	if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
 		return retval;
-	} else if (retval == -EEXIST) {
-		if (args->flags & ATTR_CREATE)
+	if (retval == -EEXIST) {
+		if (args->attr_flags & XATTR_CREATE)
 			return retval;
 		retval = xfs_attr_shortform_remove(args);
 		if (retval)
@@ -549,7 +548,7 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
 		 * that the leaf format add routine won't trip over the attr
 		 * not being around.
 		 */
-		args->flags &= ~ATTR_REPLACE;
+		args->attr_flags &= ~XATTR_REPLACE;
 	}
 
 	if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
@@ -572,54 +571,65 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
 * External routines when attribute list is one block
 *========================================================================*/
 
+/* Store info about a remote block */
+STATIC void
+xfs_attr_save_rmt_blk(
+	struct xfs_da_args	*args)
+{
+	args->blkno2 = args->blkno;
+	args->index2 = args->index;
+	args->rmtblkno2 = args->rmtblkno;
+	args->rmtblkcnt2 = args->rmtblkcnt;
+	args->rmtvaluelen2 = args->rmtvaluelen;
+}
+
+/* Set stored info about a remote block */
+STATIC void
+xfs_attr_restore_rmt_blk(
+	struct xfs_da_args	*args)
+{
+	args->blkno = args->blkno2;
+	args->index = args->index2;
+	args->rmtblkno = args->rmtblkno2;
+	args->rmtblkcnt = args->rmtblkcnt2;
+	args->rmtvaluelen = args->rmtvaluelen2;
+}
+
 /*
- * Add a name to the leaf attribute list structure
+ * Tries to add an attribute to an inode in leaf form
 *
- * This leaf block cannot have a "remote" value, we only call this routine
- * if bmap_one_block() says there is only one block (ie: no remote blks).
+ * This function is meant to execute as part of a delayed operation and leaves
+ * the transaction handling to the caller.  On success the attribute is added
+ * and the inode and transaction are left dirty.  If there is not enough space,
+ * the attr data is converted to node format and -ENOSPC is returned. Caller is
+ * responsible for handling the dirty inode and transaction or adding the attr
+ * in node format.
 */
STATIC int
-xfs_attr_leaf_addname(
-	struct xfs_da_args	*args)
+xfs_attr_leaf_try_add(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp)
{
-	struct xfs_inode	*dp;
-	struct xfs_buf		*bp;
-	int			retval, error, forkoff;
-
-	trace_xfs_attr_leaf_addname(args);
-
-	/*
-	 * Read the (only) block in the attribute list in.
-	 */
-	dp = args->dp;
-	args->blkno = 0;
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
-	if (error)
-		return error;
+	int			retval;
 
 	/*
 	 * Look up the given attribute in the leaf block.  Figure out if
 	 * the given flags produce an error or call for an atomic rename.
 	 */
-	retval = xfs_attr3_leaf_lookup_int(bp, args);
-	if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
-		xfs_trans_brelse(args->trans, bp);
+	retval = xfs_attr_leaf_hasname(args, &bp);
+	if (retval != -ENOATTR && retval != -EEXIST)
 		return retval;
-	} else if (retval == -EEXIST) {
-		if (args->flags & ATTR_CREATE) {	/* pure create op */
-			xfs_trans_brelse(args->trans, bp);
-			return retval;
-		}
+	if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
+		goto out_brelse;
+	if (retval == -EEXIST) {
+		if (args->attr_flags & XATTR_CREATE)
+			goto out_brelse;
 
 		trace_xfs_attr_leaf_replace(args);
 
 		/* save the attribute state for later removal*/
 		args->op_flags |= XFS_DA_OP_RENAME;	/* an atomic rename */
-		args->blkno2 = args->blkno;		/* set 2nd entry info*/
-		args->index2 = args->index;
-		args->rmtblkno2 = args->rmtblkno;
-		args->rmtblkcnt2 = args->rmtblkcnt;
-		args->rmtvaluelen2 = args->rmtvaluelen;
+		xfs_attr_save_rmt_blk(args);
 
 		/*
 		 * clear the remote attr state now that it is saved so that the
@@ -632,37 +642,35 @@ xfs_attr_leaf_addname(
 	}
 
 	/*
-	 * Add the attribute to the leaf block, transitioning to a Btree
-	 * if required.
+	 * Add the attribute to the leaf block
	 */
-	retval = xfs_attr3_leaf_add(bp, args);
-	if (retval == -ENOSPC) {
-		/*
-		 * Promote the attribute list to the Btree format, then
-		 * Commit that transaction so that the node_addname() call
-		 * can manage its own transactions.
-		 */
-		error = xfs_attr3_leaf_to_node(args);
-		if (error)
-			return error;
-		error = xfs_defer_finish(&args->trans);
-		if (error)
-			return error;
+	return xfs_attr3_leaf_add(bp, args);
 
-		/*
-		 * Commit the current trans (including the inode) and start
-		 * a new one.
-		 */
-		error = xfs_trans_roll_inode(&args->trans, dp);
-		if (error)
-			return error;
+out_brelse:
+	xfs_trans_brelse(args->trans, bp);
+	return retval;
+}
 
-		/*
-		 * Fob the whole rest of the problem off on the Btree code.
-		 */
-		error = xfs_attr_node_addname(args);
+
+/*
+ * Add a name to the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_addname(
+	struct xfs_da_args	*args)
+{
+	int			error, forkoff;
+	struct xfs_buf		*bp = NULL;
+	struct xfs_inode	*dp = args->dp;
+
+	trace_xfs_attr_leaf_addname(args);
+
+	error = xfs_attr_leaf_try_add(args, bp);
+	if (error)
 		return error;
-	}
 
 	/*
 	 * Commit the transaction that added the attr name so that
@@ -684,71 +692,92 @@ xfs_attr_leaf_addname(
 		return error;
 	}
 
-	/*
-	 * If this is an atomic rename operation, we must "flip" the
-	 * incomplete flags on the "new" and "old" attribute/value pairs
-	 * so that one disappears and one appears atomically.  Then we
-	 * must remove the "old" attribute/value pair.
-	 */
-	if (args->op_flags & XFS_DA_OP_RENAME) {
+	if (!(args->op_flags & XFS_DA_OP_RENAME)) {
 		/*
-		 * In a separate transaction, set the incomplete flag on the
-		 * "old" attr and clear the incomplete flag on the "new" attr.
+		 * Added a "remote" value, just clear the incomplete flag.
 		 */
-		error = xfs_attr3_leaf_flipflags(args);
+		if (args->rmtblkno > 0)
+			error = xfs_attr3_leaf_clearflag(args);
+
+		return error;
+	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the incomplete
+	 * flags on the "new" and "old" attribute/value pairs so that one
+	 * disappears and one appears atomically.  Then we must remove the "old"
+	 * attribute/value pair.
+	 *
+	 * In a separate transaction, set the incomplete flag on the "old" attr
+	 * and clear the incomplete flag on the "new" attr.
+	 */
+
+	error = xfs_attr3_leaf_flipflags(args);
 	if (error)
 		return error;
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_trans_roll_inode(&args->trans, args->dp);
+	if (error)
+		return error;
+
+	/*
+	 * Dismantle the "old" attribute/value pair by removing a "remote" value
+	 * (if it exists).
+	 */
+	xfs_attr_restore_rmt_blk(args);
+
+	if (args->rmtblkno) {
+		error = xfs_attr_rmtval_invalidate(args);
 		if (error)
 			return error;
 
-		/*
-		 * Dismantle the "old" attribute/value pair by removing
-		 * a "remote" value (if it exists).
-		 */
-		args->index = args->index2;
-		args->blkno = args->blkno2;
-		args->rmtblkno = args->rmtblkno2;
-		args->rmtblkcnt = args->rmtblkcnt2;
-		args->rmtvaluelen = args->rmtvaluelen2;
-		if (args->rmtblkno) {
-			error = xfs_attr_rmtval_remove(args);
-			if (error)
-				return error;
-		}
-
-		/*
-		 * Read in the block containing the "old" attr, then
-		 * remove the "old" attr from that block (neat, huh!)
-		 */
-		error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
-					   &bp);
+		error = xfs_attr_rmtval_remove(args);
 		if (error)
 			return error;
+	}
 
-		xfs_attr3_leaf_remove(bp, args);
+	/*
+	 * Read in the block containing the "old" attr, then remove the "old"
+	 * attr from that block (neat, huh!)
+	 */
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+				   &bp);
+	if (error)
+		return error;
 
-		/*
-		 * If the result is small enough, shrink it all into the inode.
-		 */
-		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-			error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
-			/* bp is gone due to xfs_da_shrink_inode */
-			if (error)
-				return error;
-			error = xfs_defer_finish(&args->trans);
-			if (error)
-				return error;
-		}
+	xfs_attr3_leaf_remove(bp, args);
 
-		/*
-		 * Commit the remove and start the next trans in series.
-		 */
-		error = xfs_trans_roll_inode(&args->trans, dp);
+	/*
+	 * If the result is small enough, shrink it all into the inode.
+	 */
+	forkoff = xfs_attr_shortform_allfit(bp, dp);
+	if (forkoff)
+		error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+		/* bp is gone due to xfs_da_shrink_inode */
+
+	return error;
+}
+
+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ */
+STATIC int
+xfs_attr_leaf_hasname(
+	struct xfs_da_args	*args,
+	struct xfs_buf		**bp)
+{
+	int			error = 0;
+
+	error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp);
+	if (error)
+		return error;
+
+	error = xfs_attr3_leaf_lookup_int(*bp, args);
+	if (error != -ENOATTR && error != -EEXIST)
+		xfs_trans_brelse(args->trans, *bp);
 
-	} else if (args->rmtblkno > 0) {
-		/*
-		 * Added a "remote" value, just clear the incomplete flag.
-		 */
-		error = xfs_attr3_leaf_clearflag(args);
-	}
 	return error;
 }
 
@@ -772,31 +801,25 @@ xfs_attr_leaf_removename(
 	 * Remove the attribute.
 	 */
 	dp = args->dp;
-	args->blkno = 0;
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
-	if (error)
-		return error;
 
-	error = xfs_attr3_leaf_lookup_int(bp, args);
+	error = xfs_attr_leaf_hasname(args, &bp);
+
 	if (error == -ENOATTR) {
 		xfs_trans_brelse(args->trans, bp);
 		return error;
-	}
+	} else if (error != -EEXIST)
+		return error;
 
 	xfs_attr3_leaf_remove(bp, args);
 
 	/*
 	 * If the result is small enough, shrink it all into the inode.
*/ - if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) + return xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - } + return 0; } @@ -816,21 +839,53 @@ xfs_attr_leaf_get(xfs_da_args_t *args) trace_xfs_attr_leaf_get(args); - args->blkno = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); - if (error) - return error; + error = xfs_attr_leaf_hasname(args, &bp); - error = xfs_attr3_leaf_lookup_int(bp, args); - if (error != -EEXIST) { + if (error == -ENOATTR) { xfs_trans_brelse(args->trans, bp); return error; - } + } else if (error != -EEXIST) + return error; + + error = xfs_attr3_leaf_getvalue(bp, args); xfs_trans_brelse(args->trans, bp); return error; } +/* + * Return EEXIST if attr is found, or ENOATTR if not + * statep: If not null is set to point at the found state. Caller will + * be responsible for freeing the state in this case. + */ +STATIC int +xfs_attr_node_hasname( + struct xfs_da_args *args, + struct xfs_da_state **statep) +{ + struct xfs_da_state *state; + int retval, error; + + state = xfs_da_state_alloc(args); + if (statep != NULL) + *statep = NULL; + + /* + * Search to see if name exists, and get back a pointer to it. + */ + error = xfs_da3_node_lookup_int(state, &retval); + if (error) { + xfs_da_state_free(state); + return error; + } + + if (statep != NULL) + *statep = state; + else + xfs_da_state_free(state); + return retval; +} + /*======================================================================== * External routines when attribute list size > geo->blksize *========================================================================*/ @@ -852,7 +907,6 @@ xfs_attr_node_addname( struct xfs_da_state *state; struct xfs_da_state_blk *blk; struct xfs_inode *dp; - struct xfs_mount *mp; int retval, error; trace_xfs_attr_node_addname(args); @@ -861,36 +915,28 @@ xfs_attr_node_addname( * Fill in bucket of arguments/results/context to carry around. */ dp = args->dp; - mp = dp->i_mount; restart: - state = xfs_da_state_alloc(); - state->args = args; - state->mp = mp; - /* * Search to see if name already exists, and get back a pointer * to where it should go. 
*/ - error = xfs_da3_node_lookup_int(state, &retval); - if (error) + retval = xfs_attr_node_hasname(args, &state); + if (retval != -ENOATTR && retval != -EEXIST) goto out; + blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) goto out; - } else if (retval == -EEXIST) { - if (args->flags & ATTR_CREATE) + if (retval == -EEXIST) { + if (args->attr_flags & XATTR_CREATE) goto out; trace_xfs_attr_node_replace(args); /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ - args->blkno2 = args->blkno; /* set 2nd entry info*/ - args->index2 = args->index; - args->rmtblkno2 = args->rmtblkno; - args->rmtblkcnt2 = args->rmtblkcnt; - args->rmtvaluelen2 = args->rmtvaluelen; + xfs_attr_save_rmt_blk(args); /* * clear the remote attr state now that it is saved so that the @@ -976,82 +1022,75 @@ restart: return error; } - /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. - */ - if (args->op_flags & XFS_DA_OP_RENAME) { + if (!(args->op_flags & XFS_DA_OP_RENAME)) { /* - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. + * Added a "remote" value, just clear the incomplete flag. */ - error = xfs_attr3_leaf_flipflags(args); - if (error) - goto out; + if (args->rmtblkno > 0) + error = xfs_attr3_leaf_clearflag(args); + retval = error; + goto out; + } - /* - * Dismantle the "old" attribute/value pair by removing - * a "remote" value (if it exists). - */ - args->index = args->index2; - args->blkno = args->blkno2; - args->rmtblkno = args->rmtblkno2; - args->rmtblkcnt = args->rmtblkcnt2; - args->rmtvaluelen = args->rmtvaluelen2; - if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(args); - if (error) - return error; - } + /* + * If this is an atomic rename operation, we must "flip" the incomplete + * flags on the "new" and "old" attribute/value pairs so that one + * disappears and one appears atomically. Then we must remove the "old" + * attribute/value pair. + * + * In a separate transaction, set the incomplete flag on the "old" attr + * and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr3_leaf_flipflags(args); + if (error) + goto out; + /* + * Commit the flag value change and start the next trans in series + */ + error = xfs_trans_roll_inode(&args->trans, args->dp); + if (error) + goto out; - /* - * Re-find the "old" attribute entry after any split ops. - * The INCOMPLETE flag means that we will find the "old" - * attr, not the "new" one. - */ - args->op_flags |= XFS_DA_OP_INCOMPLETE; - state = xfs_da_state_alloc(); - state->args = args; - state->mp = mp; - state->inleaf = 0; - error = xfs_da3_node_lookup_int(state, &retval); + /* + * Dismantle the "old" attribute/value pair by removing a "remote" value + * (if it exists). + */ + xfs_attr_restore_rmt_blk(args); + + if (args->rmtblkno) { + error = xfs_attr_rmtval_invalidate(args); if (error) - goto out; + return error; - /* - * Remove the name and update the hashvals in the tree. 
- */ - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr3_leaf_remove(blk->bp, args); - xfs_da3_fixhashpath(state, &state->path); + error = xfs_attr_rmtval_remove(args); + if (error) + return error; + } - /* - * Check to see if the tree needs to be collapsed. - */ - if (retval && (state->path.active > 1)) { - error = xfs_da3_join(state); - if (error) - goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; - } + /* + * Re-find the "old" attribute entry after any split ops. The INCOMPLETE + * flag means that we will find the "old" attr, not the "new" one. + */ + args->attr_filter |= XFS_ATTR_INCOMPLETE; + state = xfs_da_state_alloc(args); + state->inleaf = 0; + error = xfs_da3_node_lookup_int(state, &retval); + if (error) + goto out; - /* - * Commit and start the next trans in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[state->path.active-1]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + error = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); - } else if (args->rmtblkno > 0) { - /* - * Added a "remote" value, just clear the incomplete flag. - */ - error = xfs_attr3_leaf_clearflag(args); + /* + * Check to see if the tree needs to be collapsed. + */ + if (retval && (state->path.active > 1)) { + error = xfs_da3_join(state); if (error) goto out; } @@ -1066,6 +1105,114 @@ out: } /* + * Shrink an attribute from leaf to shortform + */ +STATIC int +xfs_attr_node_shrink( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + struct xfs_inode *dp = args->dp; + int error, forkoff; + struct xfs_buf *bp; + + /* + * Have to get rid of the copy of this dabuf in the state. + */ + ASSERT(state->path.active == 1); + ASSERT(state->path.blk[0].bp); + state->path.blk[0].bp = NULL; + + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; + + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) { + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + } else + xfs_trans_brelse(args->trans, bp); + + return error; +} + +/* + * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers + * for later deletion of the entry. + */ +STATIC int +xfs_attr_leaf_mark_incomplete( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + int error; + + /* + * Fill in disk block numbers in the state structure + * so that we can get the buffers back after we commit + * several transactions in the following calls. + */ + error = xfs_attr_fillstate(state); + if (error) + return error; + + /* + * Mark the attribute as INCOMPLETE + */ + return xfs_attr3_leaf_setflag(args); +} + +/* + * Initial setup for xfs_attr_node_removename. Make sure the attr is there and + * the blocks are valid. Attr keys with remote blocks will be marked + * incomplete. 
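+ *
+ * Returns 0 with *state set once the attr is confirmed present; otherwise
+ * the lookup result (e.g. -ENOATTR) is passed straight up. Either way the
+ * caller owns whatever came back in *state and must free it.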
+ */ +STATIC +int xfs_attr_node_removename_setup( + struct xfs_da_args *args, + struct xfs_da_state **state) +{ + int error; + + error = xfs_attr_node_hasname(args, state); + if (error != -EEXIST) + return error; + + ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); + ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == + XFS_ATTR_LEAF_MAGIC); + + if (args->rmtblkno > 0) { + error = xfs_attr_leaf_mark_incomplete(args, *state); + if (error) + return error; + + return xfs_attr_rmtval_invalidate(args); + } + + return 0; +} + +STATIC int +xfs_attr_node_remove_rmt( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + int error = 0; + + error = xfs_attr_rmtval_remove(args); + if (error) + return error; + + /* + * Refill the state structure with buffers, the prior calls released our + * buffers. + */ + return xfs_attr_refillstate(state); +} + +/* * Remove a name from a B-tree attribute list. * * This will involve walking down the Btree, and may involve joining @@ -1078,64 +1225,22 @@ xfs_attr_node_removename( { struct xfs_da_state *state; struct xfs_da_state_blk *blk; - struct xfs_inode *dp; - struct xfs_buf *bp; - int retval, error, forkoff; + int retval, error; + struct xfs_inode *dp = args->dp; trace_xfs_attr_node_removename(args); - /* - * Tie a string around our finger to remind us where we are. - */ - dp = args->dp; - state = xfs_da_state_alloc(); - state->args = args; - state->mp = dp->i_mount; - - /* - * Search to see if name exists, and get back a pointer to it. - */ - error = xfs_da3_node_lookup_int(state, &retval); - if (error || (retval != -EEXIST)) { - if (error == 0) - error = retval; + error = xfs_attr_node_removename_setup(args, &state); + if (error) goto out; - } /* * If there is an out-of-line value, de-allocate the blocks. * This is done before we remove the attribute so that we don't * overflow the maximum size of a transaction and/or hit a deadlock. */ - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->bp != NULL); - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); if (args->rmtblkno > 0) { - /* - * Fill in disk block numbers in the state structure - * so that we can get the buffers back after we commit - * several transactions in the following calls. - */ - error = xfs_attr_fillstate(state); - if (error) - goto out; - - /* - * Mark the attribute as INCOMPLETE, then bunmapi() the - * remote value. - */ - error = xfs_attr3_leaf_setflag(args); - if (error) - goto out; - error = xfs_attr_rmtval_remove(args); - if (error) - goto out; - - /* - * Refill the state structure with buffers, the prior calls - * released our buffers. - */ - error = xfs_attr_refillstate(state); + error = xfs_attr_node_remove_rmt(args, state); if (error) goto out; } @@ -1169,33 +1274,12 @@ xfs_attr_node_removename( /* * If the result is small enough, push it all into the inode. */ - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - /* - * Have to get rid of the copy of this dabuf in the state. 
- */ - ASSERT(state->path.active == 1); - ASSERT(state->path.blk[0].bp); - state->path.blk[0].bp = NULL; - - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - goto out; - - if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - if (error) - goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; - } else - xfs_trans_brelse(args->trans, bp); - } - error = 0; + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + error = xfs_attr_node_shrink(args, state); out: - xfs_da_state_free(state); + if (state) + xfs_da_state_free(state); return error; } @@ -1311,47 +1395,41 @@ xfs_attr_refillstate(xfs_da_state_t *state) * Returns 0 on successful retrieval, otherwise an error. */ STATIC int -xfs_attr_node_get(xfs_da_args_t *args) +xfs_attr_node_get( + struct xfs_da_args *args) { - xfs_da_state_t *state; - xfs_da_state_blk_t *blk; - int error, retval; - int i; + struct xfs_da_state *state; + struct xfs_da_state_blk *blk; + int i; + int error; trace_xfs_attr_node_get(args); - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; - /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_da3_node_lookup_int(state, &retval); - if (error) { - retval = error; - goto out_release; - } - if (retval != -EEXIST) + error = xfs_attr_node_hasname(args, &state); + if (error != -EEXIST) goto out_release; /* * Get the value, local or "remote" */ blk = &state->path.blk[state->path.active - 1]; - retval = xfs_attr3_leaf_getvalue(blk->bp, args); + error = xfs_attr3_leaf_getvalue(blk->bp, args); /* * If not in a transaction, we have to release all the buffers. */ out_release: - for (i = 0; i < state->path.active; i++) { + for (i = 0; state != NULL && i < state->path.active; i++) { xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } - xfs_da_state_free(state); - return retval; + if (state) + xfs_da_state_free(state); + return error; } /* Returns true if the attribute entry name is valid. */ diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 4243b2272642..3e97a935e712 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -21,39 +21,6 @@ struct xfs_attr_list_context; * as possible so as to fit into the literal area of the inode. 
*/ -/*======================================================================== - * External interfaces - *========================================================================*/ - - -#define ATTR_DONTFOLLOW 0x0001 /* -- ignored, from IRIX -- */ -#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ -#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ -#define ATTR_SECURE 0x0008 /* use attrs in security namespace */ -#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ -#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ - -#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ -#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ - -#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ -#define ATTR_ALLOC 0x8000 /* [kernel] allocate xattr buffer on demand */ - -#define ATTR_KERNEL_FLAGS \ - (ATTR_KERNOTIME | ATTR_KERNOVAL | ATTR_INCOMPLETE | ATTR_ALLOC) - -#define XFS_ATTR_FLAGS \ - { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ - { ATTR_ROOT, "ROOT" }, \ - { ATTR_TRUST, "TRUST" }, \ - { ATTR_SECURE, "SECURE" }, \ - { ATTR_CREATE, "CREATE" }, \ - { ATTR_REPLACE, "REPLACE" }, \ - { ATTR_KERNOTIME, "KERNOTIME" }, \ - { ATTR_KERNOVAL, "KERNOVAL" }, \ - { ATTR_INCOMPLETE, "INCOMPLETE" }, \ - { ATTR_ALLOC, "ALLOC" } - /* * The maximum size (into the kernel or returned from the kernel) of an * attribute value or the buffer used for an attr_list() call. Larger @@ -62,45 +29,16 @@ struct xfs_attr_list_context; #define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ /* - * Define how lists of attribute names are returned to the user from - * the attr_list() call. A large, 32bit aligned, buffer is passed in - * along with its size. We put an array of offsets at the top that each - * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom. - */ -typedef struct attrlist { - __s32 al_count; /* number of entries in attrlist */ - __s32 al_more; /* T/F: more attrs (do call again) */ - __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ -} attrlist_t; - -/* - * Show the interesting info about one attribute. This is what the - * al_offset[i] entry points to. - */ -typedef struct attrlist_ent { /* data from attr_list() */ - __u32 a_valuelen; /* number bytes in value of attr */ - char a_name[1]; /* attr name (NULL terminated) */ -} attrlist_ent_t; - -/* - * Given a pointer to the (char*) buffer containing the attr_list() result, - * and an index, return a pointer to the indicated attribute in the buffer. - */ -#define ATTR_ENTRY(buffer, index) \ - ((attrlist_ent_t *) \ - &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) - -/* * Kernel-internal version of the attrlist cursor. 
*/ -typedef struct attrlist_cursor_kern { +struct xfs_attrlist_cursor_kern { __u32 hashval; /* hash value of next entry to add */ __u32 blkno; /* block containing entry (suggestion) */ __u32 offset; /* offset in list of equal-hashvals */ __u16 pad1; /* padding to match user-level */ __u8 pad2; /* padding to match user-level */ __u8 initted; /* T/F: cursor has been initialized */ -} attrlist_cursor_kern_t; +}; /*======================================================================== @@ -112,27 +50,28 @@ typedef struct attrlist_cursor_kern { typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int); -typedef struct xfs_attr_list_context { - struct xfs_trans *tp; - struct xfs_inode *dp; /* inode */ - struct attrlist_cursor_kern *cursor; /* position in list */ - char *alist; /* output buffer */ +struct xfs_attr_list_context { + struct xfs_trans *tp; + struct xfs_inode *dp; /* inode */ + struct xfs_attrlist_cursor_kern cursor; /* position in list */ + void *buffer; /* output buffer */ /* * Abort attribute list iteration if non-zero. Can be used to pass * error values to the xfs_attr_list caller. */ - int seen_enough; + int seen_enough; + bool allow_incomplete; - ssize_t count; /* num used entries */ - int dupcnt; /* count dup hashvals seen */ - int bufsize; /* total buffer size */ - int firstu; /* first used byte in buffer */ - int flags; /* from VOP call */ - int resynch; /* T/F: resynch with cursor */ - put_listent_func_t put_listent; /* list output fmt function */ - int index; /* index into output buffer */ -} xfs_attr_list_context_t; + ssize_t count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE} */ + int resynch; /* T/F: resynch with cursor */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ +}; /*======================================================================== @@ -143,21 +82,15 @@ typedef struct xfs_attr_list_context { * Overall external interface routines. 
*/ int xfs_attr_inactive(struct xfs_inode *dp); -int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *); -int xfs_attr_list_int(struct xfs_attr_list_context *); +int xfs_attr_list_ilocked(struct xfs_attr_list_context *); +int xfs_attr_list(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); -int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); -int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, - size_t namelen, unsigned char **value, int *valuelenp, - int flags); -int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, - size_t namelen, unsigned char *value, int valuelen, int flags); +int xfs_attr_get_ilocked(struct xfs_da_args *args); +int xfs_attr_get(struct xfs_da_args *args); +int xfs_attr_set(struct xfs_da_args *args); int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, - size_t namelen, int flags); +int xfs_has_attr(struct xfs_da_args *args); int xfs_attr_remove_args(struct xfs_da_args *args); -int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, - int flags, struct attrlist_cursor_kern *cursor); bool xfs_attr_namecheck(const void *name, size_t length); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index fed537a4353d..8623c815164a 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -309,14 +309,6 @@ xfs_attr3_leaf_verify( return fa; /* - * In recovery there is a transient state where count == 0 is valid - * because we may have transitioned an empty shortform attr to a leaf - * if the attr didn't fit in shortform. - */ - if (!xfs_log_in_recovery(mp) && ichdr.count == 0) - return __this_address; - - /* * firstused is the block offset of the first name info structure. * Make sure it doesn't go off the block or crash into the header. */ @@ -331,6 +323,13 @@ xfs_attr3_leaf_verify( (char *)bp->b_addr + ichdr.firstused) return __this_address; + /* + * NOTE: This verifier historically failed empty leaf buffers because + * we expect the fork to be in another format. Empty attr fork format + * conversions are possible during xattr set, however, and format + * conversion is not atomic with the xattr set that triggers it. We + * cannot assume leaf blocks are non-empty until that is addressed. + */ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; for (i = 0, ent = entries; i < ichdr.count; ent++, i++) { fa = xfs_attr3_leaf_verify_entry(mp, buf_end, leaf, &ichdr, @@ -445,14 +444,25 @@ xfs_attr3_leaf_read( * Namespace helper routines *========================================================================*/ -/* - * If namespace bits don't match return 0. - * If all match then return 1. - */ -STATIC int -xfs_attr_namesp_match(int arg_flags, int ondisk_flags) +static bool +xfs_attr_match( + struct xfs_da_args *args, + uint8_t namelen, + unsigned char *name, + int flags) { - return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); + if (args->namelen != namelen) + return false; + if (memcmp(args->name, name, namelen) != 0) + return false; + /* + * If we are looking for incomplete entries, show only those, else only + * show complete entries. 
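+ *
+ * The filter has to equal the on-disk namespace bits exactly: a lookup
+ * with args->attr_filter == XFS_ATTR_ROOT sees only trusted-namespace
+ * entries and can never match a user-namespace attr of the same name.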
+ */ + if (args->attr_filter != + (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE))) + return false; + return true; } static int @@ -464,7 +474,7 @@ xfs_attr_copy_value( /* * No copy if all we have to do is get the length */ - if (args->flags & ATTR_KERNOVAL) { + if (!args->valuelen) { args->valuelen = valuelen; return 0; } @@ -477,8 +487,8 @@ xfs_attr_copy_value( return -ERANGE; } - if (args->op_flags & XFS_DA_OP_ALLOCVAL) { - args->value = kmem_alloc_large(valuelen, 0); + if (!args->value) { + args->value = kmem_alloc_large(valuelen, KM_NOLOCKDEP); if (!args->value) return -ENOMEM; } @@ -526,9 +536,9 @@ xfs_attr_shortform_bytesfit( int offset; /* rounded down */ - offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; + offset = (XFS_LITINO(mp) - bytes) >> 3; - if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) { + if (dp->i_df.if_format == XFS_DINODE_FMT_DEV) { minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; return (offset >= minforkoff) ? minforkoff : 0; } @@ -556,7 +566,7 @@ xfs_attr_shortform_bytesfit( dsize = dp->i_df.if_bytes; - switch (dp->i_d.di_format) { + switch (dp->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: /* * If there is no attr fork and the data fork is extents, @@ -593,8 +603,7 @@ xfs_attr_shortform_bytesfit( minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) @@ -626,22 +635,19 @@ xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) * Create the initial contents of a shortform attribute list. */ void -xfs_attr_shortform_create(xfs_da_args_t *args) +xfs_attr_shortform_create( + struct xfs_da_args *args) { - xfs_attr_sf_hdr_t *hdr; - xfs_inode_t *dp; - struct xfs_ifork *ifp; + struct xfs_inode *dp = args->dp; + struct xfs_ifork *ifp = dp->i_afp; + struct xfs_attr_sf_hdr *hdr; trace_xfs_attr_sf_create(args); - dp = args->dp; - ASSERT(dp != NULL); - ifp = dp->i_afp; - ASSERT(ifp != NULL); ASSERT(ifp->if_bytes == 0); - if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) { + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) { ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */ - dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL; + ifp->if_format = XFS_DINODE_FMT_LOCAL; ifp->if_flags |= XFS_IFINLINE; } else { ASSERT(ifp->if_flags & XFS_IFINLINE); @@ -654,18 +660,65 @@ xfs_attr_shortform_create(xfs_da_args_t *args) } /* + * Return -EEXIST if attr is found, or -ENOATTR if not + * args: args containing attribute name and namelen + * sfep: If not null, pointer will be set to the last attr entry found on + -EEXIST. On -ENOATTR pointer is left at the last entry in the list + * basep: If not null, pointer is set to the byte offset of the entry in the + * list on -EEXIST. 
+ *	  On -ENOATTR, pointer is left at the byte offset of
+ *	  the last entry in the list
+ */
+int
+xfs_attr_sf_findname(
+	struct xfs_da_args	 *args,
+	struct xfs_attr_sf_entry **sfep,
+	unsigned int		 *basep)
+{
+	struct xfs_attr_shortform *sf;
+	struct xfs_attr_sf_entry *sfe;
+	unsigned int base = sizeof(struct xfs_attr_sf_hdr);
+	int size = 0;
+	int end;
+	int i;
+
+	sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
+	sfe = &sf->list[0];
+	end = sf->hdr.count;
+	for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
+			base += size, i++) {
+		size = XFS_ATTR_SF_ENTSIZE(sfe);
+		if (!xfs_attr_match(args, sfe->namelen, sfe->nameval,
+				    sfe->flags))
+			continue;
+		break;
+	}
+
+	if (sfep != NULL)
+		*sfep = sfe;
+
+	if (basep != NULL)
+		*basep = base;
+
+	if (i == end)
+		return -ENOATTR;
+	return -EEXIST;
+}
+
+/*
 * Add a name/value pair to the shortform attribute list.
 * Overflow from the inode has already been checked for.
 */
 void
-xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
+xfs_attr_shortform_add(
+	struct xfs_da_args		*args,
+	int				forkoff)
 {
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
-	int i, offset, size;
-	xfs_mount_t *mp;
-	xfs_inode_t *dp;
-	struct xfs_ifork *ifp;
+	struct xfs_attr_shortform	*sf;
+	struct xfs_attr_sf_entry	*sfe;
+	int				offset, size;
+	struct xfs_mount		*mp;
+	struct xfs_inode		*dp;
+	struct xfs_ifork		*ifp;
 
 	trace_xfs_attr_sf_add(args);
 
@@ -676,18 +729,8 @@
 	ifp = dp->i_afp;
 	ASSERT(ifp->if_flags & XFS_IFINLINE);
 	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
-	sfe = &sf->list[0];
-	for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
-#ifdef DEBUG
-		if (sfe->namelen != args->namelen)
-			continue;
-		if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
-			continue;
-		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
-			continue;
+	if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
 		ASSERT(0);
-#endif
-	}
 
 	offset = (char *)sfe - (char *)sf;
 	size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
@@ -697,7 +740,7 @@
 	sfe->namelen = args->namelen;
 	sfe->valuelen = args->valuelen;
-	sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+	sfe->flags = args->attr_filter;
 	memcpy(sfe->nameval, args->name, args->namelen);
 	memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
 	sf->hdr.count++;
@@ -716,13 +759,12 @@
 	struct xfs_inode *ip,
 	struct xfs_trans *tp)
 {
-	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-	ip->i_d.di_forkoff = 0;
-	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
-
-	ASSERT(ip->i_d.di_anextents == 0);
-	ASSERT(ip->i_afp == NULL);
+	ASSERT(ip->i_afp->if_nextents == 0);
 
+	xfs_idestroy_fork(ip->i_afp);
+	kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+	ip->i_afp = NULL;
+	ip->i_d.di_forkoff = 0;
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }
 
@@ -730,35 +772,27 @@
 * Remove an attribute from the shortform attribute list structure.
*/ int -xfs_attr_shortform_remove(xfs_da_args_t *args) +xfs_attr_shortform_remove( + struct xfs_da_args *args) { - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; - int base, size=0, end, totsize, i; - xfs_mount_t *mp; - xfs_inode_t *dp; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + int size = 0, end, totsize; + unsigned int base; + struct xfs_mount *mp; + struct xfs_inode *dp; + int error; trace_xfs_attr_sf_remove(args); dp = args->dp; mp = dp->i_mount; - base = sizeof(xfs_attr_sf_hdr_t); sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; - sfe = &sf->list[0]; - end = sf->hdr.count; - for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), - base += size, i++) { - size = XFS_ATTR_SF_ENTSIZE(sfe); - if (sfe->namelen != args->namelen) - continue; - if (memcmp(sfe->nameval, args->name, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - break; - } - if (i == end) - return -ENOATTR; + + error = xfs_attr_sf_findname(args, &sfe, &base); + if (error != -EEXIST) + return error; + size = XFS_ATTR_SF_ENTSIZE(sfe); /* * Fix up the attribute fork data, covering the hole @@ -776,7 +810,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && (mp->m_flags & XFS_MOUNT_ATTR2) && - (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & XFS_DA_OP_ADDNAME)) { xfs_attr_fork_remove(dp, args->trans); } else { @@ -786,7 +820,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || (args->op_flags & XFS_DA_OP_ADDNAME) || !(mp->m_flags & XFS_MOUNT_ATTR2) || - dp->i_d.di_format == XFS_DINODE_FMT_BTREE); + dp->i_df.if_format == XFS_DINODE_FMT_BTREE); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } @@ -816,13 +850,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - return -EEXIST; + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return -EEXIST; } return -ENOATTR; } @@ -830,9 +860,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) /* * Retrieve the attribute value and length. * - * If ATTR_KERNOVAL is specified, only the length needs to be returned. - * Unlike a lookup, we only return an error if the attribute does not - * exist or we can't retrieve the value. + * If args->valuelen is zero, only the length needs to be returned. Unlike a + * lookup, we only return an error if the attribute does not exist or we can't + * retrieve the value. 
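+ *
+ * So a caller can probe the size by passing args->valuelen of zero;
+ * xfs_attr_copy_value() then reports the real length back through
+ * args->valuelen without touching args->value.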
*/ int xfs_attr_shortform_getvalue( @@ -847,14 +877,10 @@ xfs_attr_shortform_getvalue( sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], - sfe->valuelen); + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return xfs_attr_copy_value(args, + &sfe->nameval[args->namelen], sfe->valuelen); } return -ENOATTR; } @@ -918,7 +944,7 @@ xfs_attr_shortform_to_leaf( nargs.valuelen = sfe->valuelen; nargs.hashval = xfs_da_hashname(sfe->nameval, sfe->namelen); - nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); + nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK; error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == -ENOATTR); error = xfs_attr3_leaf_add(bp, &nargs); @@ -971,7 +997,7 @@ xfs_attr_shortform_allfit( + be16_to_cpu(name_loc->valuelen); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && - (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; return xfs_attr_shortform_bytesfit(dp, bytes); @@ -990,7 +1016,7 @@ xfs_attr_shortform_verify( int i; int64_t size; - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL); + ASSERT(ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL); ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = ifp->if_bytes; @@ -1094,7 +1120,7 @@ xfs_attr3_leaf_to_shortform( if (forkoff == -1) { ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); - ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); + ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); xfs_attr_fork_remove(dp, args->trans); goto out; } @@ -1124,7 +1150,7 @@ xfs_attr3_leaf_to_shortform( nargs.value = &name_loc->nameval[nargs.namelen]; nargs.valuelen = be16_to_cpu(name_loc->valuelen); nargs.hashval = be32_to_cpu(entry->hashval); - nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); + nargs.attr_filter = entry->flags & XFS_ATTR_NSP_ONDISK_MASK; xfs_attr_shortform_add(&nargs, forkoff); } error = 0; @@ -1449,8 +1475,9 @@ xfs_attr3_leaf_add_work( entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + ichdr->freemap[mapindex].size); entry->hashval = cpu_to_be32(args->hashval); - entry->flags = tmp ? XFS_ATTR_LOCAL : 0; - entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + entry->flags = args->attr_filter; + if (tmp) + entry->flags |= XFS_ATTR_LOCAL; if (args->op_flags & XFS_DA_OP_RENAME) { entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && @@ -2346,7 +2373,7 @@ xfs_attr3_leaf_lookup_int( xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); if (ichdr.count >= args->geo->blksize / 8) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -2365,11 +2392,11 @@ xfs_attr3_leaf_lookup_int( break; } if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -2399,33 +2426,17 @@ xfs_attr3_leaf_lookup_int( /* * GROT: Add code to remove incomplete entries. 
*/ - /* - * If we are looking for INCOMPLETE entries, show only those. - * If we are looking for complete entries, show only those. - */ - if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) != - !!(entry->flags & XFS_ATTR_INCOMPLETE)) { - continue; - } if (entry->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf, probe); - if (name_loc->namelen != args->namelen) - continue; - if (memcmp(args->name, name_loc->nameval, - args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, entry->flags)) + if (!xfs_attr_match(args, name_loc->namelen, + name_loc->nameval, entry->flags)) continue; args->index = probe; return -EEXIST; } else { name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); - if (name_rmt->namelen != args->namelen) - continue; - if (memcmp(args->name, name_rmt->name, - args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, entry->flags)) + if (!xfs_attr_match(args, name_rmt->namelen, + name_rmt->name, entry->flags)) continue; args->index = probe; args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); @@ -2444,9 +2455,9 @@ xfs_attr3_leaf_lookup_int( * Get the value associated with an attribute name from a leaf attribute * list structure. * - * If ATTR_KERNOVAL is specified, only the length needs to be returned. - * Unlike a lookup, we only return an error if the attribute does not - * exist or we can't retrieve the value. + * If args->valuelen is zero, only the length needs to be returned. Unlike a + * lookup, we only return an error if the attribute does not exist or we can't + * retrieve the value. */ int xfs_attr3_leaf_getvalue( @@ -2771,10 +2782,7 @@ xfs_attr3_leaf_clearflag( XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - /* - * Commit the flag value change and start the next trans in series. - */ - return xfs_trans_roll_inode(&args->trans, args->dp); + return 0; } /* @@ -2822,10 +2830,7 @@ xfs_attr3_leaf_setflag( XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - /* - * Commit the flag value change and start the next trans in series. - */ - return xfs_trans_roll_inode(&args->trans, args->dp); + return 0; } /* @@ -2940,10 +2945,5 @@ xfs_attr3_leaf_flipflags( XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt))); } - /* - * Commit the flag value change and start the next trans in series. - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - - return error; + return 0; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 73615b1dd1a8..9b1c59f40a26 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. 
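A pattern change worth calling out from the xfs_attr_leaf.c hunks just above: xfs_attr3_leaf_clearflag(), xfs_attr3_leaf_setflag() and xfs_attr3_leaf_flipflags() no longer roll the transaction themselves; they only log the change and return 0. A minimal sketch of the calling convention the xfs_attr.c hunks now follow (the wrapper name is hypothetical; the two calls are the ones in this patch):

static int flip_and_roll(struct xfs_da_args *args)
{
	int error;

	/* log the INCOMPLETE flag swap; no transaction roll inside */
	error = xfs_attr3_leaf_flipflags(args);
	if (error)
		return error;

	/* the caller commits the flag change and starts the next trans */
	return xfs_trans_roll_inode(&args->trans, args->dp);
}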
@@ -8,7 +8,6 @@ #define __XFS_ATTR_LEAF_H__ struct attrlist; -struct attrlist_cursor_kern; struct xfs_attr_list_context; struct xfs_da_args; struct xfs_da_state; @@ -53,6 +52,9 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, struct xfs_buf **leaf_bp); int xfs_attr_shortform_remove(struct xfs_da_args *args); +int xfs_attr_sf_findname(struct xfs_da_args *args, + struct xfs_attr_sf_entry **sfep, + unsigned int *basep); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 8b7f74b3bea2..3f80cede7406 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -397,7 +397,7 @@ xfs_attr_rmtval_get( trace_xfs_attr_rmtval_get(args); - ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->valuelen != 0); ASSERT(args->rmtvaluelen == args->valuelen); valuelen = args->rmtvaluelen; @@ -440,32 +440,23 @@ xfs_attr_rmtval_get( } /* - * Write the value associated with an attribute into the out-of-line buffer - * that we have defined for it. + * Find a "hole" in the attribute address space large enough for us to drop the + * new attribute's value into */ -int -xfs_attr_rmtval_set( +STATIC int +xfs_attr_rmt_find_hole( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - struct xfs_bmbt_irec map; - xfs_dablk_t lblkno; - xfs_fileoff_t lfileoff = 0; - uint8_t *src = args->value; - int blkcnt; - int valuelen; - int nmap; int error; - int offset = 0; - - trace_xfs_attr_rmtval_set(args); + int blkcnt; + xfs_fileoff_t lfileoff = 0; /* - * Find a "hole" in the attribute address space large enough for - * us to drop the new attribute's value into. Because CRC enable - * attributes have headers, we can't just do a straight byte to FSB - * conversion and have to take the header space into account. + * Because CRC enable attributes have headers, we can't just do a + * straight byte to FSB conversion and have to take the header space + * into account. */ blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, @@ -473,48 +464,26 @@ xfs_attr_rmtval_set( if (error) return error; - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkno = (xfs_dablk_t)lfileoff; args->rmtblkcnt = blkcnt; - /* - * Roll through the "value", allocating blocks on disk as required. - */ - while (blkcnt > 0) { - /* - * Allocate a single extent, up to the size of the value. - * - * Note that we have to consider this a data allocation as we - * write the remote attribute without logging the contents. - * Hence we must ensure that we aren't using blocks that are on - * the busy list so that we don't overwrite blocks which have - * recently been freed but their transactions are not yet - * committed to disk. If we overwrite the contents of a busy - * extent and then crash then the block may not contain the - * correct metadata after log recovery occurs. 
- */ - nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, - &nmap); - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - lblkno += map.br_blockcount; - blkcnt -= map.br_blockcount; + return 0; +} - /* - * Start the next trans in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; - } +STATIC int +xfs_attr_rmtval_set_value( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + uint8_t *src = args->value; + int blkcnt; + int valuelen; + int nmap; + int error; + int offset = 0; /* * Roll through the "value", copying the attribute value to the @@ -595,19 +564,82 @@ xfs_attr_rmtval_stale( } /* + * Write the value associated with an attribute into the out-of-line buffer + * that we have defined for it. + */ +int +xfs_attr_rmtval_set( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + int blkcnt; + int nmap; + int error; + + trace_xfs_attr_rmtval_set(args); + + error = xfs_attr_rmt_find_hole(args); + if (error) + return error; + + blkcnt = args->rmtblkcnt; + lblkno = (xfs_dablk_t)args->rmtblkno; + /* + * Roll through the "value", allocating blocks on disk as required. + */ + while (blkcnt > 0) { + /* + * Allocate a single extent, up to the size of the value. + * + * Note that we have to consider this a data allocation as we + * write the remote attribute without logging the contents. + * Hence we must ensure that we aren't using blocks that are on + * the busy list so that we don't overwrite blocks which have + * recently been freed but their transactions are not yet + * committed to disk. If we overwrite the contents of a busy + * extent and then crash then the block may not contain the + * correct metadata after log recovery occurs. + */ + nmap = 1; + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, + &nmap); + if (error) + return error; + error = xfs_defer_finish(&args->trans); + if (error) + return error; + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + + /* + * Start the next trans in the chain. + */ + error = xfs_trans_roll_inode(&args->trans, dp); + if (error) + return error; + } + + return xfs_attr_rmtval_set_value(args); +} + +/* * Remove the value associated with an attribute by deleting the * out-of-line buffer that it is stored on. */ int -xfs_attr_rmtval_remove( +xfs_attr_rmtval_invalidate( struct xfs_da_args *args) { xfs_dablk_t lblkno; int blkcnt; int error; - int done; - - trace_xfs_attr_rmtval_remove(args); /* * Roll through the "value", invalidating the attribute value's blocks. @@ -635,21 +667,29 @@ xfs_attr_rmtval_remove( lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; } + return 0; +} + +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. + */ +int +xfs_attr_rmtval_remove( + struct xfs_da_args *args) +{ + int error; + int retval; + + trace_xfs_attr_rmtval_remove(args); /* * Keep de-allocating extents until the remote-value region is gone. 
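 *
 * Each pass of the loop below unmaps at most one chunk of the region:
 * __xfs_attr_rmtval_remove() returns -EAGAIN until xfs_bunmapi() reports
 * that it is done, and the transaction is rolled between passes so each
 * one stays small.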
*/ - lblkno = args->rmtblkno; - blkcnt = args->rmtblkcnt; - done = 0; - while (!done) { - error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, - XFS_BMAPI_ATTRFORK, 1, &done); - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; + do { + retval = __xfs_attr_rmtval_remove(args); + if (retval && retval != -EAGAIN) + return retval; /* * Close out trans and start the next one in the chain. @@ -657,6 +697,36 @@ xfs_attr_rmtval_remove( error = xfs_trans_roll_inode(&args->trans, args->dp); if (error) return error; - } + } while (retval == -EAGAIN); + return 0; } + +/* + * Remove the value associated with an attribute by deleting the out-of-line + * buffer that it is stored on. Returns EAGAIN for the caller to refresh the + * transaction and re-call the function + */ +int +__xfs_attr_rmtval_remove( + struct xfs_da_args *args) +{ + int error, done; + + /* + * Unmap value blocks for this attr. + */ + error = xfs_bunmapi(args->trans, args->dp, args->rmtblkno, + args->rmtblkcnt, XFS_BMAPI_ATTRFORK, 1, &done); + if (error) + return error; + + error = xfs_defer_finish(&args->trans); + if (error) + return error; + + if (!done) + return -EAGAIN; + + return error; +} diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 6fb4572845ce..9eee615da156 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. @@ -13,5 +13,6 @@ int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); - +int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); +int __xfs_attr_rmtval_remove(struct xfs_da_args *args); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index aafa4fe70624..bb004fb7944a 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h index 99017b8df292..a04f266ae644 100644 --- a/fs/xfs/libxfs/xfs_bit.h +++ b/fs/xfs/libxfs/xfs_bit.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9a6d7a84689a..9c40d5971035 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -61,10 +61,10 @@ xfs_bmap_compute_maxlevels( int sz; /* root block size */ /* - * The maximum number of extents in a file, hence the maximum - * number of leaf entries, is controlled by the type of di_nextents - * (a signed 32-bit number, xfs_extnum_t), or by di_anextents - * (a signed 16-bit number, xfs_aextnum_t). + * The maximum number of extents in a file, hence the maximum number of + * leaf entries, is controlled by the size of the on-disk extent count, + * either a signed 32-bit number for the data fork, or a signed 16-bit + * number for the attr fork. 
* * Note that we can no longer assume that if we are in ATTR1 that * the fork offset of all the inodes will be @@ -120,10 +120,11 @@ xfs_bmbt_lookup_first( */ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + return whichfork != XFS_COW_FORK && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > - XFS_IFORK_MAXEXT(ip, whichfork); + ifp->if_format == XFS_DINODE_FMT_EXTENTS && + ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork); } /* @@ -131,10 +132,11 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) */ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + return whichfork != XFS_COW_FORK && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork); + ifp->if_format == XFS_DINODE_FMT_BTREE && + ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork); } /* @@ -193,14 +195,12 @@ xfs_default_attroffset( struct xfs_mount *mp = ip->i_mount; uint offset; - if (mp->m_sb.sb_inodesize == 256) { - offset = XFS_LITINO(mp, ip->i_d.di_version) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); - } else { + if (mp->m_sb.sb_inodesize == 256) + offset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + else offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); - } - ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version)); + ASSERT(offset < XFS_LITINO(mp)); return offset; } @@ -215,8 +215,8 @@ xfs_bmap_forkoff_reset( int whichfork) { if (whichfork == XFS_ATTR_FORK && - ip->i_d.di_format != XFS_DINODE_FMT_DEV && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { + ip->i_df.if_format != XFS_DINODE_FMT_DEV && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE) { uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; if (dfl_forkoff > ip->i_d.di_forkoff) @@ -317,31 +317,28 @@ xfs_bmap_check_leaf_extents( xfs_inode_t *ip, /* incore inode pointer */ int whichfork) /* data or attr fork */ { + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_block *block; /* current btree block */ xfs_fsblock_t bno; /* block # of "block" */ xfs_buf_t *bp; /* buffer for "block" */ int error; /* error return value */ xfs_extnum_t i=0, j; /* index into the extents list */ - struct xfs_ifork *ifp; /* fork structure */ int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ __be64 *pp; /* pointer to block address */ xfs_bmbt_rec_t *ep; /* pointer to current extent */ xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ xfs_bmbt_rec_t *nextp; /* pointer to next extent */ int bp_release = 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + if (ifp->if_format != XFS_DINODE_FMT_BTREE) return; - } /* skip large extent count inodes */ - if (ip->i_d.di_nextents > 10000) + if (ip->i_df.if_nextents > 10000) return; bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); block = ifp->if_broot; /* * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. 
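One refactoring pattern repeats through all of these xfs_bmap.c hunks: instead of going through the XFS_IFORK_FORMAT()/XFS_IFORK_NEXTENTS() macros for every test, callers now look the fork up once and read if_format and if_nextents directly. A sketch of the idiom, mirroring xfs_bmap_needs_btree() above (the helper name is illustrative, not part of the patch):

static bool fork_overflows_extents(struct xfs_inode *ip, int whichfork)
{
	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);

	/* one fork lookup, then plain field accesses */
	return ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	       ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork);
}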
@@ -556,7 +553,8 @@ __xfs_bmap_add_free( #endif ASSERT(xfs_bmap_free_item_zone != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); + new = kmem_cache_alloc(xfs_bmap_free_item_zone, + GFP_KERNEL | __GFP_NOFAIL); new->xefi_startblock = bno; new->xefi_blockcount = (xfs_extlen_t)len; if (oinfo) @@ -606,7 +604,7 @@ xfs_bmap_btree_to_extents( ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); @@ -634,7 +632,7 @@ xfs_bmap_btree_to_extents( xfs_iroot_realloc(ip, -1, whichfork); ASSERT(ifp->if_broot == NULL); ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; } @@ -670,7 +668,7 @@ xfs_bmap_extents_to_btree( mp = ip->i_mount; ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); + ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS); /* * Make space in the inode incore. This needs to be undone if we fail @@ -690,11 +688,11 @@ xfs_bmap_extents_to_btree( * Need a cursor. Can't allocate until bb_level is filled in. */ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; /* * Convert to a btree with two levels, one record in root. */ - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + ifp->if_format = XFS_DINODE_FMT_BTREE; memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; @@ -727,7 +725,7 @@ xfs_bmap_extents_to_btree( ASSERT(tp->t_firstblock == NULLFSBLOCK || args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock)); tp->t_firstblock = args.fsbno; - cur->bc_private.b.allocated++; + cur->bc_ino.allocated++; ip->i_d.di_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, @@ -752,7 +750,7 @@ xfs_bmap_extents_to_btree( xfs_bmbt_disk_set_all(arp, &rec); cnt++; } - ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); + ASSERT(cnt == ifp->if_nextents); xfs_btree_set_numrecs(ablock, cnt); /* @@ -780,7 +778,7 @@ out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); out_root_realloc: xfs_iroot_realloc(ip, -1, whichfork); - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -802,16 +800,16 @@ xfs_bmap_local_to_extents_empty( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(whichfork != XFS_COW_FORK); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(ifp->if_bytes == 0); - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + ASSERT(ifp->if_nextents == 0); xfs_bmap_forkoff_reset(ip, whichfork); ifp->if_flags &= ~XFS_IFINLINE; ifp->if_flags |= XFS_IFEXTENTS; ifp->if_u1.if_root = NULL; ifp->if_height = 0; - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -842,7 +840,7 @@ xfs_bmap_local_to_extents( */ ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && 
whichfork == XFS_DATA_FORK)); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); if (!ifp->if_bytes) { xfs_bmap_local_to_extents_empty(tp, ip, whichfork); @@ -909,7 +907,7 @@ xfs_bmap_local_to_extents( xfs_iext_first(ifp, &icur); xfs_iext_insert(ip, &icur, &rec, 0); - XFS_IFORK_NEXT_SET(ip, whichfork, 1); + ifp->if_nextents = 1; ip->i_d.di_nblocks = 1; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); @@ -953,7 +951,7 @@ xfs_bmap_add_attrfork_btree( xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return -ENOSPC; } - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); } return 0; @@ -974,13 +972,14 @@ xfs_bmap_add_attrfork_extents( xfs_btree_cur_t *cur; /* bmap btree cursor */ int error; /* error return value */ - if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip)) + if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <= + XFS_IFORK_DSIZE(ip)) return 0; cur = NULL; error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags, XFS_DATA_FORK); if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -1035,7 +1034,7 @@ xfs_bmap_set_attrforkoff( int size, int *version) { - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_DEV: ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; break; @@ -1093,17 +1092,6 @@ xfs_bmap_add_attrfork( goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; - if (XFS_IS_CORRUPT(mp, ip->i_d.di_anextents != 0)) { - error = -EFSCORRUPTED; - goto trans_cancel; - } - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { - /* - * For inodes coming from pre-6.2 filesystems. - */ - ASSERT(ip->i_d.di_aformat == 0); - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - } xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1111,10 +1099,14 @@ xfs_bmap_add_attrfork( if (error) goto trans_cancel; ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0); + + ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone, + GFP_KERNEL | __GFP_NOFAIL); + + ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS; ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_LOCAL: error = xfs_bmap_add_attrfork_local(tp, ip, &logflags); break; @@ -1178,20 +1170,20 @@ xfs_iread_bmbt_block( { struct xfs_iread_state *ir = priv; struct xfs_mount *mp = cur->bc_mp; - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_btree_block *block; struct xfs_buf *bp; struct xfs_bmbt_rec *frp; xfs_extnum_t num_recs; xfs_extnum_t j; - int whichfork = cur->bc_private.b.whichfork; + int whichfork = cur->bc_ino.whichfork; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); block = xfs_btree_get_block(cur, level, &bp); /* Abort if we find more records than nextents. 
*/ num_recs = xfs_btree_get_numrecs(block); - if (unlikely(ir->loaded + num_recs > - XFS_IFORK_NEXTENTS(ip, whichfork))) { + if (unlikely(ir->loaded + num_recs > ifp->if_nextents)) { xfs_warn(ip->i_mount, "corrupt dinode %llu, (btree extents).", (unsigned long long)ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block, @@ -1217,7 +1209,7 @@ xfs_iread_bmbt_block( xfs_bmap_fork_to_state(whichfork)); trace_xfs_read_extent(ip, &ir->icur, xfs_bmap_fork_to_state(whichfork), _THIS_IP_); - xfs_iext_next(XFS_IFORK_PTR(ip, whichfork), &ir->icur); + xfs_iext_next(ifp, &ir->icur); } return 0; @@ -1240,9 +1232,7 @@ xfs_iread_extents( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (XFS_IS_CORRUPT(mp, - XFS_IFORK_FORMAT(ip, whichfork) != - XFS_DINODE_FMT_BTREE)) { + if (XFS_IS_CORRUPT(mp, ifp->if_format != XFS_DINODE_FMT_BTREE)) { error = -EFSCORRUPTED; goto out; } @@ -1256,8 +1246,7 @@ xfs_iread_extents( if (error) goto out; - if (XFS_IS_CORRUPT(mp, - ir.loaded != XFS_IFORK_NEXTENTS(ip, whichfork))) { + if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) { error = -EFSCORRUPTED; goto out; } @@ -1291,14 +1280,13 @@ xfs_bmap_first_unused( xfs_fileoff_t lowest, max; int error; - ASSERT(xfs_ifork_has_extents(ip, whichfork) || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { *first_unused = 0; return 0; } + ASSERT(xfs_ifork_has_extents(ifp)); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); if (error) @@ -1339,7 +1327,7 @@ xfs_bmap_last_before( struct xfs_iext_cursor icur; int error; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: *last_block = 0; return 0; @@ -1438,16 +1426,17 @@ xfs_bmap_last_offset( xfs_fileoff_t *last_block, int whichfork) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_bmbt_irec rec; int is_empty; int error; *last_block = 0; - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) return 0; - if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ip, whichfork))) + if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) return -EFSCORRUPTED; error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); @@ -1465,23 +1454,22 @@ xfs_bmap_last_offset( */ int /* 1=>1 block, 0=>otherwise */ xfs_bmap_one_block( - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ + struct xfs_inode *ip, /* incore inode */ + int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp; /* inode fork pointer */ - int rval; /* return value */ - xfs_bmbt_irec_t s; /* internal version of extent */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int rval; /* return value */ + struct xfs_bmbt_irec s; /* internal version of extent */ struct xfs_iext_cursor icur; #ifndef DEBUG if (whichfork == XFS_DATA_FORK) return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; #endif /* !DEBUG */ - if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) + if (ifp->if_nextents != 1) return 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) return 0; - ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_flags & XFS_IFEXTENTS); xfs_iext_first(ifp, &icur); xfs_iext_get_extent(ifp, &icur, &s); @@ -1503,10 +1491,11 @@ xfs_bmap_add_extent_delay_real( struct xfs_bmalloca *bma, int whichfork) { + struct xfs_mount *mp = 
bma->ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ int i; /* temp state */ - struct xfs_ifork *ifp; /* inode fork pointer */ xfs_fileoff_t new_endoff; /* end offset of new entry */ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ @@ -1516,19 +1505,12 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t da_old; /* old count del alloc blocks used */ xfs_filblks_t temp=0; /* value for da_new calculations */ int tmp_rval; /* partial logging flags */ - struct xfs_mount *mp; - xfs_extnum_t *nextents; struct xfs_bmbt_irec old; - mp = bma->ip->i_mount; - ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(whichfork != XFS_ATTR_FORK); - nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents : - &bma->ip->i_d.di_nextents); - ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || - (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -1616,7 +1598,7 @@ xfs_bmap_add_extent_delay_real( xfs_iext_remove(bma->ip, &bma->icur, state); xfs_iext_prev(ifp, &bma->icur); xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT); - (*nextents)--; + ifp->if_nextents--; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1720,8 +1702,8 @@ xfs_bmap_add_extent_delay_real( PREV.br_startblock = new->br_startblock; PREV.br_state = new->br_state; xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); + ifp->if_nextents++; - (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1786,7 +1768,8 @@ xfs_bmap_add_extent_delay_real( * The left neighbor is not contiguous. */ xfs_iext_update_extent(bma->ip, state, &bma->icur, new); - (*nextents)++; + ifp->if_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1818,7 +1801,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + (bma->cur ? bma->cur->bc_ino.allocated : 0)); PREV.br_startoff = new_endoff; PREV.br_blockcount = temp; @@ -1872,7 +1855,8 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. */ xfs_iext_update_extent(bma->ip, state, &bma->icur, new); - (*nextents)++; + ifp->if_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1904,7 +1888,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + (bma->cur ? 
bma->cur->bc_ino.allocated : 0)); PREV.br_startblock = nullstartblock(da_new); PREV.br_blockcount = temp; @@ -1957,7 +1941,7 @@ xfs_bmap_add_extent_delay_real( xfs_iext_next(ifp, &bma->icur); xfs_iext_insert(bma->ip, &bma->icur, &RIGHT, state); xfs_iext_insert(bma->ip, &bma->icur, &LEFT, state); - (*nextents)++; + ifp->if_nextents++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -2025,8 +2009,8 @@ xfs_bmap_add_extent_delay_real( xfs_mod_delalloc(mp, (int64_t)da_new - da_old); if (bma->cur) { - da_new += bma->cur->bc_private.b.allocated; - bma->cur->bc_private.b.allocated = 0; + da_new += bma->cur->bc_ino.allocated; + bma->cur->bc_ino.allocated = 0; } /* adjust for changes in reserved delayed indirect blocks */ @@ -2161,8 +2145,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &LEFT); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 2); + ifp->if_nextents -= 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2214,8 +2197,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &LEFT); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + ifp->if_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2257,9 +2239,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &PREV); + ifp->if_nextents--; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2366,8 +2347,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_update_extent(ip, state, icur, &PREV); xfs_iext_insert(ip, icur, new, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + ifp->if_nextents++; + if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2442,9 +2423,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_update_extent(ip, state, icur, &PREV); xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, new, state); + ifp->if_nextents++; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2495,9 +2475,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &r[1], state); xfs_iext_insert(ip, icur, &r[0], state); + ifp->if_nextents += 2; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2573,7 +2552,7 @@ xfs_bmap_add_extent_unwritten_real( /* clear out the allocated field, done with it now in any case. 
*/ if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; *curp = cur; } @@ -2752,7 +2731,7 @@ xfs_bmap_add_extent_hole_real( struct xfs_bmbt_irec old; ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2812,9 +2791,8 @@ xfs_bmap_add_extent_hole_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &left); + ifp->if_nextents--; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { @@ -2912,8 +2890,8 @@ xfs_bmap_add_extent_hole_real( * Insert a new entry. */ xfs_iext_insert(ip, icur, new, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + ifp->if_nextents++; + if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { @@ -2955,7 +2933,7 @@ xfs_bmap_add_extent_hole_real( /* clear out the allocated field, done with it now in any case. */ if (cur) - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: @@ -3893,7 +3871,8 @@ xfs_bmapi_read( int flags) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; + int whichfork = xfs_bmapi_whichfork(flags); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_bmbt_irec got; xfs_fileoff_t obno; xfs_fileoff_t end; @@ -3901,48 +3880,23 @@ xfs_bmapi_read( int error; bool eof = false; int n = 0; - int whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); - ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| - XFS_BMAPI_COWFORK))); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE))); ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + if (WARN_ON_ONCE(!ifp)) + return -EFSCORRUPTED; + + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) return -EFSCORRUPTED; - } if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; XFS_STATS_INC(mp, xs_blk_mapr); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!ifp) { - /* No CoW fork? Return a hole. */ - if (whichfork == XFS_COW_FORK) { - mval->br_startoff = bno; - mval->br_startblock = HOLESTARTBLOCK; - mval->br_blockcount = len; - mval->br_state = XFS_EXT_NORM; - *nmap = 1; - return 0; - } - - /* - * A missing attr ifork implies that the inode says we're in - * extents or btree format but failed to pass the inode fork - * verifier while trying to load it. Treat that as a file - * corruption too. - */ -#ifdef DEBUG - xfs_alert(mp, "%s: inode %llu missing fork %d", - __func__, ip->i_ino, whichfork); -#endif /* DEBUG */ - return -EFSCORRUPTED; - } - if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, whichfork); if (error) @@ -4187,25 +4141,15 @@ xfs_bmapi_allocate( bma->nallocs++; if (bma->cur) - bma->cur->bc_private.b.flags = - bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + bma->cur->bc_ino.flags = + bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; bma->got.br_startoff = bma->offset; bma->got.br_startblock = bma->blkno; bma->got.br_blockcount = bma->length; bma->got.br_state = XFS_EXT_NORM; - /* - * In the data fork, a wasdelay extent has been initialized, so - * shouldn't be flagged as unwritten. 
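Note what the xfs_bmapi_read() hunk above changes for callers: the CoW fork is no longer a valid target (XFS_BMAPI_COWFORK is gone from the allowed flags), and a missing fork is reported as -EFSCORRUPTED via WARN_ON_ONCE rather than faked up as a hole. Holes in a present fork are still returned the normal way, as mappings whose br_startblock carries the hole sentinel. A standalone toy of that consumer-side walk; the sentinel value and record layout are simplified assumptions:

#include <stdint.h>
#include <stdio.h>

#define HOLESTARTBLOCK	((int64_t)-2)	/* hole sentinel; value assumed here */

struct irec { int64_t br_startoff, br_startblock, br_blockcount; };

int main(void)
{
	/* pretend these came back from a successful mapping call */
	struct irec mval[] = {
		{ 0, 100, 8 },			/* 8 blocks mapped at block 100 */
		{ 8, HOLESTARTBLOCK, 4 },	/* 4-block hole */
	};
	int nmap = 2;

	for (int i = 0; i < nmap; i++) {
		if (mval[i].br_startblock == HOLESTARTBLOCK)
			printf("hole at offset %lld\n",
					(long long)mval[i].br_startoff);
		else
			printf("offset %lld -> block %lld\n",
					(long long)mval[i].br_startoff,
					(long long)mval[i].br_startblock);
	}
	return 0;
}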
- * - * For the cow fork, however, we convert delalloc reservations - * (extents allocated for speculative preallocation) to - * allocated unwritten extents, and only convert the unwritten - * extents to real extents when we're about to write the data. - */ - if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && - (bma->flags & XFS_BMAPI_PREALLOC)) + if (bma->flags & XFS_BMAPI_PREALLOC) bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) @@ -4319,11 +4263,13 @@ xfs_bmapi_minleft( struct xfs_inode *ip, int fork) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, fork); + if (tp && tp->t_firstblock != NULLFSBLOCK) return 0; - if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE) + if (ifp->if_format != XFS_DINODE_FMT_BTREE) return 1; - return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1; + return be16_to_cpu(ifp->if_broot->bb_level) + 1; } /* @@ -4338,11 +4284,13 @@ xfs_bmapi_finish( int whichfork, int error) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + if ((bma->logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + ifp->if_format != XFS_DINODE_FMT_EXTENTS) bma->logflags &= ~xfs_ilog_fext(whichfork); else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE) + ifp->if_format != XFS_DINODE_FMT_BTREE) bma->logflags &= ~xfs_ilog_fbroot(whichfork); if (bma->logflags) @@ -4374,13 +4322,13 @@ xfs_bmapi_write( .total = total, }; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; + int whichfork = xfs_bmapi_whichfork(flags); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t end; /* end of mapped file region */ bool eof = false; /* after the end of extents */ int error; /* error return */ int n; /* current extent index */ xfs_fileoff_t obno; /* old block number (offset) */ - int whichfork; /* data or attr fork */ #ifdef DEBUG xfs_fileoff_t orig_bno; /* original block number value */ @@ -4395,13 +4343,12 @@ xfs_bmapi_write( orig_mval = mval; orig_nmap = *nmap; #endif - whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(tp != NULL); ASSERT(len > 0); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & XFS_BMAPI_REMAP)); @@ -4417,7 +4364,7 @@ xfs_bmapi_write( ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) != (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -4425,8 +4372,6 @@ xfs_bmapi_write( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - ifp = XFS_IFORK_PTR(ip, whichfork); - XFS_STATS_INC(mp, xs_blk_mapw); if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -4536,9 +4481,8 @@ xfs_bmapi_write( if (error) goto error0; - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || - XFS_IFORK_NEXTENTS(ip, whichfork) > - XFS_IFORK_MAXEXT(ip, whichfork)); + ASSERT(ifp->if_format != XFS_DINODE_FMT_BTREE || + ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork)); xfs_bmapi_finish(&bma, whichfork, 0); xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, orig_nmap, *nmap); @@ -4613,8 +4557,23 @@ xfs_bmapi_convert_delalloc( bma.offset = bma.got.br_startoff; bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); bma.minleft = 
xfs_bmapi_minleft(tp, ip, whichfork); + + /* + * When we're converting the delalloc reservations backing dirty pages + * in the page cache, we must be careful about how we create the new + * extents: + * + * New CoW fork extents are created unwritten, turned into real extents + * when we're about to write the data to disk, and mapped into the data + * fork after the write finishes. End of story. + * + * New data fork extents must be mapped in as unwritten and converted + * to real extents after the write succeeds to avoid exposing stale + * disk contents if we crash. + */ + bma.flags = XFS_BMAPI_PREALLOC; if (whichfork == XFS_COW_FORK) - bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + bma.flags |= XFS_BMAPI_COWFORK; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) bma.prev.br_startoff = NULLFILEOFF; @@ -4684,7 +4643,7 @@ xfs_bmapi_remap( ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) != (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -4709,7 +4668,7 @@ xfs_bmapi_remap( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } got.br_startoff = bno; @@ -4728,9 +4687,9 @@ xfs_bmapi_remap( error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); error0: - if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) + if (ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS) logflags &= ~XFS_ILOG_DEXT; - else if (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) + else if (ip->i_df.if_format != XFS_DINODE_FMT_BTREE) logflags &= ~XFS_ILOG_DBROOT; if (logflags) @@ -5080,9 +5039,8 @@ xfs_bmap_del_extent_real( * conversion to btree format, since the transaction will be dirty then. 
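The comment added in xfs_bmapi_convert_delalloc() above pins down a simple life cycle: a delalloc reservation is converted to an unwritten extent, and only becomes a written extent once the data is durably on disk, so a crash in between can never expose stale disk contents. A toy state machine of that policy; states and helpers are illustrative, not kernel code:

#include <stdio.h>

enum ext_state { EXT_DELALLOC, EXT_UNWRITTEN, EXT_WRITTEN };

static enum ext_state allocate(enum ext_state s)
{
	/* converting a delalloc reservation always lands unwritten */
	return s == EXT_DELALLOC ? EXT_UNWRITTEN : s;
}

static enum ext_state io_complete(enum ext_state s)
{
	/* only after the data is safely on disk does it become written */
	return s == EXT_UNWRITTEN ? EXT_WRITTEN : s;
}

int main(void)
{
	enum ext_state s = EXT_DELALLOC;

	s = allocate(s);	/* EXT_UNWRITTEN */
	s = io_complete(s);	/* EXT_WRITTEN */
	printf("final state: %d\n", (int)s);	/* prints 2 */
	return 0;
}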
*/ if (tp->t_blk_res == 0 && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= - XFS_IFORK_MAXEXT(ip, whichfork) && + ifp->if_format == XFS_DINODE_FMT_EXTENTS && + ifp->if_nextents >= XFS_IFORK_MAXEXT(ip, whichfork) && del->br_startoff > got.br_startoff && del_endoff < got_endoff) return -ENOSPC; @@ -5134,8 +5092,8 @@ xfs_bmap_del_extent_real( */ xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + ifp->if_nextents--; + flags |= XFS_ILOG_CORE; if (!cur) { flags |= xfs_ilog_fext(whichfork); @@ -5243,8 +5201,8 @@ xfs_bmap_del_extent_real( } } else flags |= xfs_ilog_fext(whichfork); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + + ifp->if_nextents++; xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &new, state); break; @@ -5324,7 +5282,7 @@ __xfs_bunmapi( whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork))) + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) return -EFSCORRUPTED; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -5362,9 +5320,9 @@ __xfs_bunmapi( logflags = 0; if (ifp->if_flags & XFS_IFBROOT) { - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } else cur = NULL; @@ -5607,10 +5565,10 @@ error0: * logging the extent records if we've converted to btree format. */ if ((logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + ifp->if_format != XFS_DINODE_FMT_EXTENTS) logflags &= ~xfs_ilog_fext(whichfork); else if ((logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + ifp->if_format != XFS_DINODE_FMT_BTREE) logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log inode even in the error case, if the transaction @@ -5620,7 +5578,7 @@ error0: xfs_trans_log_inode(tp, ip, logflags); if (cur) { if (!error) - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -5692,6 +5650,7 @@ xfs_bmse_merge( struct xfs_btree_cur *cur, int *logflags) /* output */ { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; @@ -5710,8 +5669,7 @@ xfs_bmse_merge( * Update the on-disk extent count, the btree if necessary and log the * inode. 
*/ - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + ifp->if_nextents--; *logflags |= XFS_ILOG_CORE; if (!cur) { *logflags |= XFS_ILOG_DEXT; @@ -5749,7 +5707,7 @@ xfs_bmse_merge( done: xfs_iext_remove(ip, icur, 0); - xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur); + xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur, &new); @@ -5821,7 +5779,7 @@ xfs_bmap_collapse_extents( int error = 0; int logflags = 0; - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -5839,7 +5797,7 @@ xfs_bmap_collapse_extents( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { @@ -5938,7 +5896,7 @@ xfs_bmap_insert_extents( int error = 0; int logflags = 0; - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -5956,7 +5914,7 @@ xfs_bmap_insert_extents( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } if (*next_fsb == NULLFSBLOCK) { @@ -6025,25 +5983,25 @@ del_cursor: * @split_fsb is a block where the extents is split. If split_fsb lies in a * hole or the first block of extents, just return 0. */ -STATIC int -xfs_bmap_split_extent_at( +int +xfs_bmap_split_extent( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_fsb) { int whichfork = XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got; struct xfs_bmbt_irec new; /* split extent */ struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; xfs_fsblock_t gotblkcnt; /* new block count for got */ struct xfs_iext_cursor icur; int error = 0; int logflags = 0; int i = 0; - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -6051,7 +6009,6 @@ xfs_bmap_split_extent_at( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* Read in all the extents */ error = xfs_iread_extents(tp, ip, whichfork); @@ -6074,7 +6031,7 @@ xfs_bmap_split_extent_at( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto del_cursor; @@ -6099,8 +6056,7 @@ xfs_bmap_split_extent_at( /* Add new extent */ xfs_iext_next(ifp, &icur); xfs_iext_insert(ip, &icur, &new, 0); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + ifp->if_nextents++; if (cur) { error = xfs_bmbt_lookup_eq(cur, &new, &i); @@ -6133,7 +6089,7 @@ xfs_bmap_split_extent_at( del_cursor: if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } @@ -6142,34 +6098,6 @@ del_cursor: return error; } -int -xfs_bmap_split_extent( - struct xfs_inode *ip, - xfs_fileoff_t split_fsb) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(mp, 
&M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - error = xfs_bmap_split_extent_at(tp, ip, split_fsb); - if (error) - goto out; - - return xfs_trans_commit(tp); - -out: - xfs_trans_cancel(tp); - return error; -} - /* Deferred mapping is only for real extents in the data fork. */ static bool xfs_bmap_is_update_needed( diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 14d25e0b7d9c..e1bd484e5548 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. @@ -158,17 +158,22 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) { BMAP_ATTRFORK, "ATTR" }, \ { BMAP_COWFORK, "COW" } +/* Return true if the extent is an allocated extent, written or not. */ +static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +{ + return irec->br_startblock != HOLESTARTBLOCK && + irec->br_startblock != DELAYSTARTBLOCK && + !isnullstartblock(irec->br_startblock); +} /* * Return true if the extent is a real, allocated extent, or false if it is a * delayed allocation, and unwritten extent or a hole. */ -static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) { - return irec->br_state != XFS_EXT_UNWRITTEN && - irec->br_startblock != HOLESTARTBLOCK && - irec->br_startblock != DELAYSTARTBLOCK && - !isnullstartblock(irec->br_startblock); + return xfs_bmap_is_real_extent(irec) && + irec->br_state != XFS_EXT_UNWRITTEN; } /* @@ -222,7 +227,8 @@ int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off, int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb); -int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); +int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t split_offset); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index ffe608d2a2d9..ecec604e6e4d 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -166,13 +166,13 @@ xfs_bmbt_dup_cursor( struct xfs_btree_cur *new; new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.b.ip, cur->bc_private.b.whichfork); + cur->bc_ino.ip, cur->bc_ino.whichfork); /* * Copy the firstblock, dfops, and flags values, * since init cursor doesn't get them. 
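The xfs_bmap.h hunk above splits the old predicate in two: xfs_bmap_is_real_extent() now means "allocated on disk, written or not", and the new xfs_bmap_is_written_extent() layers the written check on top. A compilable restatement of the pair; the sentinel values are assumptions, and the real xfs_bmap_is_real_extent() additionally rejects nullstartblock-encoded values, omitted here for brevity:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef int64_t fsblock_t;
#define HOLESTARTBLOCK	((fsblock_t)-2)	/* assumed sentinel values */
#define DELAYSTARTBLOCK	((fsblock_t)-1)

enum { EXT_NORM, EXT_UNWRITTEN };
struct irec { fsblock_t br_startblock; int br_state; };

/* allocated on disk, whether or not it holds valid data */
static bool is_real_extent(const struct irec *ir)
{
	return ir->br_startblock != HOLESTARTBLOCK &&
	       ir->br_startblock != DELAYSTARTBLOCK;
}

/* allocated and containing valid data */
static bool is_written_extent(const struct irec *ir)
{
	return is_real_extent(ir) && ir->br_state != EXT_UNWRITTEN;
}

int main(void)
{
	struct irec unwritten = { 100, EXT_UNWRITTEN };

	/* an unwritten extent is real but not written: prints real=1 written=0 */
	printf("real=%d written=%d\n", is_real_extent(&unwritten),
			is_written_extent(&unwritten));
	return 0;
}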
*/ - new->bc_private.b.flags = cur->bc_private.b.flags; + new->bc_ino.flags = cur->bc_ino.flags; return new; } @@ -183,12 +183,12 @@ xfs_bmbt_update_cursor( struct xfs_btree_cur *dst) { ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) || - (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); + (dst->bc_ino.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); - dst->bc_private.b.allocated += src->bc_private.b.allocated; + dst->bc_ino.allocated += src->bc_ino.allocated; dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock; - src->bc_private.b.allocated = 0; + src->bc_ino.allocated = 0; } STATIC int @@ -205,8 +205,8 @@ xfs_bmbt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.fsbno = cur->bc_tp->t_firstblock; - xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino, - cur->bc_private.b.whichfork); + xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino, + cur->bc_ino.whichfork); if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); @@ -230,7 +230,7 @@ xfs_bmbt_alloc_block( } args.minlen = args.maxlen = args.prod = 1; - args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL; if (!args.wasdel && args.tp->t_blk_res == 0) { error = -ENOSPC; goto error0; @@ -259,10 +259,10 @@ xfs_bmbt_alloc_block( ASSERT(args.len == 1); cur->bc_tp->t_firstblock = args.fsbno; - cur->bc_private.b.allocated++; - cur->bc_private.b.ip->i_d.di_nblocks++; - xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); - xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip, + cur->bc_ino.allocated++; + cur->bc_ino.ip->i_d.di_nblocks++; + xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE); + xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip, XFS_TRANS_DQ_BCOUNT, 1L); new->l = cpu_to_be64(args.fsbno); @@ -280,12 +280,12 @@ xfs_bmbt_free_block( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_trans *tp = cur->bc_tp; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); struct xfs_owner_info oinfo; - xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_private.b.whichfork); + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); xfs_bmap_add_free(cur->bc_tp, fsbno, 1, &oinfo); ip->i_d.di_nblocks--; @@ -302,8 +302,8 @@ xfs_bmbt_get_minrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, - cur->bc_private.b.whichfork); + ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0) / 2; @@ -320,8 +320,8 @@ xfs_bmbt_get_maxrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, - cur->bc_private.b.whichfork); + ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0); @@ -347,7 +347,7 @@ xfs_bmbt_get_dmaxrecs( { if (level != cur->bc_nlevels - 1) return cur->bc_mp->m_bmap_dmxr[level != 0]; - return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0); + return xfs_bmdr_maxrecs(cur->bc_ino.forksize, level == 0); } STATIC void @@ -552,7 +552,7 @@ xfs_bmbt_init_cursor( struct xfs_btree_cur *cur; ASSERT(whichfork != XFS_COW_FORK); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); cur->bc_tp = tp; cur->bc_mp = mp; @@ -566,11 +566,11 @@ xfs_bmbt_init_cursor( if 
(xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); - cur->bc_private.b.ip = ip; - cur->bc_private.b.allocated = 0; - cur->bc_private.b.flags = 0; - cur->bc_private.b.whichfork = whichfork; + cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_ino.ip = ip; + cur->bc_ino.allocated = 0; + cur->bc_ino.flags = 0; + cur->bc_ino.whichfork = whichfork; return cur; } @@ -636,15 +636,12 @@ xfs_bmbt_change_owner( ASSERT(tp || buffer_list); ASSERT(!(tp && buffer_list)); - if (whichfork == XFS_DATA_FORK) - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE); - else - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE); + ASSERT(XFS_IFORK_PTR(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); if (!cur) return -ENOMEM; - cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; + cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 29b407d053b4..72bf74c79fb9 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index fd300dc93ca4..2d25bab68764 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -20,6 +20,7 @@ #include "xfs_trace.h" #include "xfs_alloc.h" #include "xfs_log.h" +#include "xfs_btree_staging.h" /* * Cursor allocation zone. @@ -214,7 +215,7 @@ xfs_btree_check_sptr( { if (level <= 0) return false; - return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno); + return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.agno, agbno); } /* @@ -234,8 +235,8 @@ xfs_btree_check_ptr( return 0; xfs_err(cur->bc_mp, "Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.", - cur->bc_private.b.ip->i_ino, - cur->bc_private.b.whichfork, cur->bc_btnum, + cur->bc_ino.ip->i_ino, + cur->bc_ino.whichfork, cur->bc_btnum, level, index); } else { if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]), @@ -243,7 +244,7 @@ xfs_btree_check_ptr( return 0; xfs_err(cur->bc_mp, "AG %u: Corrupt btree %d pointer at level %d index %d.", - cur->bc_private.a.agno, cur->bc_btnum, + cur->bc_ag.agno, cur->bc_btnum, level, index); } @@ -378,10 +379,12 @@ xfs_btree_del_cursor( * allocated indirect blocks' accounting. */ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || - cur->bc_private.b.allocated == 0); + cur->bc_ino.allocated == 0); /* * Free the cursor. */ + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) + kmem_free((void *)cur->bc_ops); kmem_cache_free(xfs_btree_cur_zone, cur); } @@ -642,6 +645,17 @@ xfs_btree_ptr_addr( ((char *)block + xfs_btree_ptr_offset(cur, n, level)); } +struct xfs_ifork * +xfs_btree_ifork_ptr( + struct xfs_btree_cur *cur) +{ + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + + if (cur->bc_flags & XFS_BTREE_STAGING) + return cur->bc_ino.ifake->if_fork; + return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork); +} + /* * Get the root block which is stored in the inode. 
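The xfs_bmbt_init_cursor() allocation change above follows the same conversion seen earlier in __xfs_bmap_add_free(): the XFS-private KM_* wrappers give way to the plain slab API, with KM_NOFS mapping to GFP_NOFS, the zeroing variant becoming kmem_cache_zalloc(), and the must-succeed behaviour spelled out as __GFP_NOFAIL. A kernel-context sketch of the new idiom; this is not a standalone program, and the cache here is hypothetical:

#include <linux/slab.h>

static struct kmem_cache *example_zone;	/* hypothetical slab cache */

static void *example_cursor_alloc(void)
{
	/*
	 * GFP_NOFS keeps memory reclaim from recursing back into the
	 * filesystem; __GFP_NOFAIL makes the core allocator retry
	 * internally instead of the caller open-coding a retry loop.
	 */
	return kmem_cache_zalloc(example_zone, GFP_NOFS | __GFP_NOFAIL);
}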
* @@ -652,9 +666,8 @@ STATIC struct xfs_btree_block * xfs_btree_get_iroot( struct xfs_btree_cur *cur) { - struct xfs_ifork *ifp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); return (struct xfs_btree_block *)ifp->if_broot; } @@ -881,13 +894,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno, left, 1, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno, right, 1, cur->bc_ops->buf_ops); rval++; } @@ -945,7 +958,7 @@ xfs_btree_ptr_to_daddr( *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); } else { agbno = be32_to_cpu(ptr->s); - *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.agno, agbno); } @@ -1014,7 +1027,7 @@ xfs_btree_ptr_is_null( return ptr->s == cpu_to_be32(NULLAGBLOCK); } -STATIC void +void xfs_btree_set_ptr_null( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) @@ -1050,7 +1063,7 @@ xfs_btree_get_sibling( } } -STATIC void +void xfs_btree_set_sibling( struct xfs_btree_cur *cur, struct xfs_btree_block *block, @@ -1128,7 +1141,7 @@ xfs_btree_init_block( btnum, level, numrecs, owner, 0); } -STATIC void +void xfs_btree_init_block_cur( struct xfs_btree_cur *cur, struct xfs_buf *bp, @@ -1144,9 +1157,9 @@ xfs_btree_init_block_cur( * code. */ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - owner = cur->bc_private.b.ip->i_ino; + owner = cur->bc_ino.ip->i_ino; else - owner = cur->bc_private.a.agno; + owner = cur->bc_ag.agno; xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, cur->bc_btnum, level, numrecs, @@ -1220,7 +1233,7 @@ xfs_btree_set_refs( } } -STATIC int +int xfs_btree_get_buf_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, @@ -1280,7 +1293,7 @@ xfs_btree_read_buf_block( /* * Copy keys from one btree block to another. */ -STATIC void +void xfs_btree_copy_keys( struct xfs_btree_cur *cur, union xfs_btree_key *dst_key, @@ -1308,11 +1321,11 @@ xfs_btree_copy_recs( /* * Copy block pointers from one btree block to another. 
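xfs_btree_get_iroot() above now funnels through the new xfs_btree_ifork_ptr() helper, so a staging cursor transparently resolves to its fake root instead of live inode metadata. A standalone analogue of that dispatch; all names here are made up for illustration:

#include <assert.h>
#include <stdio.h>

#define BTREE_STAGING	(1 << 5)

struct fork { int level; };
struct fakeroot { struct fork *fork; };

struct cursor {
	unsigned int flags;
	struct fork *real_fork;		/* stands in for XFS_IFORK_PTR(...) */
	struct fakeroot *ifake;		/* only valid while staging */
};

/*
 * Every root lookup goes through one helper, so staging cursors never
 * touch the primary metadata by accident.
 */
static struct fork *cursor_fork(struct cursor *cur)
{
	if (cur->flags & BTREE_STAGING)
		return cur->ifake->fork;
	return cur->real_fork;
}

int main(void)
{
	struct fork live = { 2 }, fake = { 0 };
	struct fakeroot fr = { &fake };
	struct cursor cur = { BTREE_STAGING, &live, &fr };

	assert(cursor_fork(&cur) == &fake);
	cur.flags &= ~BTREE_STAGING;	/* after commit: back to the real fork */
	assert(cursor_fork(&cur) == &live);
	printf("ok\n");
	return 0;
}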
*/ -STATIC void +void xfs_btree_copy_ptrs( struct xfs_btree_cur *cur, union xfs_btree_ptr *dst_ptr, - union xfs_btree_ptr *src_ptr, + const union xfs_btree_ptr *src_ptr, int numptrs) { ASSERT(numptrs >= 0); @@ -1393,8 +1406,8 @@ xfs_btree_log_keys( xfs_btree_key_offset(cur, first), xfs_btree_key_offset(cur, last + 1) - 1); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1436,8 +1449,8 @@ xfs_btree_log_ptrs( xfs_btree_ptr_offset(cur, first, level), xfs_btree_ptr_offset(cur, last + 1, level) - 1); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1505,8 +1518,8 @@ xfs_btree_log_block( xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, first, last); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1743,10 +1756,10 @@ xfs_btree_lookup_get_block( /* Check the inode owner since the verifiers don't. */ if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && - !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && + !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && (cur->bc_flags & XFS_BTREE_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != - cur->bc_private.b.ip->i_ino) + cur->bc_ino.ip->i_ino) goto out_bad; /* Did we get the level we were looking for? */ @@ -1762,7 +1775,7 @@ xfs_btree_lookup_get_block( out_bad: *blkp = NULL; - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(cur->bc_tp, bp); return -EFSCORRUPTED; } @@ -2938,9 +2951,9 @@ xfs_btree_new_iroot( xfs_btree_copy_ptrs(cur, pp, &nptr, 1); - xfs_iroot_realloc(cur->bc_private.b.ip, + xfs_iroot_realloc(cur->bc_ino.ip, 1 - xfs_btree_get_numrecs(cblock), - cur->bc_private.b.whichfork); + cur->bc_ino.whichfork); xfs_btree_setbuf(cur, level, cbp); @@ -2953,7 +2966,7 @@ xfs_btree_new_iroot( xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); *logflags |= - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); *stat = 1; return 0; error0: @@ -3105,11 +3118,11 @@ xfs_btree_make_block_unfull( if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && level == cur->bc_nlevels - 1) { - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_ino.ip; if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. 
*/ - xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); + xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); *stat = 1; } else { /* A root block that needs replacing */ @@ -3455,8 +3468,8 @@ STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { - int whichfork = cur->bc_private.b.whichfork; - struct xfs_inode *ip = cur->bc_private.b.ip; + int whichfork = cur->bc_ino.whichfork; + struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; @@ -3514,8 +3527,8 @@ xfs_btree_kill_iroot( index = numrecs - cur->bc_ops->get_maxrecs(cur, level); if (index) { - xfs_iroot_realloc(cur->bc_private.b.ip, index, - cur->bc_private.b.whichfork); + xfs_iroot_realloc(cur->bc_ino.ip, index, + cur->bc_ino.whichfork); block = ifp->if_broot; } @@ -3544,7 +3557,7 @@ xfs_btree_kill_iroot( cur->bc_bufs[level - 1] = NULL; be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); cur->bc_nlevels--; out0: return 0; @@ -3712,8 +3725,8 @@ xfs_btree_delrec( */ if (level == cur->bc_nlevels - 1) { if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { - xfs_iroot_realloc(cur->bc_private.b.ip, -1, - cur->bc_private.b.whichfork); + xfs_iroot_realloc(cur->bc_ino.ip, -1, + cur->bc_ino.whichfork); error = xfs_btree_kill_iroot(cur); if (error) diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 3eff7c321d43..10e50cbacacf 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -10,6 +10,7 @@ struct xfs_buf; struct xfs_inode; struct xfs_mount; struct xfs_trans; +struct xfs_ifork; extern kmem_zone_t *xfs_btree_cur_zone; @@ -177,15 +178,37 @@ union xfs_btree_irec { struct xfs_refcount_irec rc; }; -/* Per-AG btree private information. */ -union xfs_btree_cur_private { - struct { - unsigned long nr_ops; /* # record updates */ - int shape_changes; /* # of extent splits */ - } refc; - struct { - bool active; /* allocation cursor state */ - } abt; +/* Per-AG btree information. 
*/ +struct xfs_btree_cur_ag { + union { + struct xfs_buf *agbp; + struct xbtree_afakeroot *afake; /* for staging cursor */ + }; + xfs_agnumber_t agno; + union { + struct { + unsigned long nr_ops; /* # record updates */ + int shape_changes; /* # of extent splits */ + } refc; + struct { + bool active; /* allocation cursor state */ + } abt; + }; +}; + +/* Btree-in-inode cursor information */ +struct xfs_btree_cur_ino { + struct xfs_inode *ip; + struct xbtree_ifakeroot *ifake; /* for staging cursor */ + int allocated; + short forksize; + char whichfork; + char flags; +/* We are converting a delalloc reservation */ +#define XFS_BTCUR_BMBT_WASDEL (1 << 0) + +/* For extent swap, ignore owner check in verifier */ +#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1) }; /* @@ -209,21 +232,9 @@ typedef struct xfs_btree_cur xfs_btnum_t bc_btnum; /* identifies which btree type */ int bc_statoff; /* offset of btree stats array */ union { - struct { /* needed for BNO, CNT, INO */ - struct xfs_buf *agbp; /* agf/agi buffer pointer */ - xfs_agnumber_t agno; /* ag number */ - union xfs_btree_cur_private priv; - } a; - struct { /* needed for BMAP */ - struct xfs_inode *ip; /* pointer to our inode */ - int allocated; /* count of alloced */ - short forksize; /* fork's inode space */ - char whichfork; /* data or attr fork */ - char flags; /* flags */ -#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ -#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ - } b; - } bc_private; /* per-btree type data */ + struct xfs_btree_cur_ag bc_ag; + struct xfs_btree_cur_ino bc_ino; + }; } xfs_btree_cur_t; /* cursor flags */ @@ -232,6 +243,12 @@ typedef struct xfs_btree_cur #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ #define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ #define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */ +/* + * The root of this btree is a fakeroot structure so that we can stage a btree + * rebuild without leaving it accessible via primary metadata. The ops struct + * is dynamically allocated and must be freed when the cursor is deleted. + */ +#define XFS_BTREE_STAGING (1<<5) #define XFS_BTREE_NOERROR 0 @@ -494,6 +511,7 @@ union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, union xfs_btree_irec *high, bool *exists); bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); +struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); /* Does this cursor point to the last block in the given level? 
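The union in the cursor above is now anonymous, so the old two-step bc_private.a/bc_private.b access becomes direct bc_ag/bc_ino member access. A compilable miniature of the new layout, with both structs trimmed to a couple of fields:

#include <stdio.h>

struct cur_ag { unsigned int agno; };
struct cur_ino { int allocated; char whichfork; };

struct btree_cur {
	int btnum;
	union {			/* anonymous union, as in the patch */
		struct cur_ag bc_ag;
		struct cur_ino bc_ino;
	};
};

int main(void)
{
	struct btree_cur cur = { 0 };

	/* members are reached directly, without the old .bc_private.b hop */
	cur.bc_ino.whichfork = 1;
	cur.bc_ino.allocated = 0;
	printf("fork %d\n", cur.bc_ino.whichfork);
	return 0;
}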
*/ static inline bool @@ -512,4 +530,20 @@ xfs_btree_islastblock( return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } +void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); +int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, + struct xfs_btree_block **block, struct xfs_buf **bpp); +void xfs_btree_set_sibling(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, union xfs_btree_ptr *ptr, + int lr); +void xfs_btree_init_block_cur(struct xfs_btree_cur *cur, + struct xfs_buf *bp, int level, int numrecs); +void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur, + union xfs_btree_ptr *dst_ptr, + const union xfs_btree_ptr *src_ptr, int numptrs); +void xfs_btree_copy_keys(struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, union xfs_btree_key *src_key, + int numkeys); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c new file mode 100644 index 000000000000..f464a7c7cf22 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -0,0 +1,879 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_btree.h" +#include "xfs_trace.h" +#include "xfs_btree_staging.h" + +/* + * Staging Cursors and Fake Roots for Btrees + * ========================================= + * + * A staging btree cursor is a special type of btree cursor that callers must + * use to construct a new btree index using the btree bulk loader code. The + * bulk loading code uses the staging btree cursor to abstract the details of + * initializing new btree blocks and filling them with records or key/ptr + * pairs. Regular btree operations (e.g. queries and modifications) are not + * supported with staging cursors, and callers must not invoke them. + * + * Fake root structures contain all the information about a btree that is under + * construction by the bulk loading code. Staging btree cursors point to fake + * root structures instead of the usual AG header or inode structure. + * + * Callers are expected to initialize a fake root structure and pass it into + * the _stage_cursor function for a specific btree type. When bulk loading is + * complete, callers should call the _commit_staged_btree function for that + * specific btree type to commit the new btree into the filesystem. + */ + +/* + * Don't allow staging cursors to be duplicated because they're supposed to be + * kept private to a single thread. + */ +STATIC struct xfs_btree_cur * +xfs_btree_fakeroot_dup_cursor( + struct xfs_btree_cur *cur) +{ + ASSERT(0); + return NULL; +} + +/* + * Don't allow block allocation for a staging cursor, because staging cursors + * do not support regular btree modifications. + * + * Bulk loading uses a separate callback to obtain new blocks from a + * preallocated list, which prevents ENOSPC failures during loading. + */ +STATIC int +xfs_btree_fakeroot_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start_bno, + union xfs_btree_ptr *new_bno, + int *stat) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +/* + * Don't allow block freeing for a staging cursor, because staging cursors + * do not support regular btree modifications. 
+ */ +STATIC int +xfs_btree_fakeroot_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +/* Initialize a pointer to the root block from the fakeroot. */ +STATIC void +xfs_btree_fakeroot_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xbtree_afakeroot *afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + afake = cur->bc_ag.afake; + ptr->s = cpu_to_be32(afake->af_root); +} + +/* + * Bulk Loading for AG Btrees + * ========================== + * + * For a btree rooted in an AG header, pass a xbtree_afakeroot structure to the + * staging cursor. Callers should initialize this to zero. + * + * The _stage_cursor() function for a specific btree type should call + * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging + * cursor. The corresponding _commit_staged_btree() function should log the + * new root and call xfs_btree_commit_afakeroot() to transform the staging + * cursor into a regular btree cursor. + */ + +/* Update the btree root information for a per-AG fake root. */ +STATIC void +xfs_btree_afakeroot_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + afake->af_root = be32_to_cpu(ptr->s); + afake->af_levels += inc; +} + +/* + * Initialize a AG-rooted btree cursor with the given AG btree fake root. + * The btree cursor's bc_ops will be overridden as needed to make the staging + * functionality work. + */ +void +xfs_btree_stage_afakeroot( + struct xfs_btree_cur *cur, + struct xbtree_afakeroot *afake) +{ + struct xfs_btree_ops *nops; + + ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); + ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)); + ASSERT(cur->bc_tp == NULL); + + nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); + memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); + nops->alloc_block = xfs_btree_fakeroot_alloc_block; + nops->free_block = xfs_btree_fakeroot_free_block; + nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; + nops->set_root = xfs_btree_afakeroot_set_root; + nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; + + cur->bc_ag.afake = afake; + cur->bc_nlevels = afake->af_levels; + cur->bc_ops = nops; + cur->bc_flags |= XFS_BTREE_STAGING; +} + +/* + * Transform an AG-rooted staging btree cursor back into a regular cursor by + * substituting a real btree root for the fake one and restoring normal btree + * cursor ops. The caller must log the btree root change prior to calling + * this. + */ +void +xfs_btree_commit_afakeroot( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp, + const struct xfs_btree_ops *ops) +{ + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(cur->bc_tp == NULL); + + trace_xfs_btree_commit_afakeroot(cur); + + kmem_free((void *)cur->bc_ops); + cur->bc_ag.agbp = agbp; + cur->bc_ops = ops; + cur->bc_flags &= ~XFS_BTREE_STAGING; + cur->bc_tp = tp; +} + +/* + * Bulk Loading for Inode-Rooted Btrees + * ==================================== + * + * For a btree rooted in an inode fork, pass a xbtree_ifakeroot structure to + * the staging cursor. This structure should be initialized as follows: + * + * - if_fork_size field should be set to the number of bytes available to the + * fork in the inode. + * + * - if_fork should point to a freshly allocated struct xfs_ifork. + * + * - if_format should be set to the appropriate fork type (e.g. + * XFS_DINODE_FMT_BTREE). 
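xfs_btree_stage_afakeroot() above shows the staging trick in full: duplicate the cursor's ops table, then override exactly the operations a staging cursor must never perform. A userspace model of that copy-and-override pattern; the ops table and names are invented for illustration:

#include <assert.h>
#include <stdlib.h>
#include <string.h>

struct ops {
	int (*alloc_block)(void);
	int (*free_block)(void);
};

static int real_alloc(void) { return 0; }
static int real_free(void) { return 0; }
/* staging cursors must never allocate or free; trip loudly if they try */
static int poisoned(void) { abort(); }

static const struct ops real_ops = { real_alloc, real_free };

static const struct ops *stage_ops(void)
{
	struct ops *nops = malloc(sizeof(*nops));

	memcpy(nops, &real_ops, sizeof(*nops));	/* start from the real table */
	nops->alloc_block = poisoned;		/* override what staging forbids */
	nops->free_block = poisoned;
	return nops;
}

int main(void)
{
	const struct ops *ops = stage_ops();

	assert(ops->free_block == poisoned);
	free((void *)ops);	/* commit frees the copied table */
	return 0;
}

Committing (xfs_btree_commit_afakeroot() above) is the inverse: free the copied table, restore the real ops, and clear the staging flag.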
+ * + * All other fields must be zero. + * + * The _stage_cursor() function for a specific btree type should call + * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging + * cursor. The corresponding _commit_staged_btree() function should log the + * new root and call xfs_btree_commit_ifakeroot() to transform the staging + * cursor into a regular btree cursor. + */ + +/* + * Initialize an inode-rooted btree cursor with the given inode btree fake + * root. The btree cursor's bc_ops will be overridden as needed to make the + * staging functionality work. If new_ops is not NULL, these new ops will be + * passed out to the caller for further overriding. + */ +void +xfs_btree_stage_ifakeroot( + struct xfs_btree_cur *cur, + struct xbtree_ifakeroot *ifake, + struct xfs_btree_ops **new_ops) +{ + struct xfs_btree_ops *nops; + + ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_tp == NULL); + + nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); + memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); + nops->alloc_block = xfs_btree_fakeroot_alloc_block; + nops->free_block = xfs_btree_fakeroot_free_block; + nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; + nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; + + cur->bc_ino.ifake = ifake; + cur->bc_nlevels = ifake->if_levels; + cur->bc_ops = nops; + cur->bc_flags |= XFS_BTREE_STAGING; + + if (new_ops) + *new_ops = nops; +} + +/* + * Transform an inode-rooted staging btree cursor back into a regular cursor by + * substituting a real btree root for the fake one and restoring normal btree + * cursor ops. The caller must log the btree root change prior to calling + * this. + */ +void +xfs_btree_commit_ifakeroot( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + int whichfork, + const struct xfs_btree_ops *ops) +{ + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(cur->bc_tp == NULL); + + trace_xfs_btree_commit_ifakeroot(cur); + + kmem_free((void *)cur->bc_ops); + cur->bc_ino.ifake = NULL; + cur->bc_ino.whichfork = whichfork; + cur->bc_ops = ops; + cur->bc_flags &= ~XFS_BTREE_STAGING; + cur->bc_tp = tp; +} + +/* + * Bulk Loading of Staged Btrees + * ============================= + * + * This interface is used with a staged btree cursor to create a totally new + * btree with a large number of records (i.e. more than what would fit in a + * single root block). When the creation is complete, the new root can be + * linked atomically into the filesystem by committing the staged cursor. + * + * Creation of a new btree proceeds roughly as follows: + * + * The first step is to initialize an appropriate fake btree root structure and + * then construct a staged btree cursor. Refer to the block comments about + * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for + * more information about how to do this. + * + * The second step is to initialize a struct xfs_btree_bload context as + * documented in the structure definition. + * + * The third step is to call xfs_btree_bload_compute_geometry to compute the + * height of and the number of blocks needed to construct the btree. See the + * section "Computing the Geometry of the New Btree" for details about this + * computation. + * + * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and + * save them for later use by ->claim_block(). 
Bulk loading requires all + * blocks to be allocated beforehand to avoid ENOSPC failures midway through a + * rebuild, and to minimize seek distances of the new btree. + * + * Step five is to call xfs_btree_bload() to start constructing the btree. + * + * The final step is to commit the staging btree cursor, which logs the new + * btree root and turns the staging cursor into a regular cursor. The caller + * is responsible for cleaning up the previous btree blocks, if any. + * + * Computing the Geometry of the New Btree + * ======================================= + * + * The number of items placed in each btree block is computed via the following + * algorithm: For leaf levels, the number of items for the level is nr_records + * in the bload structure. For node levels, the number of items for the level + * is the number of blocks in the next lower level of the tree. For each + * level, the desired number of items per block is defined as: + * + * desired = max(minrecs, maxrecs - slack factor) + * + * The number of blocks for the level is defined to be: + * + * blocks = floor(nr_items / desired) + * + * Note this is rounded down so that the npb calculation below will never fall + * below minrecs. The number of items that will actually be loaded into each + * btree block is defined as: + * + * npb = nr_items / blocks + * + * Some of the leftmost blocks in the level will contain one extra record as + * needed to handle uneven division. If the number of records in any block + * would exceed maxrecs for that level, blocks is incremented and npb is + * recalculated. + * + * In other words, we compute the number of blocks needed to satisfy a given + * loading level, then spread the items as evenly as possible. + * + * The height and number of fs blocks required to create the btree are computed + * and returned via btree_height and nr_blocks. + */ + +/* + * Put a btree block that we're loading onto the ordered list and release it. + * The btree blocks will be written to disk when bulk loading is finished. + */ +static void +xfs_btree_bload_drop_buf( + struct list_head *buffers_list, + struct xfs_buf **bpp) +{ + if (*bpp == NULL) + return; + + if (!xfs_buf_delwri_queue(*bpp, buffers_list)) + ASSERT(0); + + xfs_buf_relse(*bpp); + *bpp = NULL; +} + +/* + * Allocate and initialize one btree block for bulk loading. + * + * The new btree block will have its level and numrecs fields set to the values + * of the level and nr_this_block parameters, respectively. + * + * The caller should ensure that ptrp, bpp, and blockp refer to the left + * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp + * will all point to the new block. + */ +STATIC int +xfs_btree_bload_prep_block( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + struct list_head *buffers_list, + unsigned int level, + unsigned int nr_this_block, + union xfs_btree_ptr *ptrp, /* in/out */ + struct xfs_buf **bpp, /* in/out */ + struct xfs_btree_block **blockp, /* in/out */ + void *priv) +{ + union xfs_btree_ptr new_ptr; + struct xfs_buf *new_bp; + struct xfs_btree_block *new_block; + int ret; + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + size_t new_size; + + ASSERT(*bpp == NULL); + + /* Allocate a new incore btree root block. 
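The geometry rules documented above translate directly into a few integer divisions. A standalone implementation plus a worked case: 1000 records with minrecs 50, maxrecs 100 and slack 10 gives desired 90, hence 11 blocks, ten holding 91 records and one holding 90. This mirrors the documented algorithm but is not the kernel function itself:

#include <stdint.h>
#include <stdio.h>

static void level_geometry(uint64_t nr, unsigned int minrecs,
		unsigned int maxrecs, int slack,
		uint64_t *blocks, uint64_t *npb, uint64_t *with_extra)
{
	/* desired = max(minrecs, maxrecs - slack); assumes minrecs >= 1 */
	int64_t desired = (int64_t)maxrecs - slack;

	if (desired < (int64_t)minrecs)
		desired = minrecs;

	/* round down so the per-block count never drops below desired */
	*blocks = nr / (uint64_t)desired;
	if (*blocks == 0)
		*blocks = 1;

	/*
	 * Spread the records evenly; the leftmost *with_extra blocks hold
	 * one extra record each.  Bump the block count if that would push
	 * any block past maxrecs.
	 */
	*npb = nr / *blocks;
	*with_extra = nr % *blocks;
	if (*npb > maxrecs || (*npb == maxrecs && *with_extra > 0)) {
		(*blocks)++;
		*npb = nr / *blocks;
		*with_extra = nr % *blocks;
	}
}

int main(void)
{
	uint64_t blocks, npb, extra;

	level_geometry(1000, 50, 100, 10, &blocks, &npb, &extra);
	/* prints blocks=11 npb=90 with_extra=10 */
	printf("blocks=%llu npb=%llu with_extra=%llu\n",
			(unsigned long long)blocks,
			(unsigned long long)npb,
			(unsigned long long)extra);
	return 0;
}

The kernel version additionally clamps the reported average to nr_this_level, which covers the degenerate single-block case.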
*/ + new_size = bbl->iroot_size(cur, nr_this_block, priv); + ifp->if_broot = kmem_zalloc(new_size, 0); + ifp->if_broot_bytes = (int)new_size; + ifp->if_flags |= XFS_IFBROOT; + + /* Initialize it and send it out. */ + xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot, + XFS_BUF_DADDR_NULL, cur->bc_btnum, level, + nr_this_block, cur->bc_ino.ip->i_ino, + cur->bc_flags); + + *bpp = NULL; + *blockp = ifp->if_broot; + xfs_btree_set_ptr_null(cur, ptrp); + return 0; + } + + /* Claim one of the caller's preallocated blocks. */ + xfs_btree_set_ptr_null(cur, &new_ptr); + ret = bbl->claim_block(cur, &new_ptr, priv); + if (ret) + return ret; + + ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr)); + + ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp); + if (ret) + return ret; + + /* + * The previous block (if any) is the left sibling of the new block, + * so set its right sibling pointer to the new block and drop it. + */ + if (*blockp) + xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB); + xfs_btree_bload_drop_buf(buffers_list, bpp); + + /* Initialize the new btree block. */ + xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block); + xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB); + + /* Set the out parameters. */ + *bpp = new_bp; + *blockp = new_block; + xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1); + return 0; +} + +/* Load one leaf block. */ +STATIC int +xfs_btree_bload_leaf( + struct xfs_btree_cur *cur, + unsigned int recs_this_block, + xfs_btree_bload_get_record_fn get_record, + struct xfs_btree_block *block, + void *priv) +{ + unsigned int j; + int ret; + + /* Fill the leaf block with records. */ + for (j = 1; j <= recs_this_block; j++) { + union xfs_btree_rec *block_rec; + + ret = get_record(cur, priv); + if (ret) + return ret; + block_rec = xfs_btree_rec_addr(cur, j, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return 0; +} + +/* + * Load one node block with key/ptr pairs. + * + * child_ptr must point to a block within the next level down in the tree. A + * key/ptr entry will be created in the new node block to the block pointed to + * by child_ptr. On exit, child_ptr points to the next block on the child + * level that needs processing. + */ +STATIC int +xfs_btree_bload_node( + struct xfs_btree_cur *cur, + unsigned int recs_this_block, + union xfs_btree_ptr *child_ptr, + struct xfs_btree_block *block) +{ + unsigned int j; + int ret; + + /* Fill the node block with keys and pointers. */ + for (j = 1; j <= recs_this_block; j++) { + union xfs_btree_key child_key; + union xfs_btree_ptr *block_ptr; + union xfs_btree_key *block_key; + struct xfs_btree_block *child_block; + struct xfs_buf *child_bp; + + ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr)); + + ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block, + &child_bp); + if (ret) + return ret; + + block_ptr = xfs_btree_ptr_addr(cur, j, block); + xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1); + + block_key = xfs_btree_key_addr(cur, j, block); + xfs_btree_get_keys(cur, child_block, &child_key); + xfs_btree_copy_keys(cur, block_key, &child_key, 1); + + xfs_btree_get_sibling(cur, child_block, child_ptr, + XFS_BB_RIGHTSIB); + xfs_buf_relse(child_bp); + } + + return 0; +} + +/* + * Compute the maximum number of records (or keyptrs) per block that we want to + * install at this level in the btree. Caller is responsible for having set + * @cur->bc_ino.forksize to the desired fork size, if appropriate. 
+ */ +STATIC unsigned int +xfs_btree_bload_max_npb( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level) +{ + unsigned int ret; + + if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs) + return cur->bc_ops->get_dmaxrecs(cur, level); + + ret = cur->bc_ops->get_maxrecs(cur, level); + if (level == 0) + ret -= bbl->leaf_slack; + else + ret -= bbl->node_slack; + return ret; +} + +/* + * Compute the desired number of records (or keyptrs) per block that we want to + * install at this level in the btree, which must be somewhere between minrecs + * and max_npb. The caller is free to install fewer records per block. + */ +STATIC unsigned int +xfs_btree_bload_desired_npb( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level) +{ + unsigned int npb = xfs_btree_bload_max_npb(cur, bbl, level); + + /* Root blocks are not subject to minrecs rules. */ + if (level == cur->bc_nlevels - 1) + return max(1U, npb); + + return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb); +} + +/* + * Compute the number of records to be stored in each block at this level and + * the number of blocks for this level. For leaf levels, we must populate an + * empty root block even if there are no records, so we have to have at least + * one block. + */ +STATIC void +xfs_btree_bload_level_geometry( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level, + uint64_t nr_this_level, + unsigned int *avg_per_block, + uint64_t *blocks, + uint64_t *blocks_with_extra) +{ + uint64_t npb; + uint64_t dontcare; + unsigned int desired_npb; + unsigned int maxnr; + + maxnr = cur->bc_ops->get_maxrecs(cur, level); + + /* + * Compute the number of blocks we need to fill each block with the + * desired number of records/keyptrs per block. Because desired_npb + * could be minrecs, we use regular integer division (which rounds + * the block count down) so that in the next step the effective # of + * items per block will never be less than desired_npb. + */ + desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level); + *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare); + *blocks = max(1ULL, *blocks); + + /* + * Compute the number of records that we will actually put in each + * block, assuming that we want to spread the records evenly between + * the blocks. Take care that the effective # of items per block (npb) + * won't exceed maxrecs even for the blocks that get an extra record, + * since desired_npb could be maxrecs, and in the previous step we + * rounded the block count down. + */ + npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); + if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) { + (*blocks)++; + npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); + } + + *avg_per_block = min_t(uint64_t, npb, nr_this_level); + + trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level, + *avg_per_block, desired_npb, *blocks, + *blocks_with_extra); +} + +/* + * Ensure a slack value is appropriate for the btree. + * + * If the slack value is negative, set slack so that we fill the block to + * halfway between minrecs and maxrecs. Make sure the slack is never so large + * that we can underflow minrecs. 
+ */
+static void
+xfs_btree_bload_ensure_slack(
+ struct xfs_btree_cur *cur,
+ int *slack,
+ int level)
+{
+ int maxr;
+ int minr;
+
+ maxr = cur->bc_ops->get_maxrecs(cur, level);
+ minr = cur->bc_ops->get_minrecs(cur, level);
+
+ /*
+ * If slack is negative, automatically set slack so that we load the
+ * btree block approximately halfway between minrecs and maxrecs.
+ * Generally, this will net us 75% loading.
+ */
+ if (*slack < 0)
+ *slack = maxr - ((maxr + minr) >> 1);
+
+ *slack = min(*slack, maxr - minr);
+}
+
+/*
+ * Prepare a btree cursor for a bulk load operation by computing the geometry
+ * fields in bbl. Caller must ensure that the btree cursor is a staging
+ * cursor. This function can be called multiple times.
+ */
+int
+xfs_btree_bload_compute_geometry(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ uint64_t nr_records)
+{
+ uint64_t nr_blocks = 0;
+ uint64_t nr_this_level;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ /*
+ * Make sure that the slack values make sense for traditional leaf and
+ * node blocks. Inode-rooted btrees will return different minrecs and
+ * maxrecs values for the root block (level == bc_nlevels - 1). We're
+ * checking levels 0 and 1 here, so set bc_nlevels such that the btree
+ * code doesn't interpret either as the root level.
+ */
+ cur->bc_nlevels = XFS_BTREE_MAXLEVELS - 1;
+ xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
+ xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
+
+ bbl->nr_records = nr_this_level = nr_records;
+ for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) {
+ uint64_t level_blocks;
+ uint64_t dontcare64;
+ unsigned int level = cur->bc_nlevels - 1;
+ unsigned int avg_per_block;
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &level_blocks, &dontcare64);
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ /*
+ * If all the items we want to store at this level
+ * would fit in the inode root block, then we have our
+ * btree root and are done.
+ *
+ * Note that bmap btrees forbid records in the root.
+ */
+ if (level != 0 && nr_this_level <= avg_per_block) {
+ nr_blocks++;
+ break;
+ }
+
+ /*
+ * Otherwise, we have to store all the items for this
+ * level in traditional btree blocks and therefore need
+ * another level of btree to point to those blocks.
+ *
+ * We have to re-compute the geometry for each level of
+ * an inode-rooted btree because the geometry differs
+ * between a btree root in an inode fork and a
+ * traditional btree block.
+ *
+ * This distinction is made in the btree code based on
+ * whether level == bc_nlevels - 1. Based on the
+ * previous root block size check against the root
+ * block geometry, we know that we aren't yet ready to
+ * populate the root. Increment bc_nlevels and
+ * recalculate the geometry for a traditional
+ * block-based btree level.
+ */
+ cur->bc_nlevels++;
+ xfs_btree_bload_level_geometry(cur, bbl, level,
+ nr_this_level, &avg_per_block,
+ &level_blocks, &dontcare64);
+ } else {
+ /*
+ * If all the items we want to store at this level
+ * would fit in a single root block, we're done.
+ */
+ if (nr_this_level <= avg_per_block) {
+ nr_blocks++;
+ break;
+ }
+
+ /* Otherwise, we need another level of btree. 
*/ + cur->bc_nlevels++; + } + + nr_blocks += level_blocks; + nr_this_level = level_blocks; + } + + if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS) + return -EOVERFLOW; + + bbl->btree_height = cur->bc_nlevels; + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + bbl->nr_blocks = nr_blocks - 1; + else + bbl->nr_blocks = nr_blocks; + return 0; +} + +/* Bulk load a btree given the parameters and geometry established in bbl. */ +int +xfs_btree_bload( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + void *priv) +{ + struct list_head buffers_list; + union xfs_btree_ptr child_ptr; + union xfs_btree_ptr ptr; + struct xfs_buf *bp = NULL; + struct xfs_btree_block *block = NULL; + uint64_t nr_this_level = bbl->nr_records; + uint64_t blocks; + uint64_t i; + uint64_t blocks_with_extra; + uint64_t total_blocks = 0; + unsigned int avg_per_block; + unsigned int level = 0; + int ret; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + INIT_LIST_HEAD(&buffers_list); + cur->bc_nlevels = bbl->btree_height; + xfs_btree_set_ptr_null(cur, &child_ptr); + xfs_btree_set_ptr_null(cur, &ptr); + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &blocks, &blocks_with_extra); + + /* Load each leaf block. */ + for (i = 0; i < blocks; i++) { + unsigned int nr_this_block = avg_per_block; + + /* + * Due to rounding, btree blocks will not be evenly populated + * in most cases. blocks_with_extra tells us how many blocks + * will receive an extra record to distribute the excess across + * the current level as evenly as possible. + */ + if (i < blocks_with_extra) + nr_this_block++; + + ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level, + nr_this_block, &ptr, &bp, &block, priv); + if (ret) + goto out; + + trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr, + nr_this_block); + + ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record, + block, priv); + if (ret) + goto out; + + /* + * Record the leftmost leaf pointer so we know where to start + * with the first node level. + */ + if (i == 0) + xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1); + } + total_blocks += blocks; + xfs_btree_bload_drop_buf(&buffers_list, &bp); + + /* Populate the internal btree nodes. */ + for (level = 1; level < cur->bc_nlevels; level++) { + union xfs_btree_ptr first_ptr; + + nr_this_level = blocks; + block = NULL; + xfs_btree_set_ptr_null(cur, &ptr); + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &blocks, &blocks_with_extra); + + /* Load each node block. */ + for (i = 0; i < blocks; i++) { + unsigned int nr_this_block = avg_per_block; + + if (i < blocks_with_extra) + nr_this_block++; + + ret = xfs_btree_bload_prep_block(cur, bbl, + &buffers_list, level, nr_this_block, + &ptr, &bp, &block, priv); + if (ret) + goto out; + + trace_xfs_btree_bload_block(cur, level, i, blocks, + &ptr, nr_this_block); + + ret = xfs_btree_bload_node(cur, nr_this_block, + &child_ptr, block); + if (ret) + goto out; + + /* + * Record the leftmost node pointer so that we know + * where to start the next node level above this one. + */ + if (i == 0) + xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1); + } + total_blocks += blocks; + xfs_btree_bload_drop_buf(&buffers_list, &bp); + xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1); + } + + /* Initialize the new root. 
*/ + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); + cur->bc_ino.ifake->if_levels = cur->bc_nlevels; + cur->bc_ino.ifake->if_blocks = total_blocks - 1; + } else { + cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s); + cur->bc_ag.afake->af_levels = cur->bc_nlevels; + cur->bc_ag.afake->af_blocks = total_blocks; + } + + /* + * Write the new blocks to disk. If the ordered list isn't empty after + * that, then something went wrong and we have to fail. This should + * never happen, but we'll check anyway. + */ + ret = xfs_buf_delwri_submit(&buffers_list); + if (ret) + goto out; + if (!list_empty(&buffers_list)) { + ASSERT(list_empty(&buffers_list)); + ret = -EIO; + } + +out: + xfs_buf_delwri_cancel(&buffers_list); + if (bp) + xfs_buf_relse(bp); + return ret; +} diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h new file mode 100644 index 000000000000..f0d2976050ae --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2020 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_BTREE_STAGING_H__ +#define __XFS_BTREE_STAGING_H__ + +/* Fake root for an AG-rooted btree. */ +struct xbtree_afakeroot { + /* AG block number of the new btree root. */ + xfs_agblock_t af_root; + + /* Height of the new btree. */ + unsigned int af_levels; + + /* Number of blocks used by the btree. */ + unsigned int af_blocks; +}; + +/* Cursor interactions with fake roots for AG-rooted btrees. */ +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur, + struct xbtree_afakeroot *afake); +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, + struct xfs_buf *agbp, const struct xfs_btree_ops *ops); + +/* Fake root for an inode-rooted btree. */ +struct xbtree_ifakeroot { + /* Fake inode fork. */ + struct xfs_ifork *if_fork; + + /* Number of blocks used by the btree. */ + int64_t if_blocks; + + /* Height of the new btree. */ + unsigned int if_levels; + + /* Number of bytes available for this fork in the inode. */ + unsigned int if_fork_size; + + /* Fork format. */ + unsigned int if_format; + + /* Number of records. */ + unsigned int if_extents; +}; + +/* Cursor interactions with fake roots for inode-rooted btrees. */ +void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur, + struct xbtree_ifakeroot *ifake, + struct xfs_btree_ops **new_ops); +void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, + int whichfork, const struct xfs_btree_ops *ops); + +/* Bulk loading of staged btrees. */ +typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv); +typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, void *priv); +typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, + unsigned int nr_this_level, void *priv); + +struct xfs_btree_bload { + /* + * This function will be called nr_records times to load records into + * the btree. The function does this by setting the cursor's bc_rec + * field in in-core format. Records must be returned in sort order. + */ + xfs_btree_bload_get_record_fn get_record; + + /* + * This function will be called nr_blocks times to obtain a pointer + * to a new btree block on disk. Callers must preallocate all space + * for the new btree before calling xfs_btree_bload, and this function + * is what claims that reservation. 
+ */ + xfs_btree_bload_claim_block_fn claim_block; + + /* + * This function should return the size of the in-core btree root + * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree + * types. + */ + xfs_btree_bload_iroot_size_fn iroot_size; + + /* + * The caller should set this to the number of records that will be + * stored in the new btree. + */ + uint64_t nr_records; + + /* + * Number of free records to leave in each leaf block. If the caller + * sets this to -1, the slack value will be calculated to be halfway + * between maxrecs and minrecs. This typically leaves the block 75% + * full. Note that slack values are not enforced on inode root blocks. + */ + int leaf_slack; + + /* + * Number of free key/ptrs pairs to leave in each node block. This + * field has the same semantics as leaf_slack. + */ + int node_slack; + + /* + * The xfs_btree_bload_compute_geometry function will set this to the + * number of btree blocks needed to store nr_records records. + */ + uint64_t nr_blocks; + + /* + * The xfs_btree_bload_compute_geometry function will set this to the + * height of the new btree. + */ + unsigned int btree_height; +}; + +int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, uint64_t nr_records); +int xfs_btree_bload(struct xfs_btree_cur *cur, struct xfs_btree_bload *bbl, + void *priv); + +#endif /* __XFS_BTREE_STAGING_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 875e04f82541..e46bc03365db 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -78,10 +78,16 @@ kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ * Allocate a dir-state structure. * We don't put them on the stack since they're large. */ -xfs_da_state_t * -xfs_da_state_alloc(void) +struct xfs_da_state * +xfs_da_state_alloc( + struct xfs_da_args *args) { - return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); + struct xfs_da_state *state; + + state = kmem_cache_zalloc(xfs_da_state_zone, GFP_NOFS | __GFP_NOFAIL); + state->args = args; + state->mp = args->dp->i_mount; + return state; } /* @@ -590,7 +596,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) { - xfs_buf_corruption_error(oldblk->bp); + xfs_buf_mark_corrupt(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -603,7 +609,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) { - xfs_buf_corruption_error(oldblk->bp); + xfs_buf_mark_corrupt(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -1624,7 +1630,7 @@ xfs_da3_node_lookup_int( } if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) { - xfs_buf_corruption_error(blk->bp); + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; } @@ -1639,7 +1645,7 @@ xfs_da3_node_lookup_int( /* Tree taller than we can handle; bail out! 
*/ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) { - xfs_buf_corruption_error(blk->bp); + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; } @@ -1647,7 +1653,7 @@ xfs_da3_node_lookup_int( if (blkno == args->geo->leafblk) expected_level = nodehdr.level - 1; else if (expected_level != nodehdr.level) { - xfs_buf_corruption_error(blk->bp); + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; } else expected_level--; @@ -1986,7 +1992,8 @@ xfs_da3_path_shift( ASSERT(path != NULL); ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); level = (path->active-1) - 1; /* skip bottom layer in path */ - for (blk = &path->blk[level]; level >= 0; blk--, level--) { + for (; level >= 0; level--) { + blk = &path->blk[level]; xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, blk->bp->b_addr); @@ -2520,8 +2527,10 @@ xfs_dabuf_map( */ if (nirecs > 1) { map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS); - if (!map) + if (!map) { + error = -ENOMEM; goto out_free_irecs; + } *mapp = map; } diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 0f4fbb0889ff..ad5dd324631a 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. @@ -57,9 +57,10 @@ typedef struct xfs_da_args { const uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ uint8_t filetype; /* filetype of inode for directories */ - uint8_t *value; /* set of bytes (maybe contain NULLs) */ + void *value; /* set of bytes (maybe contain NULLs) */ int valuelen; /* length of value */ - int flags; /* argument flags (eg: ATTR_NOCREATE) */ + unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */ + unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */ xfs_dahash_t hashval; /* hash value of name */ xfs_ino_t inumber; /* input/output inode number */ struct xfs_inode *dp; /* directory inode to manipulate */ @@ -88,8 +89,7 @@ typedef struct xfs_da_args { #define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ -#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ -#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */ +#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -97,8 +97,7 @@ typedef struct xfs_da_args { { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \ - { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" } + { XFS_DA_OP_NOTIME, "NOTIME" } /* * Storage for holding state during Btree searches and split/join ops. 
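For reference, the xfs_da_state_alloc() rework above folds the caller-side setup into the allocator itself; every conversion in the xfs_dir2_node.c hunks later in this diff reduces to the same pattern, sketched here with the names used in this patch:

	/* before: allocate, then hand-initialize the state */
	state = xfs_da_state_alloc();
	state->args = args;
	state->mp = args->dp->i_mount;

	/* after: the allocator derives both fields from the args */
	state = xfs_da_state_alloc(args);

The new allocator asks for GFP_NOFS | __GFP_NOFAIL directly, so the call still cannot return NULL and callers need no failure handling.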
@@ -220,7 +219,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, const unsigned char *name, int len); -xfs_da_state_t *xfs_da_state_alloc(void); +struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args); void xfs_da_state_free(xfs_da_state_t *state); void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 734837a9b51a..059ac108b1b3 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. @@ -692,19 +692,7 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) #define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) #define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) - -/* - * Conversion macros for converting namespace bits from argument flags - * to ondisk flags. - */ -#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) #define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) -#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) -#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) -#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ - ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) -#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ - ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) /* * Alignment for namelist and valuelist entries (since they are mixed diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 22557527cfdb..d8f586256add 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -178,6 +178,18 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, }; +static void +xfs_defer_create_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp, + bool sort) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + + dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, + dfp->dfp_count, sort); +} + /* * For each pending item in the intake list, log its intent item and the * associated extents, then add the entire intake list to the end of @@ -187,17 +199,11 @@ STATIC void xfs_defer_create_intents( struct xfs_trans *tp) { - struct list_head *li; struct xfs_defer_pending *dfp; - const struct xfs_defer_op_type *ops; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count); trace_xfs_defer_create_intent(tp->t_mountp, dfp); - list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items); - list_for_each(li, &dfp->dfp_work) - ops->log_item(tp, dfp->dfp_intent, li); + xfs_defer_create_intent(tp, dfp, true); } } @@ -234,10 +240,13 @@ xfs_defer_trans_roll( struct xfs_log_item *lip; struct xfs_buf *bplist[XFS_DEFER_OPS_NR_BUFS]; struct xfs_inode *iplist[XFS_DEFER_OPS_NR_INODES]; + unsigned int ordered = 0; /* bitmap */ int bpcount = 0, ipcount = 0; int i; int error; + BUILD_BUG_ON(NBBY * sizeof(ordered) < XFS_DEFER_OPS_NR_BUFS); + list_for_each_entry(lip, &tp->t_items, li_trans) { switch (lip->li_type) { case XFS_LI_BUF: @@ -248,7 +257,10 @@ xfs_defer_trans_roll( ASSERT(0); return -EFSCORRUPTED; } - xfs_trans_dirty_buf(tp, bli->bli_buf); + if (bli->bli_flags & XFS_BLI_ORDERED) + ordered |= (1U << bpcount); + else + xfs_trans_dirty_buf(tp, bli->bli_buf); 
bplist[bpcount++] = bli->bli_buf; } break; @@ -289,6 +301,8 @@ xfs_defer_trans_roll( /* Rejoin the buffers and dirty them so the log moves forward. */ for (i = 0; i < bpcount; i++) { xfs_trans_bjoin(tp, bplist[i]); + if (ordered & (1U << i)) + xfs_trans_ordered_buf(tp, bplist[i]); xfs_trans_bhold(tp, bplist[i]); } @@ -346,6 +360,53 @@ xfs_defer_cancel_list( } /* + * Log an intent-done item for the first pending intent, and finish the work + * items. + */ +static int +xfs_defer_finish_one( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct xfs_btree_cur *state = NULL; + struct list_head *li, *n; + int error; + + trace_xfs_defer_pending_finish(tp->t_mountp, dfp); + + dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + list_for_each_safe(li, n, &dfp->dfp_work) { + list_del(li); + dfp->dfp_count--; + error = ops->finish_item(tp, dfp->dfp_done, li, &state); + if (error == -EAGAIN) { + /* + * Caller wants a fresh transaction; put the work item + * back on the list and log a new log intent item to + * replace the old one. See "Requesting a Fresh + * Transaction while Finishing Deferred Work" above. + */ + list_add(li, &dfp->dfp_work); + dfp->dfp_count++; + dfp->dfp_done = NULL; + xfs_defer_create_intent(tp, dfp, false); + } + + if (error) + goto out; + } + + /* Done with the dfp, free it. */ + list_del(&dfp->dfp_list); + kmem_free(dfp); +out: + if (ops->finish_cleanup) + ops->finish_cleanup(tp, state, error); + return error; +} + +/* * Finish all the pending work. This involves logging intent items for * any work items that wandered in since the last transaction roll (if * one has even happened), rolling the transaction, and finishing the @@ -358,11 +419,7 @@ xfs_defer_finish_noroll( struct xfs_trans **tp) { struct xfs_defer_pending *dfp; - struct list_head *li; - struct list_head *n; - void *state; int error = 0; - const struct xfs_defer_op_type *ops; LIST_HEAD(dop_pending); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -371,87 +428,30 @@ xfs_defer_finish_noroll( /* Until we run out of pending work to finish... */ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { - /* log intents and pull in intake items */ xfs_defer_create_intents(*tp); list_splice_tail_init(&(*tp)->t_dfops, &dop_pending); - /* - * Roll the transaction. - */ error = xfs_defer_trans_roll(tp); if (error) - goto out; + goto out_shutdown; - /* Log an intent-done item for the first pending item. */ dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); - dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent, - dfp->dfp_count); - - /* Finish the work items. */ - state = NULL; - list_for_each_safe(li, n, &dfp->dfp_work) { - list_del(li); - dfp->dfp_count--; - error = ops->finish_item(*tp, li, dfp->dfp_done, - &state); - if (error == -EAGAIN) { - /* - * Caller wants a fresh transaction; - * put the work item back on the list - * and jump out. - */ - list_add(li, &dfp->dfp_work); - dfp->dfp_count++; - break; - } else if (error) { - /* - * Clean up after ourselves and jump out. - * xfs_defer_cancel will take care of freeing - * all these lists and stuff. 
- */ - if (ops->finish_cleanup) - ops->finish_cleanup(*tp, state, error); - goto out; - } - } - if (error == -EAGAIN) { - /* - * Caller wants a fresh transaction, so log a - * new log intent item to replace the old one - * and roll the transaction. See "Requesting - * a Fresh Transaction while Finishing - * Deferred Work" above. - */ - dfp->dfp_intent = ops->create_intent(*tp, - dfp->dfp_count); - dfp->dfp_done = NULL; - list_for_each(li, &dfp->dfp_work) - ops->log_item(*tp, dfp->dfp_intent, li); - } else { - /* Done with the dfp, free it. */ - list_del(&dfp->dfp_list); - kmem_free(dfp); - } - - if (ops->finish_cleanup) - ops->finish_cleanup(*tp, state, error); - } - -out: - if (error) { - xfs_defer_trans_abort(*tp, &dop_pending); - xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); - trace_xfs_defer_finish_error(*tp, error); - xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); - xfs_defer_cancel(*tp); - return error; + error = xfs_defer_finish_one(*tp, dfp); + if (error && error != -EAGAIN) + goto out_shutdown; } trace_xfs_defer_finish_done(*tp, _RET_IP_); return 0; + +out_shutdown: + xfs_defer_trans_abort(*tp, &dop_pending); + xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); + trace_xfs_defer_finish_error(*tp, error); + xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); + xfs_defer_cancel(*tp); + return error; } int diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 7c28d7608ac6..6b2ca580f2b0 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> @@ -6,6 +6,7 @@ #ifndef __XFS_DEFER_H__ #define __XFS_DEFER_H__ +struct xfs_btree_cur; struct xfs_defer_op_type; /* @@ -28,8 +29,8 @@ enum xfs_defer_ops_type { struct xfs_defer_pending { struct list_head dfp_list; /* pending items */ struct list_head dfp_work; /* work items */ - void *dfp_intent; /* log intent item */ - void *dfp_done; /* log done item */ + struct xfs_log_item *dfp_intent; /* log intent item */ + struct xfs_log_item *dfp_done; /* log done item */ unsigned int dfp_count; /* # extent items */ enum xfs_defer_ops_type dfp_type; }; @@ -43,15 +44,16 @@ void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. 
*/ struct xfs_defer_op_type { - void (*abort_intent)(void *); - void *(*create_done)(struct xfs_trans *, void *, unsigned int); - int (*finish_item)(struct xfs_trans *, struct list_head *, void *, - void **); - void (*finish_cleanup)(struct xfs_trans *, void *, int); - void (*cancel_item)(struct list_head *); - int (*diff_items)(void *, struct list_head *, struct list_head *); - void *(*create_intent)(struct xfs_trans *, uint); - void (*log_item)(struct xfs_trans *, void *, struct list_head *); + struct xfs_log_item *(*create_intent)(struct xfs_trans *tp, + struct list_head *items, unsigned int count, bool sort); + void (*abort_intent)(struct xfs_log_item *intent); + struct xfs_log_item *(*create_done)(struct xfs_trans *tp, + struct xfs_log_item *intent, unsigned int count); + int (*finish_item)(struct xfs_trans *tp, struct xfs_log_item *done, + struct list_head *item, struct xfs_btree_cur **state); + void (*finish_cleanup)(struct xfs_trans *tp, + struct xfs_btree_cur *state, int error); + void (*cancel_item)(struct list_head *item); unsigned int max_items; }; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index dd6fcaaea318..612a9c5e41b1 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -278,7 +278,7 @@ xfs_dir_createname( if (!inum) args->op_flags |= XFS_DA_OP_JUSTCHECK; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_addname(args); goto out_free; } @@ -373,7 +373,7 @@ xfs_dir_lookup( args->op_flags |= XFS_DA_OP_CILOOKUP; lock_mode = xfs_ilock_data_map_shared(dp); - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_lookup(args); goto out_check_rval; } @@ -443,7 +443,7 @@ xfs_dir_removename( args->whichfork = XFS_DATA_FORK; args->trans = tp; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_removename(args); goto out_free; } @@ -504,7 +504,7 @@ xfs_dir_replace( args->whichfork = XFS_DATA_FORK; args->trans = tp; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_replace(args); goto out_free; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 033777e282f2..e55378640b05 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. 
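The retyped callbacks in struct xfs_defer_op_type above replace the old void pointers with struct xfs_log_item and struct xfs_btree_cur pointers, and fold the former log_item/diff_items hooks into create_intent's sort flag. A minimal sketch of a provider under the new interface (the xfs_foo_* names are hypothetical placeholders; the real instances are the extent-free, rmap, refcount, and bmap defer types, with only the two required hooks shown):

	static struct xfs_log_item *
	xfs_foo_create_intent(struct xfs_trans *tp, struct list_head *items,
			unsigned int count, bool sort)
	{
		/* Would sort the work items if asked, then log and return
		 * one intent item covering all of them. */
		return NULL;	/* placeholder */
	}

	static int
	xfs_foo_finish_item(struct xfs_trans *tp, struct xfs_log_item *done,
			struct list_head *item, struct xfs_btree_cur **state)
	{
		/* Would process one work item against the done item;
		 * returning -EAGAIN asks xfs_defer_finish_one() above for a
		 * fresh transaction and a replacement intent. */
		return 0;	/* placeholder */
	}

	static const struct xfs_defer_op_type xfs_foo_defer_type = {
		.max_items	= 16,	/* assumed batch size */
		.create_intent	= xfs_foo_create_intent,
		.finish_item	= xfs_foo_finish_item,
	};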
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index d6ced59b9567..5b59d3f7746b 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -114,6 +114,23 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .verify_struct = xfs_dir3_block_verify, }; +static xfs_failaddr_t +xfs_dir3_block_header_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (be64_to_cpu(hdr3->owner) != dp->i_ino) + return __this_address; + } + + return NULL; +} + int xfs_dir3_block_read( struct xfs_trans *tp, @@ -121,12 +138,24 @@ xfs_dir3_block_read( struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); - if (!err && tp && *bpp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. */ + fa = xfs_dir3_block_header_check(dp, *bpp); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + return -EFSCORRUPTED; + } + + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); return err; } @@ -1075,7 +1104,7 @@ xfs_dir2_sf_to_block( ASSERT(ifp->if_bytes == dp->i_d.di_size); ASSERT(ifp->if_u1.if_data != NULL); ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); - ASSERT(dp->i_d.di_nextents == 0); + ASSERT(dp->i_df.if_nextents == 0); /* * Copy the directory into a temporary buffer. diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index b9eba8213180..375b3edb2ad2 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -394,6 +394,22 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .verify_write = xfs_dir3_data_write_verify, }; +static xfs_failaddr_t +xfs_dir3_data_header_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; + + if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + return __this_address; + } + + return NULL; +} int xfs_dir3_data_read( @@ -403,12 +419,24 @@ xfs_dir3_data_read( unsigned int flags, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, bno, flags, bpp, XFS_DATA_FORK, &xfs_dir3_data_buf_ops); - if (!err && tp && *bpp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. 
*/ + fa = xfs_dir3_data_header_check(dp, *bpp); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + return -EFSCORRUPTED; + } + + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index a131b520aac7..95d2a3f92d75 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -1383,7 +1383,7 @@ xfs_dir2_leaf_removename( ltp = xfs_dir2_leaf_tail_p(geo, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[db]) != oldbest) { - xfs_buf_corruption_error(lbp); + xfs_buf_mark_corrupt(lbp); return -EFSCORRUPTED; } /* diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index a0cc5e240306..5d51265d29d6 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -194,6 +194,8 @@ xfs_dir3_free_header_check( return __this_address; if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) return __this_address; + if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + return __this_address; } else { struct xfs_dir2_free_hdr *hdr = bp->b_addr; @@ -226,8 +228,9 @@ __xfs_dir3_free_read( /* Check things that we can't do in the verifier. */ fa = xfs_dir3_free_header_check(dp, fbno, *bpp); if (fa) { - xfs_verifier_error(*bpp, -EFSCORRUPTED, fa); + __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); + *bpp = NULL; return -EFSCORRUPTED; } @@ -439,7 +442,7 @@ xfs_dir2_leaf_to_node( ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); if (be32_to_cpu(ltp->bestcount) > (uint)dp->i_d.di_size / args->geo->blksize) { - xfs_buf_corruption_error(lbp); + xfs_buf_mark_corrupt(lbp); return -EFSCORRUPTED; } @@ -513,7 +516,7 @@ xfs_dir2_leafn_add( * into other peoples memory */ if (index < 0) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -800,7 +803,7 @@ xfs_dir2_leafn_lookup_for_entry( xfs_dir3_leaf_check(dp, bp); if (leafhdr.count <= 0) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -2012,9 +2015,7 @@ xfs_dir2_node_addname( /* * Allocate and initialize the state (btree cursor). */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; + state = xfs_da_state_alloc(args); /* * Look up the name. We're not supposed to find it, but * this gives us the insertion point. @@ -2083,9 +2084,8 @@ xfs_dir2_node_lookup( /* * Allocate and initialize the btree cursor. */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; + state = xfs_da_state_alloc(args); + /* * Fill in the path to the entry in the cursor. */ @@ -2136,9 +2136,7 @@ xfs_dir2_node_removename( /* * Allocate and initialize the btree cursor. */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; + state = xfs_da_state_alloc(args); /* Look up the entry we're deleting, set up the cursor. */ error = xfs_da3_node_lookup_int(state, &rval); @@ -2203,9 +2201,7 @@ xfs_dir2_node_replace( /* * Allocate and initialize the btree cursor. 
*/ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; + state = xfs_da_state_alloc(args); /* * We have to save new inode number and ftype since diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 01ee0b926572..44c6a77cba05 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 7b7f6fb2ea3b..2463b5d73447 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -343,7 +343,7 @@ xfs_dir2_block_to_sf( */ ASSERT(dp->i_df.if_bytes == 0); xfs_init_local_fork(dp, XFS_DATA_FORK, sfp, size); - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + dp->i_df.if_format = XFS_DINODE_FMT_LOCAL; dp->i_d.di_size = size; logflags |= XFS_ILOG_DDATA; @@ -710,11 +710,11 @@ xfs_dir2_sf_verify( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; char *endp; - struct xfs_ifork *ifp; xfs_ino_t ino; int i; int i8count; @@ -723,9 +723,8 @@ xfs_dir2_sf_verify( int error; uint8_t filetype; - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; size = ifp->if_bytes; @@ -827,9 +826,9 @@ xfs_dir2_sf_create( * If it's currently a zero-length extent file, * convert it to local format. */ - if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) { + if (dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS) { dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */ - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + dp->i_df.if_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); dp->i_df.if_flags |= XFS_IFINLINE; } @@ -1027,7 +1026,7 @@ xfs_dir2_sf_replace_needblock( int newsize; struct xfs_dir2_sf_hdr *sfp; - if (dp->i_d.di_format != XFS_DINODE_FMT_LOCAL) + if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL) return false; sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index bedc1e752b60..5a2db00b9d5f 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -37,9 +37,10 @@ xfs_failaddr_t xfs_dquot_verify( struct xfs_mount *mp, struct xfs_disk_dquot *ddq, - xfs_dqid_t id, - uint type) /* used only during quotacheck */ + xfs_dqid_t id) /* used only during quotacheck */ { + __u8 ddq_type; + /* * We can encounter an uninitialized dquot buffer for 2 reasons: * 1. 
If we crash while deleting the quotainode(s), and those blks got @@ -60,11 +61,12 @@ xfs_dquot_verify( if (ddq->d_version != XFS_DQUOT_VERSION) return __this_address; - if (type && ddq->d_flags != type) + if (ddq->d_type & ~XFS_DQTYPE_ANY) return __this_address; - if (ddq->d_flags != XFS_DQ_USER && - ddq->d_flags != XFS_DQ_PROJ && - ddq->d_flags != XFS_DQ_GROUP) + ddq_type = ddq->d_type & XFS_DQTYPE_REC_MASK; + if (ddq_type != XFS_DQTYPE_USER && + ddq_type != XFS_DQTYPE_PROJ && + ddq_type != XFS_DQTYPE_GROUP) return __this_address; if (id != -1 && id != be32_to_cpu(ddq->d_id)) @@ -95,14 +97,13 @@ xfs_failaddr_t xfs_dqblk_verify( struct xfs_mount *mp, struct xfs_dqblk *dqb, - xfs_dqid_t id, - uint type) /* used only during quotacheck */ + xfs_dqid_t id) /* used only during quotacheck */ { if (xfs_sb_version_hascrc(&mp->m_sb) && !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type); + return xfs_dquot_verify(mp, &dqb->dd_diskdq, id); } /* @@ -113,7 +114,7 @@ xfs_dqblk_repair( struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, - uint type) + xfs_dqtype_t type) { /* * Typically, a repair is only requested by quotacheck. @@ -123,7 +124,7 @@ xfs_dqblk_repair( dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION; - dqb->dd_diskdq.d_flags = type; + dqb->dd_diskdq.d_type = type; dqb->dd_diskdq.d_id = cpu_to_be32(id); if (xfs_sb_version_hascrc(&mp->m_sb)) { @@ -205,7 +206,7 @@ xfs_dquot_buf_verify( if (i == 0) id = be32_to_cpu(ddq->d_id); - fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0); + fa = xfs_dqblk_verify(mp, &dqb[i], id + i); if (fa) { if (!readahead) xfs_buf_verifier_error(bp, -EFSCORRUPTED, diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 79e6c4fb1d8a..53b305dea381 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * Copyright (C) 2017 Oracle. @@ -55,7 +55,8 @@ #define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33 #define XFS_ERRTAG_IUNLINK_FALLBACK 34 -#define XFS_ERRTAG_MAX 35 +#define XFS_ERRTAG_BUF_IOERROR 35 +#define XFS_ERRTAG_MAX 36 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -95,5 +96,6 @@ #define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 #define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 #define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 77e9fa385980..31b7ece985bb 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -497,6 +497,23 @@ static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp) return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } +/* + * v5 file systems support V3 inodes only, earlier file systems support + * v2 and v1 inodes. 
+ */ +static inline bool xfs_sb_version_has_v3inode(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline bool xfs_dinode_good_version(struct xfs_sb *sbp, + uint8_t version) +{ + if (xfs_sb_version_has_v3inode(sbp)) + return version == 3; + return version == 1 || version == 2; +} + static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; @@ -560,7 +577,6 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) -#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ @@ -707,7 +723,6 @@ typedef struct xfs_agf { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) -#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) /* * Size of the unlinked inode hash table in the agi. @@ -775,7 +790,6 @@ typedef struct xfs_agi { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) #define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) -#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) /* * The third a.g. block contains the a.g. freelist, an array @@ -783,21 +797,15 @@ typedef struct xfs_agi { */ #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) -#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) - -#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ - &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ - (__be32 *)(bp)->b_addr) +#define XFS_BUF_TO_AGFL(bp) ((struct xfs_agfl *)((bp)->b_addr)) -typedef struct xfs_agfl { +struct xfs_agfl { __be32 agfl_magicnum; __be32 agfl_seqno; uuid_t agfl_uuid; __be64 agfl_lsn; __be32 agfl_crc; - __be32 agfl_bno[]; /* actually xfs_agfl_size(mp) */ -} __attribute__((packed)) xfs_agfl_t; +} __attribute__((packed)); #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) @@ -946,23 +954,22 @@ enum xfs_dinode_fmt { /* * Inode size for given fs. */ -#define XFS_LITINO(mp, version) \ - ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) +#define XFS_DINODE_SIZE(sbp) \ + (xfs_sb_version_has_v3inode(sbp) ? \ + sizeof(struct xfs_dinode) : \ + offsetof(struct xfs_dinode, di_crc)) +#define XFS_LITINO(mp) \ + ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(&(mp)->m_sb)) /* * Inode data & attribute fork sizes, per inode. */ -#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) #define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) #define XFS_DFORK_DSIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_DFORK_BOFF(dip) : \ - XFS_LITINO(mp, (dip)->di_version)) + ((dip)->di_forkoff ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp)) #define XFS_DFORK_ASIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ - 0) + ((dip)->di_forkoff ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0) #define XFS_DFORK_SIZE(dip,mp,w) \ ((w) == XFS_DATA_FORK ? 
\ XFS_DFORK_DSIZE(dip, mp) : \ @@ -1142,16 +1149,26 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ #define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ +#define XFS_DQTYPE_USER 0x01 /* user dquot record */ +#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */ +#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */ + +/* bitmask to determine if this is a user/group/project dquot */ +#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ + XFS_DQTYPE_PROJ | \ + XFS_DQTYPE_GROUP) + +#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK) + /* - * This is the main portion of the on-disk representation of quota - * information for a user. This is the q_core of the struct xfs_dquot that - * is kept in kernel memory. We pad this with some more expansion room - * to construct the on disk structure. + * This is the main portion of the on-disk representation of quota information + * for a user. We pad this with some more expansion room to construct the on + * disk structure. */ struct xfs_disk_dquot { __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ __u8 d_version; /* dquot version */ - __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ + __u8 d_type; /* XFS_DQTYPE_USER/PROJ/GROUP */ __be32 d_id; /* user,project,group id */ __be64 d_blk_hardlimit;/* absolute limit on disk blks */ __be64 d_blk_softlimit;/* preferred limit on disk blks */ @@ -1192,6 +1209,22 @@ typedef struct xfs_dqblk { #define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) /* + * This defines the unit of allocation of dquots. + * + * Currently, it is just one file system block, and a 4K blk contains 30 + * (136 * 30 = 4080) dquots. It's probably not worth trying to make + * this more dynamic. + * + * However, if this number is changed, we have to make sure that we don't + * implicitly assume that we do allocations in chunks of a single filesystem + * block in the dquot/xqm code. + * + * This is part of the ondisk format because the structure size is not a power + * of two, which leaves slack at the end of the disk block. + */ +#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 + +/* * Remote symlink format and access functions. */ #define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */ @@ -1673,7 +1706,7 @@ struct xfs_acl_entry { struct xfs_acl { __be32 acl_cnt; - struct xfs_acl_entry acl_entry[0]; + struct xfs_acl_entry acl_entry[]; }; /* diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index ef95ca07d084..84bcffa87753 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: LGPL-2.1 +/* SPDX-License-Identifier: LGPL-2.1 */ /* * Copyright (c) 1995-2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -568,10 +568,40 @@ typedef struct xfs_fsop_setdm_handlereq { struct fsdmidata __user *data; /* DMAPI data */ } xfs_fsop_setdm_handlereq_t; +/* + * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface. + * + * NOTE: Must match the values declared in libattr without the XFS_IOC_ prefix. + */ +#define XFS_IOC_ATTR_ROOT 0x0002 /* use attrs in root namespace */ +#define XFS_IOC_ATTR_SECURE 0x0008 /* use attrs in security namespace */ +#define XFS_IOC_ATTR_CREATE 0x0010 /* fail if attr already exists */ +#define XFS_IOC_ATTR_REPLACE 0x0020 /* fail if attr does not exist */ + typedef struct xfs_attrlist_cursor { __u32 opaque[4]; } xfs_attrlist_cursor_t; +/* + * Define how lists of attribute names are returned to userspace from the + * XFS_IOC_ATTRLIST_BY_HANDLE ioctl. 
struct xfs_attrlist is the header at the
+ * beginning of the returned buffer, and each entry in al_offset contains the
+ * relative offset of an xfs_attrlist_ent containing the actual entry.
+ *
+ * NOTE: struct xfs_attrlist must match struct attrlist defined in libattr, and
+ * struct xfs_attrlist_ent must match struct attrlist_ent defined in libattr.
+ */
+struct xfs_attrlist {
+ __s32 al_count; /* number of entries in attrlist */
+ __s32 al_more; /* T/F: more attrs (do call again) */
+ __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
+};
+
+struct xfs_attrlist_ent { /* data from attr_list() */
+ __u32 a_valuelen; /* number of bytes in value of attr */
+ char a_name[1]; /* attr name (NULL terminated) */
+};
+
 typedef struct xfs_fsop_attrlist_handlereq {
 	struct xfs_fsop_handlereq hreq; /* handle interface structure */
 	struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
@@ -589,7 +619,7 @@ typedef struct xfs_attr_multiop {
 	void __user *am_attrname;
 	void __user *am_attrvalue;
 	__u32 am_length;
-	__u32 am_flags;
+	__u32 am_flags; /* XFS_IOC_ATTR_* */
 } xfs_attr_multiop_t;
 
 typedef struct xfs_fsop_attrmulti_handlereq {
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 272005ac8c88..99e796256c5d 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0+
+/* SPDX-License-Identifier: GPL-2.0+ */
 /*
  * Copyright (C) 2019 Oracle. All Rights Reserved.
  * Author: Darrick J. Wong <darrick.wong@oracle.com>
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index bf161e930f1d..f742a96a2fe1 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -105,7 +105,7 @@ xfs_inobt_get_rec(
 	int *stat)
 {
 	struct xfs_mount *mp = cur->bc_mp;
-	xfs_agnumber_t agno = cur->bc_private.a.agno;
+	xfs_agnumber_t agno = cur->bc_ag.agno;
 	union xfs_btree_rec *rec;
 	int error;
 	uint64_t realfree;
@@ -177,7 +177,7 @@ xfs_inobt_insert(
 	xfs_btnum_t btnum)
 {
 	struct xfs_btree_cur *cur;
-	struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+	struct xfs_agi *agi = agbp->b_addr;
 	xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
 	xfs_agino_t thisino;
 	int i;
@@ -304,7 +304,7 @@ xfs_ialloc_inode_init(
 	 * That means for v3 inode we log the entire buffer rather than just the
 	 * inode cores.
 	 */
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+	if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
 		version = 3;
 		ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
@@ -339,7 +339,7 @@ xfs_ialloc_inode_init(
 	xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
 	for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
 		int ioffset = i << mp->m_sb.sb_inodelog;
-		uint isize = xfs_dinode_size(version);
+		uint isize = XFS_DINODE_SIZE(&mp->m_sb);
 		free = xfs_make_iptr(mp, fbuf, i);
 		free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
@@ -525,7 +525,7 @@ xfs_inobt_insert_sprec(
 	bool merge) /* merge or replace */
 {
 	struct xfs_btree_cur *cur;
-	struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+	struct xfs_agi *agi = agbp->b_addr;
 	xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
 	int error;
 	int i;
@@ -658,7 +658,7 @@ xfs_ialloc_ag_alloc(
 	 * chunk of inodes. If the filesystem is striped, this will fill
 	 * an entire stripe unit with inodes. 
*/ - agi = XFS_BUF_TO_AGI(agbp); + agi = agbp->b_addr; newino = be32_to_cpu(agi->agi_newino); agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + @@ -888,10 +888,9 @@ sparse_alloc: */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); - pag = xfs_perag_get(args.mp, agno); + pag = agbp->b_pag; pag->pagi_freecount += newlen; pag->pagi_count += newlen; - xfs_perag_put(pag); agi->agi_newino = cpu_to_be32(newino); /* @@ -1130,11 +1129,11 @@ xfs_dialloc_ag_inobt( xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); - struct xfs_perag *pag; + struct xfs_perag *pag = agbp->b_pag; struct xfs_btree_cur *cur, *tcur; struct xfs_inobt_rec_incore rec, trec; xfs_ino_t ino; @@ -1143,8 +1142,6 @@ xfs_dialloc_ag_inobt( int i, j; int searchdistance = 10; - pag = xfs_perag_get(mp, agno); - ASSERT(pag->pagi_init); ASSERT(pag->pagi_inodeok); ASSERT(pag->pagi_freecount > 0); @@ -1384,14 +1381,12 @@ alloc_inode: xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); - xfs_perag_put(pag); *inop = ino; return 0; error1: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); error0: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_perag_put(pag); return error; } @@ -1583,11 +1578,10 @@ xfs_dialloc_ag( xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); - struct xfs_perag *pag; struct xfs_btree_cur *cur; /* finobt cursor */ struct xfs_btree_cur *icur; /* inobt cursor */ struct xfs_inobt_rec_incore rec; @@ -1599,8 +1593,6 @@ xfs_dialloc_ag( if (!xfs_sb_version_hasfinobt(&mp->m_sb)) return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); - pag = xfs_perag_get(mp, agno); - /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. 
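The xfs_ialloc.c hunks on either side of this point all make the same substitution: instead of bracketing each use with xfs_perag_get()/xfs_perag_put(), they borrow the per-AG reference that the AGI buffer already caches in b_pag. Schematically, with the names from this diff:

	/* before: take and drop a perag reference at every call site */
	pag = xfs_perag_get(mp, agno);
	pag->pagi_freecount--;
	xfs_perag_put(pag);

	/* after: reuse the reference held by the AGI buffer */
	agbp->b_pag->pagi_freecount--;

The buffer holds its perag reference for as long as the buffer itself is held, so the explicit get/put pairs become redundant.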
@@ -1667,7 +1659,7 @@ xfs_dialloc_ag( */ be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - pag->pagi_freecount--; + agbp->b_pag->pagi_freecount--; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); @@ -1680,7 +1672,6 @@ xfs_dialloc_ag( xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - xfs_perag_put(pag); *inop = ino; return 0; @@ -1688,7 +1679,6 @@ error_icur: xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); error_cur: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_perag_put(pag); return error; } @@ -1943,9 +1933,8 @@ xfs_difree_inobt( struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); - struct xfs_perag *pag; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int ilen; @@ -2007,6 +1996,8 @@ xfs_difree_inobt( if (!(mp->m_flags & XFS_MOUNT_IKEEP) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { + struct xfs_perag *pag = agbp->b_pag; + xic->deleted = true; xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); @@ -2020,10 +2011,8 @@ xfs_difree_inobt( be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); - pag = xfs_perag_get(mp, agno); pag->pagi_freecount -= ilen - 1; pag->pagi_count -= ilen; - xfs_perag_put(pag); xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); @@ -2049,9 +2038,7 @@ xfs_difree_inobt( */ be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - pag = xfs_perag_get(mp, agno); - pag->pagi_freecount++; - xfs_perag_put(pag); + agbp->b_pag->pagi_freecount++; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } @@ -2079,7 +2066,7 @@ xfs_difree_finobt( xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; @@ -2489,9 +2476,8 @@ xfs_ialloc_log_agi( sizeof(xfs_agi_t) }; #ifdef DEBUG - xfs_agi_t *agi; /* allocation group header */ + struct xfs_agi *agi = bp->b_addr; - agi = XFS_BUF_TO_AGI(bp); ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif @@ -2523,14 +2509,13 @@ xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; - struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + struct xfs_agi *agi = bp->b_addr; int i; if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (!xfs_log_check_lsn(mp, - be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn))) + if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn))) return __this_address; } @@ -2593,6 +2578,7 @@ xfs_agi_write_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_agi *agi = bp->b_addr; xfs_failaddr_t fa; fa = xfs_agi_verify(bp); @@ -2605,7 +2591,7 @@ xfs_agi_write_verify( return; if (bip) - XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); + agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF); } @@ -2661,8 +2647,8 @@ xfs_ialloc_read_agi( if (error) return error; - agi = XFS_BUF_TO_AGI(*bpp); - pag = xfs_perag_get(mp, agno); + agi = 
(*bpp)->b_addr; + pag = (*bpp)->b_pag; if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); @@ -2675,7 +2661,6 @@ xfs_ialloc_read_agi( */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || XFS_FORCED_SHUTDOWN(mp)); - xfs_perag_put(pag); return 0; } @@ -2873,7 +2858,7 @@ xfs_ialloc_setup_geometry( * cannot change the behavior. */ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { int new_size = igeo->inode_cluster_size_raw; new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index b82992f795aa..3c8aebc36e64 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -12,6 +12,7 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" @@ -20,7 +21,6 @@ #include "xfs_trans.h" #include "xfs_rmap.h" - STATIC int xfs_inobt_get_minrecs( struct xfs_btree_cur *cur, @@ -34,7 +34,7 @@ xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_ag.agbp, cur->bc_ag.agno, cur->bc_btnum); } @@ -44,8 +44,8 @@ xfs_inobt_set_root( union xfs_btree_ptr *nptr, int inc) /* level change */ { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; agi->agi_root = nptr->s; be32_add_cpu(&agi->agi_level, inc); @@ -58,8 +58,8 @@ xfs_finobt_set_root( union xfs_btree_ptr *nptr, int inc) /* level change */ { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; agi->agi_free_root = nptr->s; be32_add_cpu(&agi->agi_free_level, inc); @@ -83,7 +83,7 @@ __xfs_inobt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.oinfo = XFS_RMAP_OINFO_INOBT; - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.agno, sbno); args.minlen = 1; args.maxlen = 1; args.prod = 1; @@ -212,9 +212,9 @@ xfs_inobt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_root; } @@ -224,9 +224,9 @@ xfs_finobt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_free_root; } @@ -400,32 +400,27 @@ static const struct xfs_btree_ops xfs_finobt_ops = { }; /* - * Allocate a new inode btree cursor. + * Initialize a new inode btree cursor. 
*/ -struct xfs_btree_cur * /* new inode btree cursor */ -xfs_inobt_init_cursor( +static struct xfs_btree_cur * +xfs_inobt_init_common( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for agi structure */ xfs_agnumber_t agno, /* allocation group number */ xfs_btnum_t btnum) /* ialloc or free ino btree */ { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); struct xfs_btree_cur *cur; - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); - + cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = btnum; if (btnum == XFS_BTNUM_INO) { - cur->bc_nlevels = be32_to_cpu(agi->agi_level); - cur->bc_ops = &xfs_inobt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); + cur->bc_ops = &xfs_inobt_ops; } else { - cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); - cur->bc_ops = &xfs_finobt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); + cur->bc_ops = &xfs_finobt_ops; } cur->bc_blocklog = mp->m_sb.sb_blocklog; @@ -433,12 +428,75 @@ xfs_inobt_init_cursor( if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; + cur->bc_ag.agno = agno; + return cur; +} + +/* Create an inode btree cursor. */ +struct xfs_btree_cur * +xfs_inobt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = agbp->b_addr; + cur = xfs_inobt_init_common(mp, tp, agno, btnum); + if (btnum == XFS_BTNUM_INO) + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + else + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + cur->bc_ag.agbp = agbp; return cur; } +/* Create an inode btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_inobt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + + cur = xfs_inobt_init_common(mp, NULL, agno, btnum); + xfs_btree_stage_afakeroot(cur, afake); + return cur; +} + +/* + * Install a new inobt btree root. Caller is responsible for invalidating + * and freeing the old btree blocks. + */ +void +xfs_inobt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agi *agi = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + if (cur->bc_btnum == XFS_BTNUM_INO) { + agi->agi_root = cpu_to_be32(afake->af_root); + agi->agi_level = cpu_to_be32(afake->af_levels); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops); + } else { + agi->agi_free_root = cpu_to_be32(afake->af_root); + agi->agi_free_level = cpu_to_be32(afake->af_levels); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREE_ROOT | + XFS_AGI_FREE_LEVEL); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops); + } +} + /* * Calculate number of records in an inobt btree block. 
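To see how the staging pieces above are meant to fit together: a rebuild creates a cursor against a fake root, bulk-loads records into fresh blocks, then commits the fake root into the AGI. This is a sketch only, assuming the xfs_btree_bload machinery from xfs_btree_staging.h does the loading; rebuild_inobt() is hypothetical, and the real consumer is online repair:

/* Hypothetical rebuild sequence using the staging cursor. */
static void
rebuild_inobt(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_agnumber_t		agno)
{
	struct xbtree_afakeroot	afake = { .af_root = NULLAGBLOCK };
	struct xfs_btree_cur	*cur;

	cur = xfs_inobt_stage_cursor(mp, &afake, agno, XFS_BTNUM_INO);
	/*
	 * ... bulk-load the new records here; the loader fills in
	 * afake.af_root and afake.af_levels as it writes btree blocks ...
	 */
	xfs_inobt_commit_staged_btree(cur, tp, agbp);
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
}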
*/ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 951305ecaae1..35bbd978c272 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -48,6 +48,9 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, xfs_btnum_t); +struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno, + xfs_btnum_t btnum); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ @@ -68,4 +71,7 @@ int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_btnum_t btnum, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp); +void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 8afacfe4be0a..8d5dd08eab75 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -21,41 +21,6 @@ #include <linux/iversion.h> /* - * Check that none of the inode's in the buffer have a next - * unlinked field of 0. - */ -#if defined(DEBUG) -void -xfs_inobp_check( - xfs_mount_t *mp, - xfs_buf_t *bp) -{ - int i; - xfs_dinode_t *dip; - - for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { - dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize); - if (!dip->di_next_unlinked) { - xfs_alert(mp, - "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", - i, (long long)bp->b_bn); - } - } -} -#endif - -bool -xfs_dinode_good_version( - struct xfs_mount *mp, - __u8 version) -{ - if (xfs_sb_version_hascrc(&mp->m_sb)) - return version == 3; - - return version == 1 || version == 2; -} - -/* * If we are doing readahead on an inode buffer, we might be in log recovery * reading an inode allocation buffer that hasn't yet been replayed, and hence * has not had the inode cores stamped into it. Hence for readahead, the buffer @@ -64,10 +29,10 @@ xfs_dinode_good_version( * If the readahead buffer is invalid, we need to mark it with an error and * clear the DONE status of the buffer so that a followup read will re-read it * from disk. We don't report the error otherwise to avoid warnings during log - * recovery and we don't get unnecssary panics on debug kernels. We use EIO here + * recovery and we don't get unnecessary panics on debug kernels. We use EIO here * because all we want to do is say readahead failed; there is no-one to report * the error to, so this will distinguish it from a non-ra verifier failure. - * Changes to this readahead error behavour also need to be reflected in + * Changes to this readahead error behaviour also need to be reflected in * xfs_dquot_buf_readahead_verify(). 
*/ static void @@ -93,7 +58,7 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = xfs_verify_magic16(bp, dip->di_magic) && - xfs_dinode_good_version(mp, dip->di_version) && + xfs_dinode_good_version(&mp->m_sb, dip->di_version) && xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { @@ -172,8 +137,7 @@ xfs_imap_to_bp( struct xfs_imap *imap, struct xfs_dinode **dipp, struct xfs_buf **bpp, - uint buf_flags, - uint iget_flags) + uint buf_flags) { struct xfs_buf *bp; int error; @@ -183,48 +147,63 @@ xfs_imap_to_bp( (int)imap->im_len, buf_flags, &bp, &xfs_inode_buf_ops); if (error) { - if (error == -EAGAIN) { - ASSERT(buf_flags & XBF_TRYLOCK); - return error; - } - xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", - __func__, error); + ASSERT(error != -EAGAIN || (buf_flags & XBF_TRYLOCK)); return error; } *bpp = bp; - *dipp = xfs_buf_offset(bp, imap->im_boffset); + if (dipp) + *dipp = xfs_buf_offset(bp, imap->im_boffset); return 0; } -void +int xfs_inode_from_disk( struct xfs_inode *ip, struct xfs_dinode *from) { struct xfs_icdinode *to = &ip->i_d; struct inode *inode = VFS_I(ip); + int error; + xfs_failaddr_t fa; + + ASSERT(ip->i_cowfp == NULL); + ASSERT(ip->i_afp == NULL); + + fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from); + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", from, + sizeof(*from), fa); + return -EFSCORRUPTED; + } + /* + * First get the permanent information that is needed to allocate an + * inode. If the inode is unused, mode is zero and we shouldn't mess + * with the uninitialized part of it. + */ + to->di_flushiter = be16_to_cpu(from->di_flushiter); + inode->i_generation = be32_to_cpu(from->di_gen); + inode->i_mode = be16_to_cpu(from->di_mode); + if (!inode->i_mode) + return 0; /* * Convert v1 inodes immediately to v2 inode format as this is the * minimum inode version format we support in the rest of the code. + * They will also be unconditionally written back to disk as v2 inodes. 
*/ - to->di_version = from->di_version; - if (to->di_version == 1) { + if (unlikely(from->di_version == 1)) { set_nlink(inode, be16_to_cpu(from->di_onlink)); to->di_projid = 0; - to->di_version = 2; } else { set_nlink(inode, be32_to_cpu(from->di_nlink)); to->di_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | be16_to_cpu(from->di_projid_lo); } - to->di_format = from->di_format; - to->di_uid = be32_to_cpu(from->di_uid); - to->di_gid = be32_to_cpu(from->di_gid); - to->di_flushiter = be16_to_cpu(from->di_flushiter); + i_uid_write(inode, be32_to_cpu(from->di_uid)); + i_gid_write(inode, be32_to_cpu(from->di_gid)); /* * Time is signed, so need to convert to signed 32 bit before @@ -238,21 +217,16 @@ xfs_inode_from_disk( inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec); inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); - inode->i_generation = be32_to_cpu(from->di_gen); - inode->i_mode = be16_to_cpu(from->di_mode); to->di_size = be64_to_cpu(from->di_size); to->di_nblocks = be64_to_cpu(from->di_nblocks); to->di_extsize = be32_to_cpu(from->di_extsize); - to->di_nextents = be32_to_cpu(from->di_nextents); - to->di_anextents = be16_to_cpu(from->di_anextents); to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; to->di_dmevmask = be32_to_cpu(from->di_dmevmask); to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); - if (to->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec); @@ -260,6 +234,22 @@ xfs_inode_from_disk( to->di_flags2 = be64_to_cpu(from->di_flags2); to->di_cowextsize = be32_to_cpu(from->di_cowextsize); } + + error = xfs_iformat_data_fork(ip, from); + if (error) + return error; + if (from->di_forkoff) { + error = xfs_iformat_attr_fork(ip, from); + if (error) + goto out_destroy_data_fork; + } + if (xfs_is_reflink_inode(ip)) + xfs_ifork_init_cow(ip); + return 0; + +out_destroy_data_fork: + xfs_idestroy_fork(&ip->i_df); + return error; } void @@ -274,10 +264,9 @@ xfs_inode_to_disk( to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); to->di_onlink = 0; - to->di_version = from->di_version; - to->di_format = from->di_format; - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); + to->di_format = xfs_ifork_format(&ip->i_df); + to->di_uid = cpu_to_be32(i_uid_read(inode)); + to->di_gid = cpu_to_be32(i_gid_read(inode)); to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff); to->di_projid_hi = cpu_to_be16(from->di_projid >> 16); @@ -295,15 +284,16 @@ xfs_inode_to_disk( to->di_size = cpu_to_be64(from->di_size); to->di_nblocks = cpu_to_be64(from->di_nblocks); to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); + to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; + to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_dmevmask = cpu_to_be32(from->di_dmevmask); to->di_dmstate = cpu_to_be16(from->di_dmstate); to->di_flags = cpu_to_be16(from->di_flags); - if (from->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); 
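The net effect of the xfs_inode_from_disk() rework above is a much simpler read path: the caller maps the inode to a buffer, converts, and releases, with verification and fork setup handled inside the conversion. A sketch of the new contract; read_inode_sketch() is a hypothetical wrapper, the in-tree logic lives in the iget path:

static int
read_inode_sketch(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	struct xfs_dinode	*dip;
	struct xfs_buf		*bp;
	int			error;

	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0);
	if (error)
		return error;

	/* Verifies the dinode, then fills the core and both forks. */
	error = xfs_inode_from_disk(ip, dip);
	xfs_trans_brelse(tp, bp);
	return error;
}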
to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec); @@ -315,6 +305,7 @@ xfs_inode_to_disk( uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_flushiter = 0; } else { + to->di_version = 2; to->di_flushiter = cpu_to_be16(from->di_flushiter); } } @@ -417,7 +408,7 @@ xfs_dinode_verify_forkoff( struct xfs_dinode *dip, struct xfs_mount *mp) { - if (!XFS_DFORK_Q(dip)) + if (!dip->di_forkoff) return NULL; switch (dip->di_format) { @@ -428,7 +419,7 @@ xfs_dinode_verify_forkoff( case XFS_DINODE_FMT_LOCAL: /* fall through ... */ case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ case XFS_DINODE_FMT_BTREE: - if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3)) + if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3)) return __this_address; break; default: @@ -454,7 +445,7 @@ xfs_dinode_verify( /* Verify v3 integrity information first */ if (dip->di_version >= 3) { - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_sb_version_has_v3inode(&mp->m_sb)) return __this_address; if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) @@ -520,7 +511,7 @@ xfs_dinode_verify( return __this_address; } - if (XFS_DFORK_Q(dip)) { + if (dip->di_forkoff) { fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK); if (fa) return fa; @@ -597,125 +588,6 @@ xfs_dinode_calc_crc( } /* - * Read the disk inode attributes into the in-core inode structure. - * - * For version 5 superblocks, if we are initialising a new inode and we are not - * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new - * inode core with a random generation number. If we are keeping inodes around, - * we need to read the inode cluster to get the existing generation number off - * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode - * format) then log recovery is dependent on the di_flushiter field being - * initialised from the current on-disk value and hence we must also read the - * inode off disk. - */ -int -xfs_iread( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_inode_t *ip, - uint iget_flags) -{ - xfs_buf_t *bp; - xfs_dinode_t *dip; - xfs_failaddr_t fa; - int error; - - /* - * Fill in the location information in the in-core inode. - */ - error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); - if (error) - return error; - - /* shortcut IO on inode allocation if possible */ - if ((iget_flags & XFS_IGET_CREATE) && - xfs_sb_version_hascrc(&mp->m_sb) && - !(mp->m_flags & XFS_MOUNT_IKEEP)) { - VFS_I(ip)->i_generation = prandom_u32(); - ip->i_d.di_version = 3; - return 0; - } - - /* - * Get pointers to the on-disk inode and the buffer containing it. - */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); - if (error) - return error; - - /* even unallocated inodes are verified */ - fa = xfs_dinode_verify(mp, ip->i_ino, dip); - if (fa) { - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", dip, - sizeof(*dip), fa); - error = -EFSCORRUPTED; - goto out_brelse; - } - - /* - * If the on-disk inode is already linked to a directory - * entry, copy all of the inode into the in-core inode. - * xfs_iformat_fork() handles copying in the inode format - * specific information. - * Otherwise, just get the truly permanent information. 
- */ - if (dip->di_mode) { - xfs_inode_from_disk(ip, dip); - error = xfs_iformat_fork(ip, dip); - if (error) { -#ifdef DEBUG - xfs_alert(mp, "%s: xfs_iformat() returned error %d", - __func__, error); -#endif /* DEBUG */ - goto out_brelse; - } - } else { - /* - * Partial initialisation of the in-core inode. Just the bits - * that xfs_ialloc won't overwrite or relies on being correct. - */ - ip->i_d.di_version = dip->di_version; - VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen); - ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); - - /* - * Make sure to pull in the mode here as well in - * case the inode is released without being used. - * This ensures that xfs_inactive() will see that - * the inode is already free and not try to mess - * with the uninitialized part of it. - */ - VFS_I(ip)->i_mode = 0; - } - - ASSERT(ip->i_d.di_version >= 2); - ip->i_delayed_blks = 0; - - /* - * Mark the buffer containing the inode as something to keep - * around for a while. This helps to keep recently accessed - * meta-data in-core longer. - */ - xfs_buf_set_ref(bp, XFS_INO_REF); - - /* - * Use xfs_trans_brelse() to release the buffer containing the on-disk - * inode, because it was acquired with xfs_trans_read_buf() in - * xfs_imap_to_bp() above. If tp is NULL, this is just a normal - * brelse(). If we're within a transaction, then xfs_trans_brelse() - * will only release the buffer if it is not dirty within the - * transaction. It will be OK to release the buffer in this case, - * because inodes on disk are never destroyed and we will be locking the - * new in-core inode before putting it in the cache where other - * processes can find it. Thus we don't have to worry about the inode - * being changed just because we released the buffer. - */ - out_brelse: - xfs_trans_brelse(tp, bp); - return error; -} - -/* * Validate di_extsize hint. * * The rules are documented at xfs_ioctl_setattr_check_extsize(). diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index fd94b1078722..6b08b9d060c2 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -16,19 +16,12 @@ struct xfs_dinode; * format specific structures at the appropriate time. */ struct xfs_icdinode { - int8_t di_version; /* inode version */ - int8_t di_format; /* format of di_c data */ uint16_t di_flushiter; /* incremented on flush */ - uint32_t di_uid; /* owner's user id */ - uint32_t di_gid; /* owner's group id */ uint32_t di_projid; /* owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ - int8_t di_aformat; /* format of attr fork's data */ uint32_t di_dmevmask; /* DMIG event mask */ uint16_t di_dmstate; /* DMIG state info */ uint16_t di_flags; /* random flags, XFS_DIFLAG_... 
*/ @@ -51,24 +44,14 @@ struct xfs_imap { int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_imap *, struct xfs_dinode **, - struct xfs_buf **, uint, uint); -int xfs_iread(struct xfs_mount *, struct xfs_trans *, - struct xfs_inode *, uint); + struct xfs_buf **, uint); void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to, xfs_lsn_t lsn); -void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); +int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, struct xfs_dinode *to); -bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version); - -#if defined(DEBUG) -void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); -#else -#define xfs_inobp_check(mp, bp) -#endif /* DEBUG */ - xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index ad2b9c313fd2..0cf853d42d62 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -26,110 +26,6 @@ kmem_zone_t *xfs_ifork_zone; -STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); -STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); -STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); - -/* - * Copy inode type and data and attr format specific information from the - * on-disk inode to the in-core inode and fork structures. For fifos, devices, - * and sockets this means set i_rdev to the proper value. For files, - * directories, and symlinks this means to bring in the in-line data or extent - * pointers as well as the attribute fork. For a fork in B-tree format, only - * the root is immediately brought in-core. The rest will be read in later when - * first referenced (see xfs_iread_extents()). 
- */ -int -xfs_iformat_fork( - struct xfs_inode *ip, - struct xfs_dinode *dip) -{ - struct inode *inode = VFS_I(ip); - struct xfs_attr_shortform *atp; - int size; - int error = 0; - xfs_fsize_t di_size; - - switch (inode->i_mode & S_IFMT) { - case S_IFIFO: - case S_IFCHR: - case S_IFBLK: - case S_IFSOCK: - ip->i_d.di_size = 0; - inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip)); - break; - - case S_IFREG: - case S_IFLNK: - case S_IFDIR: - switch (dip->di_format) { - case XFS_DINODE_FMT_LOCAL: - di_size = be64_to_cpu(dip->di_size); - size = (int)di_size; - error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); - break; - default: - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, - dip, sizeof(*dip), __this_address); - return -EFSCORRUPTED; - } - break; - - default: - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, - sizeof(*dip), __this_address); - return -EFSCORRUPTED; - } - if (error) - return error; - - if (xfs_is_reflink_inode(ip)) { - ASSERT(ip->i_cowfp == NULL); - xfs_ifork_init_cow(ip); - } - - if (!XFS_DFORK_Q(dip)) - return 0; - - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); - - switch (dip->di_aformat) { - case XFS_DINODE_FMT_LOCAL: - atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); - size = be16_to_cpu(atp->hdr.totsize); - - error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); - break; - default: - xfs_inode_verifier_error(ip, error, __func__, dip, - sizeof(*dip), __this_address); - error = -EFSCORRUPTED; - break; - } - if (error) { - kmem_cache_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - if (ip->i_cowfp) - kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); - ip->i_cowfp = NULL; - xfs_idestroy_fork(ip, XFS_DATA_FORK); - } - return error; -} - void xfs_init_local_fork( struct xfs_inode *ip, @@ -183,7 +79,7 @@ xfs_iformat_local( */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %d).", + "corrupt inode %Lu (bad size %d for local fork, size = %zd).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); xfs_inode_verifier_error(ip, -EFSCORRUPTED, @@ -292,12 +188,11 @@ xfs_iformat_btree( * or the number of extents is greater than the number of * blocks. */ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || + if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) || nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || - XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) || + ifp->if_nextents > ip->i_d.di_nblocks) || level == 0 || level > XFS_BTREE_MAXLEVELS) { xfs_warn(mp, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); @@ -325,6 +220,110 @@ xfs_iformat_btree( return 0; } +int +xfs_iformat_data_fork( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct inode *inode = VFS_I(ip); + int error; + + /* + * Initialize the extent count early, as the per-format routines may + * depend on it. 
+ */ + ip->i_df.if_format = dip->di_format; + ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents); + + switch (inode->i_mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + ip->i_d.di_size = 0; + inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip)); + return 0; + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch (ip->i_df.if_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, + be64_to_cpu(dip->di_size)); + if (!error) + error = xfs_ifork_verify_local_data(ip); + return error; + case XFS_DINODE_FMT_EXTENTS: + return xfs_iformat_extents(ip, dip, XFS_DATA_FORK); + case XFS_DINODE_FMT_BTREE: + return xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + dip, sizeof(*dip), __this_address); + return -EFSCORRUPTED; + } + break; + default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); + return -EFSCORRUPTED; + } +} + +static uint16_t +xfs_dfork_attr_shortform_size( + struct xfs_dinode *dip) +{ + struct xfs_attr_shortform *atp = + (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip); + + return be16_to_cpu(atp->hdr.totsize); +} + +int +xfs_iformat_attr_fork( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + int error = 0; + + /* + * Initialize the extent count early, as the per-format routines may + * depend on it. + */ + ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL); + ip->i_afp->if_format = dip->di_aformat; + if (unlikely(ip->i_afp->if_format == 0)) /* pre IRIX 6.2 file system */ + ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS; + ip->i_afp->if_nextents = be16_to_cpu(dip->di_anextents); + + switch (ip->i_afp->if_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, + xfs_dfork_attr_shortform_size(dip)); + if (!error) + error = xfs_ifork_verify_local_attr(ip); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); + break; + default: + xfs_inode_verifier_error(ip, error, __func__, dip, + sizeof(*dip), __this_address); + error = -EFSCORRUPTED; + break; + } + + if (error) { + kmem_cache_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + } + return error; +} + /* * Reallocate the space for if_broot based on the number of records * being added or deleted as indicated in rec_diff. Move the records @@ -504,38 +503,24 @@ xfs_idata_realloc( void xfs_idestroy_fork( - xfs_inode_t *ip, - int whichfork) + struct xfs_ifork *ifp) { - struct xfs_ifork *ifp; - - ifp = XFS_IFORK_PTR(ip, whichfork); if (ifp->if_broot != NULL) { kmem_free(ifp->if_broot); ifp->if_broot = NULL; } /* - * If the format is local, then we can't have an extents - * array so just look for an inline data array. If we're - * not local then we may or may not have an extents list, - * so check and free it up if we do. + * If the format is local, then we can't have an extents array so just + * look for an inline data array. If we're not local then we may or may + * not have an extents list, so check and free it up if we do. 
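Note the companion change to teardown: xfs_idestroy_fork() now operates on the fork itself and no longer frees i_afp or i_cowfp, so owners of a separately allocated fork release the memory themselves. A sketch mirroring the teardown paths in this series; destroy_attr_fork() is a hypothetical helper:

static void
destroy_attr_fork(
	struct xfs_inode	*ip)
{
	if (!ip->i_afp)
		return;
	xfs_idestroy_fork(ip->i_afp);
	kmem_cache_free(xfs_ifork_zone, ip->i_afp);
	ip->i_afp = NULL;
}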
*/ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - if (ifp->if_u1.if_data != NULL) { - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; - } - } else if ((ifp->if_flags & XFS_IFEXTENTS) && ifp->if_height) { - xfs_iext_destroy(ifp); - } - - if (whichfork == XFS_ATTR_FORK) { - kmem_cache_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - } else if (whichfork == XFS_COW_FORK) { - kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); - ip->i_cowfp = NULL; + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = NULL; + } else if (ifp->if_flags & XFS_IFEXTENTS) { + if (ifp->if_height) + xfs_iext_destroy(ifp); } } @@ -592,7 +577,7 @@ void xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, - xfs_inode_log_item_t *iip, + struct xfs_inode_log_item *iip, int whichfork) { char *cp; @@ -618,7 +603,7 @@ xfs_iflush_fork( } cp = XFS_DFORK_PTR(dip, whichfork); mp = ip->i_mount; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { @@ -633,7 +618,7 @@ xfs_iflush_fork( !(iip->ili_fields & extflag[whichfork])); if ((iip->ili_fields & extflag[whichfork]) && (ifp->if_bytes > 0)) { - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); + ASSERT(ifp->if_nextents > 0); (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, whichfork); } @@ -688,51 +673,58 @@ xfs_ifork_init_cow( if (ip->i_cowfp) return; - ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, - KM_NOFS); + ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_zone, + GFP_NOFS | __GFP_NOFAIL); ip->i_cowfp->if_flags = XFS_IFEXTENTS; - ip->i_cformat = XFS_DINODE_FMT_EXTENTS; - ip->i_cnextents = 0; + ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS; } -/* Default fork content verifiers. */ -struct xfs_ifork_ops xfs_default_ifork_ops = { - .verify_attr = xfs_attr_shortform_verify, - .verify_dir = xfs_dir2_sf_verify, - .verify_symlink = xfs_symlink_shortform_verify, -}; - /* Verify the inline contents of the data fork of an inode. */ -xfs_failaddr_t -xfs_ifork_verify_data( - struct xfs_inode *ip, - struct xfs_ifork_ops *ops) +int +xfs_ifork_verify_local_data( + struct xfs_inode *ip) { - /* Non-local data fork, we're done. */ - if (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) - return NULL; + xfs_failaddr_t fa = NULL; - /* Check the inline data fork if there is one. */ switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFDIR: - return ops->verify_dir(ip); + fa = xfs_dir2_sf_verify(ip); + break; case S_IFLNK: - return ops->verify_symlink(ip); + fa = xfs_symlink_shortform_verify(ip); + break; default: - return NULL; + break; } + + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", + ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa); + return -EFSCORRUPTED; + } + + return 0; } /* Verify the inline contents of the attr fork of an inode. */ -xfs_failaddr_t -xfs_ifork_verify_attr( - struct xfs_inode *ip, - struct xfs_ifork_ops *ops) +int +xfs_ifork_verify_local_attr( + struct xfs_inode *ip) { - /* There has to be an attr fork allocated if aformat is local. */ - if (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) - return NULL; - if (!XFS_IFORK_PTR(ip, XFS_ATTR_FORK)) - return __this_address; - return ops->verify_attr(ip); + struct xfs_ifork *ifp = ip->i_afp; + xfs_failaddr_t fa; + + if (!ifp) + fa = __this_address; + else + fa = xfs_attr_shortform_verify(ip); + + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", + ifp ? ifp->if_u1.if_data : NULL, + ifp ? 
ifp->if_bytes : 0, fa); + return -EFSCORRUPTED; + } + + return 0; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 500333d0101e..a4953e95c4f3 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -23,6 +23,8 @@ struct xfs_ifork { } if_u1; short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ + int8_t if_format; /* format of this fork */ + xfs_extnum_t if_nextents; /* # of extents in this fork */ }; /* @@ -46,57 +48,45 @@ struct xfs_ifork { (ip)->i_afp : \ (ip)->i_cowfp)) #define XFS_IFORK_DSIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_IFORK_BOFF(ip) : \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) + (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount)) #define XFS_IFORK_ASIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ - XFS_IFORK_BOFF(ip) : \ - 0) + (XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0) #define XFS_IFORK_SIZE(ip,w) \ ((w) == XFS_DATA_FORK ? \ XFS_IFORK_DSIZE(ip) : \ ((w) == XFS_ATTR_FORK ? \ XFS_IFORK_ASIZE(ip) : \ 0)) -#define XFS_IFORK_FORMAT(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_format : \ - ((w) == XFS_ATTR_FORK ? \ - (ip)->i_d.di_aformat : \ - (ip)->i_cformat)) -#define XFS_IFORK_FMT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_format = (n)) : \ - ((w) == XFS_ATTR_FORK ? \ - ((ip)->i_d.di_aformat = (n)) : \ - ((ip)->i_cformat = (n)))) -#define XFS_IFORK_NEXTENTS(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_nextents : \ - ((w) == XFS_ATTR_FORK ? \ - (ip)->i_d.di_anextents : \ - (ip)->i_cnextents)) -#define XFS_IFORK_NEXT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_nextents = (n)) : \ - ((w) == XFS_ATTR_FORK ? 
\ - ((ip)->i_d.di_anextents = (n)) : \ - ((ip)->i_cnextents = (n)))) #define XFS_IFORK_MAXEXT(ip, w) \ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) -#define xfs_ifork_has_extents(ip, w) \ - (XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_EXTENTS || \ - XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_BTREE) +static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp) +{ + return ifp->if_format == XFS_DINODE_FMT_EXTENTS || + ifp->if_format == XFS_DINODE_FMT_BTREE; +} + +static inline xfs_extnum_t xfs_ifork_nextents(struct xfs_ifork *ifp) +{ + if (!ifp) + return 0; + return ifp->if_nextents; +} + +static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp) +{ + if (!ifp) + return XFS_DINODE_FMT_EXTENTS; + return ifp->if_format; +} struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); -int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); +int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *); +int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); -void xfs_idestroy_fork(struct xfs_inode *, int); +void xfs_idestroy_fork(struct xfs_ifork *ifp); void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, int whichfork); void xfs_iroot_realloc(struct xfs_inode *, int, int); @@ -180,18 +170,7 @@ extern struct kmem_zone *xfs_ifork_zone; extern void xfs_ifork_init_cow(struct xfs_inode *ip); -typedef xfs_failaddr_t (*xfs_ifork_verifier_t)(struct xfs_inode *); - -struct xfs_ifork_ops { - xfs_ifork_verifier_t verify_symlink; - xfs_ifork_verifier_t verify_dir; - xfs_ifork_verifier_t verify_attr; -}; -extern struct xfs_ifork_ops xfs_default_ifork_ops; - -xfs_failaddr_t xfs_ifork_verify_data(struct xfs_inode *ip, - struct xfs_ifork_ops *ops); -xfs_failaddr_t xfs_ifork_verify_attr(struct xfs_inode *ip, - struct xfs_ifork_ops *ops); +int xfs_ifork_verify_local_data(struct xfs_inode *ip); +int xfs_ifork_verify_local_attr(struct xfs_inode *ip); #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 9bac0d2e56dc..e3400c9c71cd 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -424,12 +424,10 @@ struct xfs_log_dinode { /* structure must be padded to 64 bit alignment */ }; -static inline uint xfs_log_dinode_size(int version) -{ - if (version == 3) - return sizeof(struct xfs_log_dinode); - return offsetof(struct xfs_log_dinode, di_next_unlinked); -} +#define xfs_log_dinode_size(mp) \ + (xfs_sb_version_has_v3inode(&(mp)->m_sb) ? \ + sizeof(struct xfs_log_dinode) : \ + offsetof(struct xfs_log_dinode, di_next_unlinked)) /* * Buffer Log Format definitions diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 3bf671637a91..641132d0e39d 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -7,6 +7,73 @@ #define __XFS_LOG_RECOVER_H__ /* + * Each log item type (XFS_LI_*) gets its own xlog_recover_item_ops to + * define how recovery should work for that type of log item. + */ +struct xlog_recover_item; + +/* Sorting hat for log items as they're read in. */ +enum xlog_recover_reorder { + XLOG_REORDER_BUFFER_LIST, + XLOG_REORDER_ITEM_LIST, + XLOG_REORDER_INODE_BUFFER_LIST, + XLOG_REORDER_CANCEL_LIST, +}; + +struct xlog_recover_item_ops { + uint16_t item_type; /* XFS_LI_* type code. */ + + /* + * Help sort recovered log items into the order required to replay them + * correctly. 
Log item types that always use XLOG_REORDER_ITEM_LIST do + * not have to supply a function here. See the comment preceding + * xlog_recover_reorder_trans for more details about what the return + * values mean. + */ + enum xlog_recover_reorder (*reorder)(struct xlog_recover_item *item); + + /* Start readahead for pass2, if provided. */ + void (*ra_pass2)(struct xlog *log, struct xlog_recover_item *item); + + /* Do whatever work we need to do for pass1, if provided. */ + int (*commit_pass1)(struct xlog *log, struct xlog_recover_item *item); + + /* + * This function should do whatever work is needed for pass2 of log + * recovery, if provided. + * + * If the recovered item is an intent item, this function should parse + * the recovered item to construct an in-core log intent item and + * insert it into the AIL. The in-core log intent item should have 1 + * refcount so that the item is freed either (a) when we commit the + * recovered log item for the intent-done item; (b) replay the work and + * log a new intent-done item; or (c) recovery fails and we have to + * abort. + * + * If the recovered item is an intent-done item, this function should + * parse the recovered item to find the id of the corresponding intent + * log item. Next, it should find the in-core log intent item in the + * AIL and release it. + */ + int (*commit_pass2)(struct xlog *log, struct list_head *buffer_list, + struct xlog_recover_item *item, xfs_lsn_t lsn); +}; + +extern const struct xlog_recover_item_ops xlog_icreate_item_ops; +extern const struct xlog_recover_item_ops xlog_buf_item_ops; +extern const struct xlog_recover_item_ops xlog_inode_item_ops; +extern const struct xlog_recover_item_ops xlog_dquot_item_ops; +extern const struct xlog_recover_item_ops xlog_quotaoff_item_ops; +extern const struct xlog_recover_item_ops xlog_bui_item_ops; +extern const struct xlog_recover_item_ops xlog_bud_item_ops; +extern const struct xlog_recover_item_ops xlog_efi_item_ops; +extern const struct xlog_recover_item_ops xlog_efd_item_ops; +extern const struct xlog_recover_item_ops xlog_rui_item_ops; +extern const struct xlog_recover_item_ops xlog_rud_item_ops; +extern const struct xlog_recover_item_ops xlog_cui_item_ops; +extern const struct xlog_recover_item_ops xlog_cud_item_ops; + +/* * Macros, structures, prototypes for internal log manager use. */ @@ -22,13 +89,13 @@ /* * item headers are in ri_buf[0]. Additional buffers follow. 
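To make the dispatch table above concrete: each recoverable log item type exports one of these vectors, and recovery dispatches through ri_ops. A hypothetical example, with XFS_LI_FOO and the foo callbacks as placeholders (compare the real xlog_efi_item_ops declared above):

static enum xlog_recover_reorder
xlog_foo_reorder(
	struct xlog_recover_item	*item)
{
	/* Replay these ahead of ordinary buffers. */
	return XLOG_REORDER_INODE_BUFFER_LIST;
}

static int
xlog_foo_commit_pass2(
	struct xlog			*log,
	struct list_head		*buffer_list,
	struct xlog_recover_item	*item,
	xfs_lsn_t			lsn)
{
	/* Parse item->ri_buf[0] and queue the replay work here. */
	return 0;
}

const struct xlog_recover_item_ops xlog_foo_item_ops = {
	.item_type	= XFS_LI_FOO,	/* placeholder type code */
	.reorder	= xlog_foo_reorder,
	.commit_pass2	= xlog_foo_commit_pass2,
};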
*/ -typedef struct xlog_recover_item { +struct xlog_recover_item { struct list_head ri_list; - int ri_type; int ri_cnt; /* count of regions found */ int ri_total; /* total regions */ - xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ -} xlog_recover_item_t; + struct xfs_log_iovec *ri_buf; /* ptr to regions buffer */ + const struct xlog_recover_item_ops *ri_ops; +}; struct xlog_recover { struct hlist_node r_list; @@ -51,4 +118,12 @@ struct xlog_recover { #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 +void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len, + const struct xfs_buf_ops *ops); +bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len); +void xlog_recover_iodone(struct xfs_buf *bp); + +void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, + uint64_t intent_id); + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index b2113b17e53c..076bdc7037ee 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -18,23 +18,22 @@ typedef uint64_t xfs_qcnt_t; typedef uint16_t xfs_qwarncnt_t; +typedef uint8_t xfs_dqtype_t; + +#define XFS_DQTYPE_STRINGS \ + { XFS_DQTYPE_USER, "USER" }, \ + { XFS_DQTYPE_PROJ, "PROJ" }, \ + { XFS_DQTYPE_GROUP, "GROUP" } + /* * flags for q_flags field in the dquot. */ -#define XFS_DQ_USER 0x0001 /* a user quota */ -#define XFS_DQ_PROJ 0x0002 /* project quota */ -#define XFS_DQ_GROUP 0x0004 /* a group quota */ -#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ -#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */ - -#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) +#define XFS_DQFLAG_DIRTY (1 << 0) /* dquot is dirty */ +#define XFS_DQFLAG_FREEING (1 << 1) /* dquot is being torn down */ -#define XFS_DQ_FLAGS \ - { XFS_DQ_USER, "USER" }, \ - { XFS_DQ_PROJ, "PROJ" }, \ - { XFS_DQ_GROUP, "GROUP" }, \ - { XFS_DQ_DIRTY, "DIRTY" }, \ - { XFS_DQ_FREEING, "FREEING" } +#define XFS_DQFLAG_STRINGS \ + { XFS_DQFLAG_DIRTY, "DIRTY" }, \ + { XFS_DQFLAG_FREEING, "FREEING" } /* * We have the possibility of all three quota types being active at once, and @@ -100,7 +99,6 @@ typedef uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ -#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be @@ -138,11 +136,11 @@ typedef uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, - struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type); + struct xfs_disk_dquot *ddq, xfs_dqid_t id); extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, - struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); + struct xfs_dqblk *dqb, xfs_dqid_t id); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, - xfs_dqid_t id, uint type); + xfs_dqid_t id, xfs_dqtype_t type); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 6e1665f2cb67..2076627243b0 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -46,7 +46,7 @@ xfs_refcount_lookup_le( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, 
cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -63,7 +63,7 @@ xfs_refcount_lookup_ge( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -80,7 +80,7 @@ xfs_refcount_lookup_eq( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -108,7 +108,7 @@ xfs_refcount_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; xfs_agblock_t realstart; @@ -119,7 +119,7 @@ xfs_refcount_get_rec( xfs_refcount_btrec_to_irec(rec, irec); - agno = cur->bc_private.a.agno; + agno = cur->bc_ag.agno; if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) goto out_bad_rec; @@ -144,7 +144,7 @@ xfs_refcount_get_rec( if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) goto out_bad_rec; - trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.agno, irec); return 0; out_bad_rec: @@ -169,14 +169,14 @@ xfs_refcount_update( union xfs_btree_rec rec; int error; - trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.agno, irec); rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); error = xfs_btree_update(cur, &rec); if (error) trace_xfs_refcount_update_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -193,7 +193,7 @@ xfs_refcount_insert( { int error; - trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.agno, irec); cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; @@ -208,7 +208,7 @@ xfs_refcount_insert( out_error: if (error) trace_xfs_refcount_insert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -234,7 +234,7 @@ xfs_refcount_delete( error = -EFSCORRUPTED; goto out_error; } - trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec); + trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.agno, &irec); error = xfs_btree_delete(cur, i); if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { error = -EFSCORRUPTED; @@ -246,7 +246,7 @@ xfs_refcount_delete( out_error: if (error) trace_xfs_refcount_delete_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -366,7 +366,7 @@ xfs_refcount_split_extent( return 0; *shape_changed = true; - trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.agno, &rcext, agbno); /* Establish the right extent. 
*/ @@ -391,7 +391,7 @@ xfs_refcount_split_extent( out_error: trace_xfs_refcount_split_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -411,7 +411,7 @@ xfs_refcount_merge_center_extents( int found_rec; trace_xfs_refcount_merge_center_extents(cur->bc_mp, - cur->bc_private.a.agno, left, center, right); + cur->bc_ag.agno, left, center, right); /* * Make sure the center and right extents are not in the btree. @@ -468,7 +468,7 @@ xfs_refcount_merge_center_extents( out_error: trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -487,7 +487,7 @@ xfs_refcount_merge_left_extent( int found_rec; trace_xfs_refcount_merge_left_extent(cur->bc_mp, - cur->bc_private.a.agno, left, cleft); + cur->bc_ag.agno, left, cleft); /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ if (cleft->rc_refcount > 1) { @@ -530,7 +530,7 @@ xfs_refcount_merge_left_extent( out_error: trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -548,7 +548,7 @@ xfs_refcount_merge_right_extent( int found_rec; trace_xfs_refcount_merge_right_extent(cur->bc_mp, - cur->bc_private.a.agno, cright, right); + cur->bc_ag.agno, cright, right); /* * If the extent ending at agbno+aglen (cright) wasn't synthesized, @@ -594,7 +594,7 @@ xfs_refcount_merge_right_extent( out_error: trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -679,13 +679,13 @@ xfs_refcount_find_left_extents( cleft->rc_blockcount = aglen; cleft->rc_refcount = 1; } - trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.agno, left, cleft, agbno); return error; out_error: trace_xfs_refcount_find_left_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -768,13 +768,13 @@ xfs_refcount_find_right_extents( cright->rc_blockcount = aglen; cright->rc_refcount = 1; } - trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.agno, cright, right, agbno + aglen); return error; out_error: trace_xfs_refcount_find_right_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -883,7 +883,7 @@ xfs_refcount_still_have_space( { unsigned long overhead; - overhead = cur->bc_private.a.priv.refc.shape_changes * + overhead = cur->bc_ag.refc.shape_changes * xfs_allocfree_log_count(cur->bc_mp, 1); overhead *= cur->bc_mp->m_sb.sb_blocksize; @@ -891,17 +891,17 @@ xfs_refcount_still_have_space( * Only allow 2 refcount extent updates per transaction if the * refcount continue update "error" has been injected. 
*/ - if (cur->bc_private.a.priv.refc.nr_ops > 2 && + if (cur->bc_ag.refc.nr_ops > 2 && XFS_TEST_ERROR(false, cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; - if (cur->bc_private.a.priv.refc.nr_ops == 0) + if (cur->bc_ag.refc.nr_ops == 0) return true; else if (overhead > cur->bc_tp->t_log_res) return false; return cur->bc_tp->t_log_res - overhead > - cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; + cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } /* @@ -952,7 +952,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock - *agbno); tmp.rc_refcount = 1 + adj; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &tmp); + cur->bc_ag.agno, &tmp); /* * Either cover the hole (increment) or @@ -968,10 +968,10 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_private.a.priv.refc.nr_ops++; + cur->bc_ag.refc.nr_ops++; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_private.a.agno, + cur->bc_ag.agno, tmp.rc_startblock); xfs_bmap_add_free(cur->bc_tp, fsbno, tmp.rc_blockcount, oinfo); @@ -998,12 +998,12 @@ xfs_refcount_adjust_extents( goto skip; ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &ext); + cur->bc_ag.agno, &ext); if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) goto out_error; - cur->bc_private.a.priv.refc.nr_ops++; + cur->bc_ag.refc.nr_ops++; } else if (ext.rc_refcount == 1) { error = xfs_refcount_delete(cur, &found_rec); if (error) @@ -1012,11 +1012,11 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_private.a.priv.refc.nr_ops++; + cur->bc_ag.refc.nr_ops++; goto advloop; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_private.a.agno, + cur->bc_ag.agno, ext.rc_startblock); xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount, oinfo); @@ -1035,7 +1035,7 @@ advloop: return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1057,10 +1057,10 @@ xfs_refcount_adjust( *new_agbno = agbno; *new_aglen = aglen; if (adj == XFS_REFCOUNT_ADJUST_INCREASE) - trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.agno, agbno, aglen); else - trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.agno, agbno, aglen); /* @@ -1088,7 +1088,7 @@ xfs_refcount_adjust( if (shape_changed) shape_changes++; if (shape_changes) - cur->bc_private.a.priv.refc.shape_changes++; + cur->bc_ag.refc.shape_changes++; /* Now that we've taken care of the ends, adjust the middle extents */ error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, @@ -1099,7 +1099,7 @@ xfs_refcount_adjust( return 0; out_error: - trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1115,7 +1115,7 @@ xfs_refcount_finish_one_cleanup( if (rcur == NULL) return; - agbp = rcur->bc_private.a.agbp; + agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); if (error) xfs_trans_brelse(tp, agbp); @@ -1165,9 +1165,9 @@ xfs_refcount_finish_one( * the startblock, get one now. 
*/ rcur = *pcur; - if (rcur != NULL && rcur->bc_private.a.agno != agno) { - nr_ops = rcur->bc_private.a.priv.refc.nr_ops; - shape_changes = rcur->bc_private.a.priv.refc.shape_changes; + if (rcur != NULL && rcur->bc_ag.agno != agno) { + nr_ops = rcur->bc_ag.refc.nr_ops; + shape_changes = rcur->bc_ag.refc.shape_changes; xfs_refcount_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -1183,8 +1183,8 @@ xfs_refcount_finish_one( error = -ENOMEM; goto out_cur; } - rcur->bc_private.a.priv.refc.nr_ops = nr_ops; - rcur->bc_private.a.priv.refc.shape_changes = shape_changes; + rcur->bc_ag.refc.nr_ops = nr_ops; + rcur->bc_ag.refc.shape_changes = shape_changes; } *pcur = rcur; @@ -1303,7 +1303,7 @@ xfs_refcount_find_shared( int have; int error; - trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.agno, agbno, aglen); /* By default, skip the whole range */ @@ -1383,12 +1383,12 @@ xfs_refcount_find_shared( done: trace_xfs_refcount_find_shared_result(cur->bc_mp, - cur->bc_private.a.agno, *fbno, *flen); + cur->bc_ag.agno, *fbno, *flen); out_error: if (error) trace_xfs_refcount_find_shared_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1485,7 +1485,7 @@ xfs_refcount_adjust_cow_extents( tmp.rc_blockcount = aglen; tmp.rc_refcount = 1; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &tmp); + cur->bc_ag.agno, &tmp); error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -1513,7 +1513,7 @@ xfs_refcount_adjust_cow_extents( ext.rc_refcount = 0; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &ext); + cur->bc_ag.agno, &ext); error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; @@ -1529,7 +1529,7 @@ xfs_refcount_adjust_cow_extents( return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1575,7 +1575,7 @@ xfs_refcount_adjust_cow( return 0; out_error: - trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1589,7 +1589,7 @@ __xfs_refcount_cow_alloc( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, + trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.agno, agbno, aglen); /* Add refcount btree reservation */ @@ -1606,7 +1606,7 @@ __xfs_refcount_cow_free( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, + trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.agno, agbno, aglen); /* Remove refcount btree reservation */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 38529dbacd55..a6ac60ae9421 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -12,6 +12,7 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_refcount_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" @@ -25,7 +26,7 @@ xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno); + cur->bc_ag.agbp, cur->bc_ag.agno); } STATIC void @@ -34,17 +35,15 @@ xfs_refcountbt_set_root( union xfs_btree_ptr *ptr, int inc) { - 
struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); - struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_refcount_root = ptr->s; be32_add_cpu(&agf->agf_refcount_level, inc); pag->pagf_refcount_level += inc; - xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); @@ -57,8 +56,8 @@ xfs_refcountbt_alloc_block( union xfs_btree_ptr *new, int *stat) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; struct xfs_alloc_arg args; /* block allocation args */ int error; /* error return value */ @@ -66,7 +65,7 @@ xfs_refcountbt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, xfs_refc_block(args.mp)); args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; @@ -75,13 +74,13 @@ xfs_refcountbt_alloc_block( error = xfs_alloc_vextent(&args); if (error) goto out_error; - trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.agno, args.agbno, 1); if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; } - ASSERT(args.agno == cur->bc_private.a.agno); + ASSERT(args.agno == cur->bc_ag.agno); ASSERT(args.len == 1); new->s = cpu_to_be32(args.agbno); @@ -101,12 +100,12 @@ xfs_refcountbt_free_block( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); int error; - trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.agno, XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); @@ -169,9 +168,9 @@ xfs_refcountbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_refcount_root; } @@ -311,42 +310,91 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = { }; /* - * Allocate a new refcount btree cursor. + * Initialize a new refcount btree cursor. 
*/ -struct xfs_btree_cur * -xfs_refcountbt_init_cursor( +static struct xfs_btree_cur * +xfs_refcountbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_buf *agbp, xfs_agnumber_t agno) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); struct xfs_btree_cur *cur; ASSERT(agno != NULLAGNUMBER); ASSERT(agno < mp->m_sb.sb_agcount); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = XFS_BTNUM_REFC; cur->bc_blocklog = mp->m_sb.sb_blocklog; - cur->bc_ops = &xfs_refcountbt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); - cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); - - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; + cur->bc_ag.agno = agno; cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.a.priv.refc.nr_ops = 0; - cur->bc_private.a.priv.refc.shape_changes = 0; + cur->bc_ag.refc.nr_ops = 0; + cur->bc_ag.refc.shape_changes = 0; + cur->bc_ops = &xfs_refcountbt_ops; + return cur; +} + +/* Create a btree cursor. */ +struct xfs_btree_cur * +xfs_refcountbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_btree_cur *cur; + cur = xfs_refcountbt_init_common(mp, tp, agno); + cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + cur->bc_ag.agbp = agbp; + return cur; +} + +/* Create a btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_refcountbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno) +{ + struct xfs_btree_cur *cur; + + cur = xfs_refcountbt_init_common(mp, NULL, agno); + xfs_btree_stage_afakeroot(cur, afake); return cur; } /* + * Swap in the new btree root. Once we pass this point the newly rebuilt btree + * is in place and we have to kill off all the old btree blocks. + */ +void +xfs_refcountbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_refcount_root = cpu_to_be32(afake->af_root); + agf->agf_refcount_level = cpu_to_be32(afake->af_levels); + agf->agf_refcount_blocks = cpu_to_be32(afake->af_blocks); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS | + XFS_AGF_REFCOUNT_ROOT | + XFS_AGF_REFCOUNT_LEVEL); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops); +} + +/* * Calculate the number of records in a refcount btree block. 
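The commit-staged-btree step above boils down to copying the fake root's geometry into the on-disk AGF in one shot. Here is a hedged userspace sketch of just that handoff; the struct layouts are simplified stand-ins, and htonl() stands in for cpu_to_be32():

    /*
     * Userspace sketch of the "fake root" handoff performed by
     * xfs_refcountbt_commit_staged_btree() above: the staged btree
     * geometry is copied into the big-endian AGF fields in one step.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>  /* htonl()/ntohl() as cpu_to_be32() stand-ins */

    struct afakeroot {          /* models struct xbtree_afakeroot */
        uint32_t af_root;
        uint32_t af_levels;
        uint32_t af_blocks;
    };

    struct agf_refcount {       /* the three AGF refcountbt fields */
        uint32_t agf_refcount_root;     /* big-endian on disk */
        uint32_t agf_refcount_level;
        uint32_t agf_refcount_blocks;
    };

    static void commit_staged(struct agf_refcount *agf,
                              const struct afakeroot *afake)
    {
        agf->agf_refcount_root = htonl(afake->af_root);
        agf->agf_refcount_level = htonl(afake->af_levels);
        agf->agf_refcount_blocks = htonl(afake->af_blocks);
        /* the kernel then logs all three fields and swaps in the
         * real btree ops table via xfs_btree_commit_afakeroot() */
    }

    int main(void)
    {
        struct afakeroot af = { .af_root = 57, .af_levels = 2, .af_blocks = 9 };
        struct agf_refcount agf = { 0 };

        commit_staged(&agf, &af);
        printf("root %u level %u blocks %u\n",
               ntohl(agf.agf_refcount_root),
               ntohl(agf.agf_refcount_level),
               ntohl(agf.agf_refcount_blocks));
        return 0;
    }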
*/ int @@ -420,7 +468,7 @@ xfs_refcountbt_calc_reserves( if (error) return error; - agf = XFS_BUF_TO_AGF(agbp); + agf = agbp->b_addr; agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_refcount_blocks); xfs_trans_brelse(tp, agbp); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index ba416f71c824..69dc515db671 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -13,6 +13,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xbtree_afakeroot; /* * Btree block header size @@ -46,6 +47,8 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno); +struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno); extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); @@ -58,4 +61,7 @@ extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); +void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + #endif /* __XFS_REFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index ff9412f113c4..27c39268c31f 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -79,7 +79,7 @@ xfs_rmap_update( union xfs_btree_rec rec; int error; - trace_xfs_rmap_update(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); @@ -91,7 +91,7 @@ xfs_rmap_update( error = xfs_btree_update(cur, &rec); if (error) trace_xfs_rmap_update_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -107,7 +107,7 @@ xfs_rmap_insert( int i; int error; - trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.agno, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); @@ -133,7 +133,7 @@ xfs_rmap_insert( done: if (error) trace_xfs_rmap_insert_error(rcur->bc_mp, - rcur->bc_private.a.agno, error, _RET_IP_); + rcur->bc_ag.agno, error, _RET_IP_); return error; } @@ -149,7 +149,7 @@ xfs_rmap_delete( int i; int error; - trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.agno, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); @@ -170,7 +170,7 @@ xfs_rmap_delete( done: if (error) trace_xfs_rmap_delete_error(rcur->bc_mp, - rcur->bc_private.a.agno, error, _RET_IP_); + rcur->bc_ag.agno, error, _RET_IP_); return error; } @@ -197,7 +197,7 @@ xfs_rmap_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; @@ -260,7 +260,7 @@ xfs_rmap_find_left_neighbor_helper( struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, - cur->bc_private.a.agno, rec->rm_startblock, + cur->bc_ag.agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -312,7 +312,7 @@ xfs_rmap_find_left_neighbor( info.stat = 
stat; trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, - cur->bc_private.a.agno, bno, 0, owner, offset, flags); + cur->bc_ag.agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_find_left_neighbor_helper, &info); @@ -320,7 +320,7 @@ xfs_rmap_find_left_neighbor( error = 0; if (*stat) trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, irec->rm_startblock, + cur->bc_ag.agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return error; @@ -336,7 +336,7 @@ xfs_rmap_lookup_le_range_helper( struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, - cur->bc_private.a.agno, rec->rm_startblock, + cur->bc_ag.agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -385,14 +385,14 @@ xfs_rmap_lookup_le_range( info.stat = stat; trace_xfs_rmap_lookup_le_range(cur->bc_mp, - cur->bc_private.a.agno, bno, 0, owner, offset, flags); + cur->bc_ag.agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_lookup_le_range_helper, &info); if (error == -ECANCELED) error = 0; if (*stat) trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, irec->rm_startblock, + cur->bc_ag.agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return error; @@ -498,7 +498,7 @@ xfs_rmap_unmap( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -522,7 +522,7 @@ xfs_rmap_unmap( goto out_error; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, ltrec.rm_startblock, + cur->bc_ag.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); ltoff = ltrec.rm_offset; @@ -588,7 +588,7 @@ xfs_rmap_unmap( if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -666,7 +666,7 @@ xfs_rmap_unmap( else cur->bc_rec.r.rm_offset = offset + len; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, cur->bc_rec.r.rm_startblock, cur->bc_rec.r.rm_blockcount, cur->bc_rec.r.rm_owner, @@ -678,11 +678,11 @@ xfs_rmap_unmap( } out_done: - trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_unmap_error(mp, cur->bc_private.a.agno, + trace_xfs_rmap_unmap_error(mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -773,7 +773,7 @@ xfs_rmap_map( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); @@ -795,7 +795,7 @@ xfs_rmap_map( goto out_error; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, ltrec.rm_startblock, + cur->bc_ag.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -831,7 +831,7 
@@ xfs_rmap_map( goto out_error; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, gtrec.rm_startblock, + cur->bc_ag.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); if (!xfs_rmap_is_mergeable(&gtrec, owner, flags)) @@ -870,7 +870,7 @@ xfs_rmap_map( * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| */ ltrec.rm_blockcount += gtrec.rm_blockcount; - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, @@ -921,7 +921,7 @@ xfs_rmap_map( cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, flags); error = xfs_btree_insert(cur, &i); if (error) @@ -932,11 +932,11 @@ xfs_rmap_map( } } - trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_map_error(mp, cur->bc_private.a.agno, + trace_xfs_rmap_map_error(mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1010,7 +1010,7 @@ xfs_rmap_convert( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -1034,7 +1034,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, PREV.rm_startblock, + cur->bc_ag.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1076,7 +1076,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, LEFT.rm_startblock, + cur->bc_ag.agno, LEFT.rm_startblock, LEFT.rm_blockcount, LEFT.rm_owner, LEFT.rm_offset, LEFT.rm_flags); if (LEFT.rm_startblock + LEFT.rm_blockcount == bno && @@ -1114,7 +1114,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, RIGHT.rm_startblock, + cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (bno + len == RIGHT.rm_startblock && @@ -1132,7 +1132,7 @@ xfs_rmap_convert( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state, _RET_IP_); /* reset the cursor back to PREV */ @@ -1162,7 +1162,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); @@ -1180,7 +1180,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1210,7 +1210,7 @@ xfs_rmap_convert( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not.
*/ - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1247,7 +1247,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); @@ -1326,7 +1326,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1383,7 +1383,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1414,7 +1414,7 @@ xfs_rmap_convert( NEW = PREV; NEW.rm_blockcount = offset - PREV.rm_offset; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, NEW.rm_startblock, NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, NEW.rm_flags); @@ -1441,7 +1441,7 @@ xfs_rmap_convert( /* new middle extent - newext */ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN; cur->bc_rec.r.rm_flags |= newext; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1465,12 +1465,12 @@ xfs_rmap_convert( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1506,7 +1506,7 @@ xfs_rmap_convert_shared( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -1573,7 +1573,7 @@ xfs_rmap_convert_shared( goto done; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, RIGHT.rm_startblock, + cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) @@ -1589,7 +1589,7 @@ xfs_rmap_convert_shared( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state, _RET_IP_); /* * Switch out based on the FILLING and CONTIG state bits. 
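As background for the CONTIG state bits that xfs_rmap_convert() switches on above: a neighbor only stays "contiguous" if both the block range and the file offset line up and the merged length would not overflow. A hedged toy version of the left-neighbor test follows; the record layout and the length cap are simplified stand-ins, and the owner/flags check done by xfs_rmap_is_mergeable() in the kernel is omitted here:

    /*
     * Toy version of the left-neighbor contiguity test used when
     * computing RMAP_LEFT_CONTIG in xfs_rmap_convert() above.
     */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define LEN_MAX ((uint32_t)~0U) /* stand-in for XFS_RMAP_LEN_MAX */

    struct rmap_rec {
        uint64_t startblock;
        uint64_t blockcount;
        uint64_t offset;
    };

    static bool left_contig(const struct rmap_rec *left,
                            uint64_t bno, uint64_t offset, uint64_t len)
    {
        if (left->startblock + left->blockcount != bno)
            return false;           /* blocks not adjacent */
        if (left->offset + left->blockcount != offset)
            return false;           /* file offsets not adjacent */
        return left->blockcount + len <= LEN_MAX;  /* no overflow */
    }

    int main(void)
    {
        struct rmap_rec left = {
            .startblock = 100, .blockcount = 8, .offset = 0,
        };

        printf("left neighbor contiguous: %s\n",
               left_contig(&left, 108, 8, 4) ? "yes" : "no");
        return 0;
    }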
@@ -1880,12 +1880,12 @@ xfs_rmap_convert_shared( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1923,7 +1923,7 @@ xfs_rmap_unmap_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -2072,12 +2072,12 @@ xfs_rmap_unmap_shared( goto out_error; } - trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_unmap_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -2112,7 +2112,7 @@ xfs_rmap_map_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* Is there a left record that abuts our range? */ @@ -2138,7 +2138,7 @@ xfs_rmap_map_shared( goto out_error; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, gtrec.rm_startblock, + cur->bc_ag.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); @@ -2231,12 +2231,12 @@ xfs_rmap_map_shared( goto out_error; } - trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_map_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -2336,7 +2336,7 @@ xfs_rmap_finish_one_cleanup( if (rcur == NULL) return; - agbp = rcur->bc_private.a.agbp; + agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); if (error) xfs_trans_brelse(tp, agbp); @@ -2386,7 +2386,7 @@ xfs_rmap_finish_one( * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_private.a.agno != agno) { + if (rcur != NULL && rcur->bc_ag.agno != agno) { xfs_rmap_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -2694,7 +2694,6 @@ struct xfs_rmap_key_state { uint64_t owner; uint64_t offset; unsigned int flags; - bool has_rmap; }; /* For each rmap given, figure out if it doesn't match the key we want. 
*/ @@ -2709,7 +2708,6 @@ xfs_rmap_has_other_keys_helper( if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset && ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) return 0; - rks->has_rmap = true; return -ECANCELED; } @@ -2731,7 +2729,7 @@ xfs_rmap_has_other_keys( int error; xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags); - rks.has_rmap = false; + *has_rmap = false; low.rm_startblock = bno; memset(&high, 0xFF, sizeof(high)); @@ -2739,11 +2737,12 @@ xfs_rmap_has_other_keys( error = xfs_rmap_query_range(cur, &low, &high, xfs_rmap_has_other_keys_helper, &rks); - if (error < 0) - return error; + if (error == -ECANCELED) { + *has_rmap = true; + return 0; + } - *has_rmap = rks.has_rmap; - return 0; + return error; } const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index fc78efa52c94..beb81c84a937 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -14,6 +14,7 @@ #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" #include "xfs_trace.h" @@ -51,7 +52,7 @@ xfs_rmapbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno); + cur->bc_ag.agbp, cur->bc_ag.agno); } STATIC void @@ -60,18 +61,16 @@ xfs_rmapbt_set_root( union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; int btnum = cur->bc_btnum; - struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_roots[btnum] = ptr->s; be32_add_cpu(&agf->agf_levels[btnum], inc); pag->pagf_levels[btnum] += inc; - xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -83,25 +82,25 @@ xfs_rmapbt_alloc_block( union xfs_btree_ptr *new, int *stat) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; int error; xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. 
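The xfs_rmap_has_other_keys() rework above replaces a has_rmap flag in the helper's private data with the -ECANCELED early-exit idiom: the helper returns the sentinel to stop the range query, and the caller translates that sentinel into a boolean. A minimal userspace model of the idiom, with stand-in types and a plain array in place of the btree query:

    /*
     * Minimal model of the -ECANCELED idiom adopted by
     * xfs_rmap_has_other_keys() in the hunks above.
     */
    #include <errno.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct rec { unsigned long owner; };

    static int has_other_keys_helper(const struct rec *rec, void *priv)
    {
        unsigned long *want_owner = priv;

        if (rec->owner == *want_owner)
            return 0;           /* matches our key, keep going */
        return -ECANCELED;      /* another owner found: stop early */
    }

    static int query_range(const struct rec *recs, size_t n,
                           int (*fn)(const struct rec *, void *), void *priv)
    {
        for (size_t i = 0; i < n; i++) {
            int error = fn(&recs[i], priv);
            if (error)
                return error;
        }
        return 0;
    }

    static int has_other_keys(const struct rec *recs, size_t n,
                              unsigned long owner, bool *has_rmap)
    {
        int error;

        *has_rmap = false;
        error = query_range(recs, n, has_other_keys_helper, &owner);
        if (error == -ECANCELED) {
            *has_rmap = true;   /* sentinel means "found one" */
            return 0;
        }
        return error;           /* real errors pass through */
    }

    int main(void)
    {
        struct rec recs[] = { { 1 }, { 1 }, { 2 } };
        bool other;

        if (!has_other_keys(recs, 3, 1, &other))
            printf("other owners present: %s\n", other ? "yes" : "no");
        return 0;
    }

The upside of the idiom is that the query can terminate as soon as the answer is known, and the helper needs no extra state beyond its return value.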
*/ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; - trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_ag.agno, bno, 1); if (bno == NULLAGBLOCK) { *stat = 0; return 0; } - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); xfs_trans_agbtree_delta(cur->bc_tp, 1); @@ -109,7 +108,7 @@ xfs_rmapbt_alloc_block( be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno); + xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_ag.agno); *stat = 1; return 0; @@ -120,13 +119,14 @@ xfs_rmapbt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag; xfs_agblock_t bno; int error; bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); - trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno, bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); @@ -138,8 +138,8 @@ xfs_rmapbt_free_block( XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); - xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno); - + pag = cur->bc_ag.agbp->b_pag; + xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); return 0; } @@ -215,9 +215,9 @@ xfs_rmapbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_roots[cur->bc_btnum]; } @@ -448,37 +448,83 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = { .recs_inorder = xfs_rmapbt_recs_inorder, }; -/* - * Allocate a new allocation btree cursor. - */ -struct xfs_btree_cur * -xfs_rmapbt_init_cursor( +static struct xfs_btree_cur * +xfs_rmapbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_buf *agbp, xfs_agnumber_t agno) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); struct xfs_btree_cur *cur; - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); cur->bc_tp = tp; cur->bc_mp = mp; /* Overlapping btree; 2 keys per pointer. */ cur->bc_btnum = XFS_BTNUM_RMAP; cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); + cur->bc_ag.agno = agno; cur->bc_ops = &xfs_rmapbt_ops; + + return cur; +} + +/* Create a new reverse mapping btree cursor. 
*/ +struct xfs_btree_cur * +xfs_rmapbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_btree_cur *cur; + + cur = xfs_rmapbt_init_common(mp, tp, agno); cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); + cur->bc_ag.agbp = agbp; + return cur; +} - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; +/* Create a new reverse mapping btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_rmapbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno) +{ + struct xfs_btree_cur *cur; + cur = xfs_rmapbt_init_common(mp, NULL, agno); + xfs_btree_stage_afakeroot(cur, afake); return cur; } /* + * Install a new reverse mapping btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. + */ +void +xfs_rmapbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); + agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); + agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS | + XFS_AGF_RMAP_BLOCKS); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops); +} + +/* * Calculate number of records in an rmap btree block. */ int @@ -569,7 +615,7 @@ xfs_rmapbt_calc_reserves( if (error) return error; - agf = XFS_BUF_TO_AGF(agbp); + agf = agbp->b_addr; agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_rmap_blocks); xfs_trans_brelse(tp, agbp); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 820d668b063d..115c3455a734 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -9,6 +9,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xbtree_afakeroot; /* rmaps only exist on crc enabled filesystems */ #define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN @@ -43,6 +44,10 @@ struct xfs_mount; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, xfs_agnumber_t agno); +struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno); +void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); int xfs_rmapbt_maxrecs(int blocklen, int leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index f42c74cb8be5..1d9fa8a300f1 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -66,11 +66,11 @@ xfs_rtbuf_get( ip = issum ? 
mp->m_rsumip : mp->m_rbmip; - error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0); if (error) return error; - if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_real_extent(&map))) + if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) return -EFSCORRUPTED; ASSERT(map.br_startblock != NULLFSBLOCK); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 2f60fc3c99a0..ae9aaf1f34bf 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -220,7 +220,7 @@ xfs_validate_sb_common( struct xfs_buf *bp, struct xfs_sb *sbp) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; uint32_t agcount = 0; uint32_t rem; @@ -243,7 +243,7 @@ xfs_validate_sb_common( } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, -"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits."); +"Superblock earlier than Version 5 has XFS_{P|G}QUOTA_{ENFD|CHKD} bits."); return -EFSCORRUPTED; } @@ -328,6 +328,38 @@ xfs_validate_sb_common( return -EFSCORRUPTED; } + /* Validate the realtime geometry; stolen from xfs_repair */ + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { + xfs_notice(mp, + "realtime extent sanity check failed"); + return -EFSCORRUPTED; + } + + if (sbp->sb_rblocks == 0) { + if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || + sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) { + xfs_notice(mp, + "realtime zeroed geometry check failed"); + return -EFSCORRUPTED; + } + } else { + uint64_t rexts; + uint64_t rbmblocks; + + rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize); + rbmblocks = howmany_64(sbp->sb_rextents, + NBBY * sbp->sb_blocksize); + + if (sbp->sb_rextents != rexts || + sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) || + sbp->sb_rbmblocks != rbmblocks) { + xfs_notice(mp, + "realtime geometry sanity check failed"); + return -EFSCORRUPTED; + } + } + if (sbp->sb_unit) { if (!xfs_sb_version_hasdalign(sbp) || sbp->sb_unit > sbp->sb_width || @@ -568,7 +600,7 @@ xfs_sb_quota_to_disk( * disk. If neither are active, we should NULL the inode. * * In all cases, the separate pquotino must remain 0 because it - * it beyond the "end" of the valid non-pquotino superblock. + * is beyond the "end" of the valid non-pquotino superblock. */ if (from->sb_qflags & XFS_GQUOTA_ACCT) to->sb_gquotino = cpu_to_be64(from->sb_gquotino); @@ -681,7 +713,7 @@ xfs_sb_read_verify( { struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; int error; /* @@ -707,7 +739,7 @@ xfs_sb_read_verify( * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ - __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); + __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; @@ -730,7 +762,7 @@ static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { /* XFS filesystem, verify noisily! */ @@ -748,13 +780,14 @@ xfs_sb_write_verify( struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_dsb *dsb = bp->b_addr; int error; /* * Check all the superblock fields. 
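The realtime geometry validation added to xfs_validate_sb_common() above works by recomputing the derived rt fields from first principles and requiring the superblock copies to match. A hedged standalone sketch of that cross-check; the struct, sample numbers, and the howmany/highbit helpers are reimplemented here for illustration (the kernel's xfs_highbit32() returns -1 for zero, which this toy does not model):

    /*
     * Userspace sketch of the realtime geometry sanity check added in
     * the xfs_sb.c hunk above.
     */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NBBY 8

    static uint32_t highbit32(uint32_t v)   /* index of highest set bit */
    {
        uint32_t r = 0;

        while (v >>= 1)
            r++;
        return r;
    }

    struct rt_geom {
        uint64_t rblocks;   /* rt device size in fs blocks */
        uint32_t rextsize;  /* fs blocks per rt extent */
        uint64_t rextents;  /* claimed extent count */
        uint32_t rbmblocks; /* claimed rt bitmap blocks */
        uint32_t rextslog;  /* claimed log2(rextents) */
        uint32_t blocksize;
    };

    static bool rt_geometry_ok(const struct rt_geom *g)
    {
        uint64_t rexts = g->rblocks / g->rextsize;
        uint64_t rbmblocks = (g->rextents + NBBY * g->blocksize - 1) /
                             (NBBY * g->blocksize);

        return g->rextents == rexts &&
               g->rextslog == highbit32(g->rextents) &&
               g->rbmblocks == rbmblocks;
    }

    int main(void)
    {
        struct rt_geom g = {
            .rblocks = 1048576, .rextsize = 4, .rextents = 262144,
            .rbmblocks = 8, .rextslog = 18, .blocksize = 4096,
        };

        printf("rt geometry sane: %s\n", rt_geometry_ok(&g) ? "yes" : "no");
        return 0;
    }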
Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ - __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); + __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; @@ -766,7 +799,7 @@ xfs_sb_write_verify( return; if (bip) - XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + dsb->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); return; @@ -927,7 +960,7 @@ xfs_log_sb( mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } @@ -1007,7 +1040,7 @@ xfs_update_secondary_sbs( bp->b_ops = &xfs_sb_buf_ops; xfs_buf_oneshot(bp); xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_buf_delwri_queue(bp, &buffer_list); xfs_buf_relse(bp); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c45acbd3add9..708feb8eac76 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -65,6 +65,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ #define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ +#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */ /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 3b8260ca7d1b..594bc447a7dd 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -204,16 +204,12 @@ xfs_failaddr_t xfs_symlink_shortform_verify( struct xfs_inode *ip) { - char *sfp; - char *endp; - struct xfs_ifork *ifp; - int size; - - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - sfp = (char *)ifp->if_u1.if_data; - size = ifp->if_bytes; - endp = sfp + size; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + char *sfp = (char *)ifp->if_u1.if_data; + int size = ifp->if_bytes; + char *endp = sfp + size; + + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); /* * Zero length symlinks should never occur in memory as they are diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 2b8ccb5b975d..b7e222befb08 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -8,6 +8,8 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -27,7 +29,7 @@ xfs_trans_ijoin( struct xfs_inode *ip, uint lock_flags) { - xfs_inode_log_item_t *iip; + struct xfs_inode_log_item *iip; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (ip->i_itemp == NULL) @@ -36,6 +38,7 @@ xfs_trans_ijoin( ASSERT(iip->ili_lock_flags == 0); iip->ili_lock_flags = lock_flags; + ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); /* * Get a log_item_desc to point at the new item. 
@@ -71,24 +74,35 @@ xfs_trans_ichgtime( } /* - * This is called to mark the fields indicated in fieldmask as needing - * to be logged when the transaction is committed. The inode must - * already be associated with the given transaction. + * This is called to mark the fields indicated in fieldmask as needing to be + * logged when the transaction is committed. The inode must already be + * associated with the given transaction. * - * The values for fieldmask are defined in xfs_inode_item.h. We always - * log all of the core inode if any of it has changed, and we always log - * all of the inline data/extents/b-tree root if any of them has changed. + * The values for fieldmask are defined in xfs_inode_item.h. We always log all + * of the core inode if any of it has changed, and we always log all of the + * inline data/extents/b-tree root if any of them has changed. + * + * Grab and pin the cluster buffer associated with this inode to avoid RMW + * cycles at inode writeback time. Avoid the need to add error handling to every + * xfs_trans_log_inode() call by shutting down on read error. This will cause + * transactions to fail and everything to error out, just like if we return a + * read error in a dirty transaction and cancel it. */ void xfs_trans_log_inode( - xfs_trans_t *tp, - xfs_inode_t *ip, - uint flags) + struct xfs_trans *tp, + struct xfs_inode *ip, + uint flags) { - struct inode *inode = VFS_I(ip); + struct xfs_inode_log_item *iip = ip->i_itemp; + struct inode *inode = VFS_I(ip); + uint iversion_flags = 0; - ASSERT(ip->i_itemp != NULL); + ASSERT(iip); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); + + tp->t_flags |= XFS_TRANS_DIRTY; /* * Don't bother with i_lock for the I_DIRTY_TIME check here, as races @@ -96,22 +110,13 @@ xfs_trans_log_inode( * to log the timestamps, or will clear already cleared fields in the * worst case. */ - if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) { + if (inode->i_state & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); } /* - * Record the specific change for fdatasync optimisation. This - * allows fdatasync to skip log forces for inodes that are only - * timestamp dirty. We do this before the change count so that - * the core being logged in this case does not impact on fdatasync - * behaviour. - */ - ip->i_itemp->ili_fsync_fields |= flags; - - /* * First time we log the inode in a transaction, bump the inode change * counter if it is configured for this to occur. While we have the * inode locked exclusively for metadata modification, we can usually @@ -120,23 +125,64 @@ xfs_trans_log_inode( * set however, then go ahead and bump the i_version counter * unconditionally. */ - if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) && - IS_I_VERSION(VFS_I(ip))) { - if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE)) - flags |= XFS_ILOG_CORE; + if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) { + if (IS_I_VERSION(inode) && + inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE)) + iversion_flags = XFS_ILOG_CORE; } - tp->t_flags |= XFS_TRANS_DIRTY; + /* + * Record the specific change for fdatasync optimisation. This allows + * fdatasync to skip log forces for inodes that are only timestamp + * dirty. 
+ */ + spin_lock(&iip->ili_lock); + iip->ili_fsync_fields |= flags; + + if (!iip->ili_item.li_buf) { + struct xfs_buf *bp; + int error; + + /* + * We hold the ILOCK here, so this inode is not going to be + * flushed while we are here. Further, because there is no + * buffer attached to the item, we know that there is no IO in + * progress, so nothing will clear the ili_fields while we read + * in the buffer. Hence we can safely drop the spin lock and + * read the buffer knowing that the state will not change from + * here. + */ + spin_unlock(&iip->ili_lock); + error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, NULL, + &bp, 0); + if (error) { + xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR); + return; + } + + /* + * We need an explicit buffer reference for the log item but + * don't want the buffer to remain attached to the transaction. + * Hold the buffer but release the transaction reference once + * we've attached the inode log item to the buffer log item + * list. + */ + xfs_buf_hold(bp); + spin_lock(&iip->ili_lock); + iip->ili_item.li_buf = bp; + bp->b_flags |= _XBF_INODES; + list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); + xfs_trans_brelse(tp, bp); + } /* - * Always OR in the bits from the ili_last_fields field. - * This is to coordinate with the xfs_iflush() and xfs_iflush_done() - * routines in the eventual clearing of the ili_fields bits. - * See the big comment in xfs_iflush() for an explanation of - * this coordination mechanism. + * Always OR in the bits from the ili_last_fields field. This is to + * coordinate with the xfs_iflush() and xfs_iflush_done() routines in + * the eventual clearing of the ili_fields bits. See the big comment in + * xfs_iflush() for an explanation of this coordination mechanism. */ - flags |= ip->i_itemp->ili_last_fields; - ip->i_itemp->ili_fields |= flags; + iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); + spin_unlock(&iip->ili_lock); } int diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 7a9c04920505..d1a0848cb52e 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -187,7 +187,7 @@ xfs_calc_inode_chunk_res( XFS_FSB_TO_B(mp, 1)); if (alloc) { /* icreate tx uses ordered buffers */ - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_sb_version_has_v3inode(&mp->m_sb)) return res; size = XFS_FSB_TO_B(mp, 1); } diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 88221c7a04cc..c6df01a2a158 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -57,7 +57,7 @@ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ (M_IGEO(mp)->ialloc_blks + \ - (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \ + ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \ (M_IGEO(mp)->inobt_maxlevels - 1))) /* diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index ba0f747c82e8..e9bcf1faa183 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -92,7 +92,7 @@ xchk_superblock( if (!xchk_process_error(sc, agno, XFS_SB_BLOCK(mp), &error)) return error; - sb = XFS_BUF_TO_SBP(bp); + sb = bp->b_addr; /* * Verify the geometries match. 
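The xfs_trans_log_inode() change above pins the inode cluster buffer by giving the log item its own reference (xfs_buf_hold) before releasing the transaction's reference (xfs_trans_brelse), so the buffer cannot disappear between logging and writeback. As a very rough illustration of that ownership transfer, here is a toy model with reference counting reduced to a plain int; everything in it is a stand-in:

    /*
     * Toy model of the reference handoff in the xfs_trans_log_inode()
     * hunk above: hold a reference for the log item, attach the
     * buffer, then drop the transaction's reference.
     */
    #include <assert.h>
    #include <stdio.h>

    struct buf {
        int refcount;
    };

    struct log_item {
        struct buf *li_buf;     /* models ili_item.li_buf */
    };

    static void buf_hold(struct buf *bp) { bp->refcount++; }
    static void buf_rele(struct buf *bp) { bp->refcount--; }

    static void attach_buf_to_item(struct log_item *lip, struct buf *bp)
    {
        buf_hold(bp);           /* reference owned by the log item */
        lip->li_buf = bp;
        buf_rele(bp);           /* models xfs_trans_brelse() dropping
                                 * the transaction's reference */
    }

    int main(void)
    {
        struct buf bp = { .refcount = 1 };  /* ref held by transaction */
        struct log_item iip = { 0 };

        attach_buf_to_item(&iip, &bp);
        assert(bp.refcount == 1 && iip.li_buf == &bp);
        printf("buffer pinned by log item, refcount %d\n", bp.refcount);
        return 0;
    }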
Fields that are permanently @@ -358,7 +358,7 @@ static inline void xchk_agf_xref_freeblks( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; xfs_extlen_t blocks = 0; int error; @@ -378,7 +378,7 @@ static inline void xchk_agf_xref_cntbt( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; xfs_agblock_t agbno; xfs_extlen_t blocks; int have; @@ -410,7 +410,7 @@ STATIC void xchk_agf_xref_btreeblks( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t blocks; xfs_agblock_t btreeblks; @@ -456,7 +456,7 @@ static inline void xchk_agf_xref_refcblks( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; xfs_agblock_t blocks; int error; @@ -525,7 +525,7 @@ xchk_agf( goto out; xchk_buffer_recheck(sc, sc->sa.agf_bp); - agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + agf = sc->sa.agf_bp->b_addr; /* Check the AG length */ eoag = be32_to_cpu(agf->agf_length); @@ -711,7 +711,7 @@ xchk_agfl( goto out; /* Allocate buffer to ensure uniqueness of AGFL entries. */ - agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + agf = sc->sa.agf_bp->b_addr; agflcount = be32_to_cpu(agf->agf_flcount); if (agflcount > xfs_agfl_size(sc->mp)) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); @@ -728,7 +728,7 @@ xchk_agfl( } /* Check the blocks in the AGFL. */ - error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), + error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sc->sa.agfl_bp, xchk_agfl_block, &sai); if (error == -ECANCELED) { error = 0; @@ -765,7 +765,7 @@ static inline void xchk_agi_xref_icounts( struct xfs_scrub *sc) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; xfs_agino_t icount; xfs_agino_t freecount; int error; @@ -834,7 +834,7 @@ xchk_agi( goto out; xchk_buffer_recheck(sc, sc->sa.agi_bp); - agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); + agi = sc->sa.agi_bp->b_addr; /* Check the AG length */ eoag = be32_to_cpu(agi->agi_length); diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index d5e6db9af434..bca2ab1d4be9 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -49,7 +49,7 @@ xrep_superblock( /* Copy AG 0's superblock to this one. */ xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); @@ -140,7 +140,7 @@ xrep_agf_find_btrees( struct xrep_find_ag_btree *fab, struct xfs_buf *agfl_bp) { - struct xfs_agf *old_agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *old_agf = agf_bp->b_addr; int error; /* Go find the root data. 
*/ @@ -181,7 +181,7 @@ xrep_agf_init_header( struct xfs_agf *old_agf) { struct xfs_mount *mp = sc->mp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; memcpy(old_agf, agf, sizeof(*old_agf)); memset(agf, 0, BBTOB(agf_bp->b_length)); @@ -238,7 +238,7 @@ xrep_agf_calc_from_btrees( { struct xrep_agf_allocbt raa = { .sc = sc }; struct xfs_btree_cur *cur = NULL; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t btreeblks; xfs_agblock_t blocks; @@ -302,7 +302,7 @@ xrep_agf_commit_new( struct xfs_buf *agf_bp) { struct xfs_perag *pag; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; /* Trigger fdblocks recalculation */ xfs_force_summary_recalc(sc->mp); @@ -376,7 +376,7 @@ xrep_agf( if (error) return error; agf_bp->b_ops = &xfs_agf_buf_ops; - agf = XFS_BUF_TO_AGF(agf_bp); + agf = agf_bp->b_addr; /* * Load the AGFL so that we can screen out OWN_AG blocks that are on @@ -395,7 +395,7 @@ xrep_agf( * Spot-check the AGFL blocks; if they're obviously corrupt then * there's nothing we can do but bail out. */ - error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp, + error = xfs_agfl_walk(sc->mp, agf_bp->b_addr, agfl_bp, xrep_agf_check_agfl_block, sc); if (error) return error; @@ -429,10 +429,10 @@ out_revert: struct xrep_agfl { /* Bitmap of other OWN_AG metadata blocks. */ - struct xfs_bitmap agmetablocks; + struct xbitmap agmetablocks; /* Bitmap of free space. */ - struct xfs_bitmap *freesp; + struct xbitmap *freesp; struct xfs_scrub *sc; }; @@ -453,14 +453,14 @@ xrep_agfl_walk_rmap( /* Record all the OWN_AG blocks. */ if (rec->rm_owner == XFS_RMAP_OWN_AG) { - fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, rec->rm_startblock); - error = xfs_bitmap_set(ra->freesp, fsb, rec->rm_blockcount); + error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount); if (error) return error; } - return xfs_bitmap_set_btcur_path(&ra->agmetablocks, cur); + return xbitmap_set_btcur_path(&ra->agmetablocks, cur); } /* @@ -476,19 +476,17 @@ STATIC int xrep_agfl_collect_blocks( struct xfs_scrub *sc, struct xfs_buf *agf_bp, - struct xfs_bitmap *agfl_extents, + struct xbitmap *agfl_extents, xfs_agblock_t *flcount) { struct xrep_agfl ra; struct xfs_mount *mp = sc->mp; struct xfs_btree_cur *cur; - struct xfs_bitmap_range *br; - struct xfs_bitmap_range *n; int error; ra.sc = sc; ra.freesp = agfl_extents; - xfs_bitmap_init(&ra.agmetablocks); + xbitmap_init(&ra.agmetablocks); /* Find all space used by the free space btrees & rmapbt. */ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); @@ -500,7 +498,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, XFS_BTNUM_BNO); - error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur); + error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) goto err; xfs_btree_del_cursor(cur, error); @@ -508,7 +506,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the cntbt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, XFS_BTNUM_CNT); - error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur); + error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) goto err; @@ -518,8 +516,8 @@ xrep_agfl_collect_blocks( * Drop the freesp meta blocks that are in use by btrees. * The remaining blocks /should/ be AGFL blocks. 
*/ - error = xfs_bitmap_disunion(agfl_extents, &ra.agmetablocks); - xfs_bitmap_destroy(&ra.agmetablocks); + error = xbitmap_disunion(agfl_extents, &ra.agmetablocks); + xbitmap_destroy(&ra.agmetablocks); if (error) return error; @@ -527,18 +525,12 @@ xrep_agfl_collect_blocks( * Calculate the new AGFL size. If we found more blocks than fit in * the AGFL we'll free them later. */ - *flcount = 0; - for_each_xfs_bitmap_extent(br, n, agfl_extents) { - *flcount += br->len; - if (*flcount > xfs_agfl_size(mp)) - break; - } - if (*flcount > xfs_agfl_size(mp)) - *flcount = xfs_agfl_size(mp); + *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents), + xfs_agfl_size(mp)); return 0; err: - xfs_bitmap_destroy(&ra.agmetablocks); + xbitmap_destroy(&ra.agmetablocks); xfs_btree_del_cursor(cur, error); return error; } @@ -550,7 +542,7 @@ xrep_agfl_update_agf( struct xfs_buf *agf_bp, xfs_agblock_t flcount) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; ASSERT(flcount <= xfs_agfl_size(sc->mp)); @@ -573,13 +565,13 @@ STATIC void xrep_agfl_init_header( struct xfs_scrub *sc, struct xfs_buf *agfl_bp, - struct xfs_bitmap *agfl_extents, + struct xbitmap *agfl_extents, xfs_agblock_t flcount) { struct xfs_mount *mp = sc->mp; __be32 *agfl_bno; - struct xfs_bitmap_range *br; - struct xfs_bitmap_range *n; + struct xbitmap_range *br; + struct xbitmap_range *n; struct xfs_agfl *agfl; xfs_agblock_t agbno; unsigned int fl_off; @@ -602,8 +594,8 @@ xrep_agfl_init_header( * step. */ fl_off = 0; - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agfl_bp); - for_each_xfs_bitmap_extent(br, n, agfl_extents) { + agfl_bno = xfs_buf_to_agfl_bno(agfl_bp); + for_each_xbitmap_extent(br, n, agfl_extents) { agbno = XFS_FSB_TO_AGBNO(mp, br->start); trace_xrep_agfl_insert(mp, sc->sa.agno, agbno, br->len); @@ -637,7 +629,7 @@ int xrep_agfl( struct xfs_scrub *sc) { - struct xfs_bitmap agfl_extents; + struct xbitmap agfl_extents; struct xfs_mount *mp = sc->mp; struct xfs_buf *agf_bp; struct xfs_buf *agfl_bp; @@ -649,7 +641,7 @@ xrep_agfl( return -EOPNOTSUPP; xchk_perag_get(sc->mp, &sc->sa); - xfs_bitmap_init(&agfl_extents); + xbitmap_init(&agfl_extents); /* * Read the AGF so that we can query the rmapbt. We hope that there's @@ -696,10 +688,10 @@ xrep_agfl( goto err; /* Dump any AGFL overflow. */ - return xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, + error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); err: - xfs_bitmap_destroy(&agfl_extents); + xbitmap_destroy(&agfl_extents); return error; } @@ -761,7 +753,7 @@ xrep_agi_init_header( struct xfs_buf *agi_bp, struct xfs_agi *old_agi) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp); + struct xfs_agi *agi = agi_bp->b_addr; struct xfs_mount *mp = sc->mp; memcpy(old_agi, agi, sizeof(*old_agi)); @@ -807,7 +799,7 @@ xrep_agi_calc_from_btrees( struct xfs_buf *agi_bp) { struct xfs_btree_cur *cur; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp); + struct xfs_agi *agi = agi_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agino_t count; xfs_agino_t freecount; @@ -835,7 +827,7 @@ xrep_agi_commit_new( struct xfs_buf *agi_bp) { struct xfs_perag *pag; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp); + struct xfs_agi *agi = agi_bp->b_addr; /* Trigger inode count recalculation */ xfs_force_summary_recalc(sc->mp); @@ -892,7 +884,7 @@ xrep_agi( if (error) return error; agi_bp->b_ops = &xfs_agi_buf_ops; - agi = XFS_BUF_TO_AGI(agi_bp); + agi = agi_bp->b_addr; /* Find the AGI btree roots. 
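The AGFL sizing simplification above replaces an open-coded walk with min(xbitmap_hweight(), xfs_agfl_size()). A short hedged sketch of that computation, with the range list flattened to a plain array instead of the kernel's list_head-based xbitmap and a made-up agfl_size:

    /*
     * Sketch of the xbitmap_hweight()-based AGFL sizing used by
     * xrep_agfl_collect_blocks() above.
     */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct range {
        uint64_t start;
        uint64_t len;
    };

    /* models xbitmap_hweight(): total blocks covered by the bitmap */
    static uint64_t bitmap_hweight(const struct range *r, size_t n)
    {
        uint64_t ret = 0;

        for (size_t i = 0; i < n; i++)
            ret += r[i].len;
        return ret;
    }

    int main(void)
    {
        struct range extents[] = { { 10, 4 }, { 100, 2 }, { 500, 9 } };
        uint64_t agfl_size = 8;     /* stand-in for xfs_agfl_size() */
        uint64_t hweight = bitmap_hweight(extents, 3);
        uint64_t flcount = hweight < agfl_size ? hweight : agfl_size;

        printf("flcount = %llu (found %llu blocks)\n",
               (unsigned long long)flcount, (unsigned long long)hweight);
        return 0;
    }

Any blocks beyond flcount are the "AGFL overflow" that xrep_agfl() later reaps.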
*/ error = xrep_agi_find_btrees(sc, fab); diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 5533e48e605d..73d924e47565 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -94,7 +94,7 @@ xchk_allocbt_rec( union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t bno; xfs_extlen_t len; diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index d9f0dd444b80..9faddb334a2c 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -98,7 +98,7 @@ struct xchk_xattr { /* * Check that an extended attribute key can be looked up by hash. * - * We use the XFS attribute list iterator (i.e. xfs_attr_list_int_ilocked) + * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked) * to call this function for every attribute key in an inode. Once * we're here, we load the attribute value to see if any errors happen, * or if we get more or less data than we expected. @@ -147,11 +147,8 @@ xchk_xattr_listent( return; } - args.flags = ATTR_KERNOTIME; - if (flags & XFS_ATTR_ROOT) - args.flags |= ATTR_ROOT; - else if (flags & XFS_ATTR_SECURE) - args.flags |= ATTR_SECURE; + args.op_flags = XFS_DA_OP_NOTIME; + args.attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK; args.geo = context->dp->i_mount->m_attr_geo; args.whichfork = XFS_ATTR_FORK; args.dp = context->dp; @@ -162,7 +159,10 @@ xchk_xattr_listent( args.value = xchk_xattr_valuebuf(sx->sc); args.valuelen = valuelen; - error = xfs_attr_get_ilocked(context->dp, &args); + error = xfs_attr_get_ilocked(&args); + /* ENODATA means the hash lookup failed and the attr is bad */ + if (error == -ENODATA) + error = -EFSCORRUPTED; if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, &error)) goto fail_xref; @@ -474,7 +474,6 @@ xchk_xattr( struct xfs_scrub *sc) { struct xchk_xattr sx; - struct attrlist_cursor_kern cursor = { 0 }; xfs_dablk_t last_checked = -1U; int error = 0; @@ -493,11 +492,10 @@ xchk_xattr( /* Check that every attr key can also be looked up by hash. */ sx.context.dp = sc->ip; - sx.context.cursor = &cursor; sx.context.resynch = 1; sx.context.put_listent = xchk_xattr_listent; sx.context.tp = sc->tp; - sx.context.flags = ATTR_INCOMPLETE; + sx.context.allow_incomplete = true; sx.sc = sc; /* @@ -516,7 +514,7 @@ xchk_xattr( * iteration, which doesn't really follow the usual buffer * locking order. */ - error = xfs_attr_list_int_ilocked(&sx.context); + error = xfs_attr_list_ilocked(&sx.context); if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) goto out; diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 18a684e18a69..f88694f22d05 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -18,14 +18,14 @@ * This is the logical equivalent of bitmap |= mask(start, len). */ int -xfs_bitmap_set( - struct xfs_bitmap *bitmap, +xbitmap_set( + struct xbitmap *bitmap, uint64_t start, uint64_t len) { - struct xfs_bitmap_range *bmr; + struct xbitmap_range *bmr; - bmr = kmem_alloc(sizeof(struct xfs_bitmap_range), KM_MAYFAIL); + bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); if (!bmr) return -ENOMEM; @@ -39,13 +39,13 @@ xfs_bitmap_set( /* Free everything related to this bitmap. 
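The attr scrubber now maps -ENODATA from the hash lookup to -EFSCORRUPTED: the listing walk already proved the name exists, so a failed lookup can only mean the on-disk structure is inconsistent. A tiny illustrative model of that convention, assuming EFSCORRUPTED aliases EUCLEAN as it does in XFS:

	#include <errno.h>
	#include <stdio.h>

	#define EFSCORRUPTED EUCLEAN /* XFS's definition; assumed here */

	/*
	 * If the walk itself handed us this name, a failed lookup cannot be
	 * a plain "no such attribute": the structure must be inconsistent.
	 */
	static int scrub_lookup_error(int error)
	{
		if (error == -ENODATA)
			return -EFSCORRUPTED;
		return error;
	}

	int main(void)
	{
		printf("%d -> %d\n", -ENODATA, scrub_lookup_error(-ENODATA));
		return 0;
	}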
*/ void -xfs_bitmap_destroy( - struct xfs_bitmap *bitmap) +xbitmap_destroy( + struct xbitmap *bitmap) { - struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; + struct xbitmap_range *bmr; + struct xbitmap_range *n; - for_each_xfs_bitmap_extent(bmr, n, bitmap) { + for_each_xbitmap_extent(bmr, n, bitmap) { list_del(&bmr->list); kmem_free(bmr); } @@ -53,24 +53,24 @@ xfs_bitmap_destroy( /* Set up a per-AG block bitmap. */ void -xfs_bitmap_init( - struct xfs_bitmap *bitmap) +xbitmap_init( + struct xbitmap *bitmap) { INIT_LIST_HEAD(&bitmap->list); } /* Compare two btree extents. */ static int -xfs_bitmap_range_cmp( +xbitmap_range_cmp( void *priv, struct list_head *a, struct list_head *b) { - struct xfs_bitmap_range *ap; - struct xfs_bitmap_range *bp; + struct xbitmap_range *ap; + struct xbitmap_range *bp; - ap = container_of(a, struct xfs_bitmap_range, list); - bp = container_of(b, struct xfs_bitmap_range, list); + ap = container_of(a, struct xbitmap_range, list); + bp = container_of(b, struct xbitmap_range, list); if (ap->start > bp->start) return 1; @@ -96,14 +96,14 @@ xfs_bitmap_range_cmp( #define LEFT_ALIGNED (1 << 0) #define RIGHT_ALIGNED (1 << 1) int -xfs_bitmap_disunion( - struct xfs_bitmap *bitmap, - struct xfs_bitmap *sub) +xbitmap_disunion( + struct xbitmap *bitmap, + struct xbitmap *sub) { struct list_head *lp; - struct xfs_bitmap_range *br; - struct xfs_bitmap_range *new_br; - struct xfs_bitmap_range *sub_br; + struct xbitmap_range *br; + struct xbitmap_range *new_br; + struct xbitmap_range *sub_br; uint64_t sub_start; uint64_t sub_len; int state; @@ -113,8 +113,8 @@ xfs_bitmap_disunion( return 0; ASSERT(!list_empty(&sub->list)); - list_sort(NULL, &bitmap->list, xfs_bitmap_range_cmp); - list_sort(NULL, &sub->list, xfs_bitmap_range_cmp); + list_sort(NULL, &bitmap->list, xbitmap_range_cmp); + list_sort(NULL, &sub->list, xbitmap_range_cmp); /* * Now that we've sorted both lists, we iterate bitmap once, rolling @@ -124,11 +124,11 @@ xfs_bitmap_disunion( * list traversal is similar to merge sort, but we're deleting * instead. In this manner we avoid O(n^2) operations. */ - sub_br = list_first_entry(&sub->list, struct xfs_bitmap_range, + sub_br = list_first_entry(&sub->list, struct xbitmap_range, list); lp = bitmap->list.next; while (lp != &bitmap->list) { - br = list_entry(lp, struct xfs_bitmap_range, list); + br = list_entry(lp, struct xbitmap_range, list); /* * Advance sub_br and/or br until we find a pair that @@ -181,7 +181,7 @@ xfs_bitmap_disunion( * Deleting from the middle: add the new right extent * and then shrink the left extent. */ - new_br = kmem_alloc(sizeof(struct xfs_bitmap_range), + new_br = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); if (!new_br) { error = -ENOMEM; @@ -247,8 +247,8 @@ out: * blocks going from the leaf towards the root. */ int -xfs_bitmap_set_btcur_path( - struct xfs_bitmap *bitmap, +xbitmap_set_btcur_path( + struct xbitmap *bitmap, struct xfs_btree_cur *cur) { struct xfs_buf *bp; @@ -261,7 +261,7 @@ xfs_bitmap_set_btcur_path( if (!bp) continue; fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn); - error = xfs_bitmap_set(bitmap, fsb, 1); + error = xbitmap_set(bitmap, fsb, 1); if (error) return error; } @@ -271,12 +271,12 @@ xfs_bitmap_set_btcur_path( /* Collect a btree's block in the bitmap. 
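xbitmap_disunion() subtracts one extent list from another by sorting both and walking them once, merge style, so the subtraction stays O(n + m) instead of O(n^2). A self-contained userspace sketch of the same walk over sorted, disjoint ranges, with arrays standing in for the kernel's linked lists:

	#include <stdint.h>
	#include <stdio.h>

	struct xrange { uint64_t start, len; }; /* stand-in for xbitmap_range */

	/* Subtract interval b from a; write the 0, 1 or 2 surviving pieces. */
	static int sub_one(struct xrange a, struct xrange b, struct xrange out[2])
	{
		uint64_t a_end = a.start + a.len, b_end = b.start + b.len;
		int n = 0;

		if (b_end <= a.start || b.start >= a_end) {
			out[n++] = a; /* disjoint: a survives whole */
			return n;
		}
		if (b.start > a.start) /* not LEFT_ALIGNED: keep left piece */
			out[n++] = (struct xrange){ a.start, b.start - a.start };
		if (b_end < a_end) /* not RIGHT_ALIGNED: keep right piece */
			out[n++] = (struct xrange){ b_end, a_end - b_end };
		return n;
	}

	int main(void)
	{
		/* Both inputs sorted by start and internally disjoint. */
		struct xrange bitmap[8] = { {0, 10}, {20, 5} };
		struct xrange sub[] = { {2, 3}, {21, 1} };
		struct xrange result[8], cur, pieces[2];
		int nbitmap = 2, nsub = 2, nres = 0, i = 0, j = 0, n;

		if (nbitmap)
			cur = bitmap[0];
		while (i < nbitmap) {
			if (j < nsub && sub[j].start + sub[j].len <= cur.start) {
				j++; /* subtrahend is entirely left of cur */
			} else if (j == nsub ||
				   sub[j].start >= cur.start + cur.len) {
				result[nres++] = cur; /* nothing overlaps cur */
				if (++i < nbitmap)
					cur = bitmap[i];
			} else {
				n = sub_one(cur, sub[j], pieces);
				if (n == 0) { /* cur fully covered */
					if (++i < nbitmap)
						cur = bitmap[i];
				} else if (n == 2) { /* middle deleted */
					result[nres++] = pieces[0];
					cur = pieces[1];
					j++;
				} else if (pieces[0].start < sub[j].start) {
					result[nres++] = pieces[0]; /* left */
					if (++i < nbitmap)
						cur = bitmap[i];
				} else { /* right piece may meet next sub */
					cur = pieces[0];
					j++;
				}
			}
		}
		for (n = 0; n < nres; n++)
			printf("[%llu, +%llu)\n",
			       (unsigned long long)result[n].start,
			       (unsigned long long)result[n].len);
		return 0;
	}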
*/ STATIC int -xfs_bitmap_collect_btblock( +xbitmap_collect_btblock( struct xfs_btree_cur *cur, int level, void *priv) { - struct xfs_bitmap *bitmap = priv; + struct xbitmap *bitmap = priv; struct xfs_buf *bp; xfs_fsblock_t fsbno; @@ -285,15 +285,30 @@ xfs_bitmap_collect_btblock( return 0; fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn); - return xfs_bitmap_set(bitmap, fsbno, 1); + return xbitmap_set(bitmap, fsbno, 1); } /* Walk the btree and mark the bitmap wherever a btree block is found. */ int -xfs_bitmap_set_btblocks( - struct xfs_bitmap *bitmap, +xbitmap_set_btblocks( + struct xbitmap *bitmap, struct xfs_btree_cur *cur) { - return xfs_btree_visit_blocks(cur, xfs_bitmap_collect_btblock, + return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock, XFS_BTREE_VISIT_ALL, bitmap); } + +/* How many bits are set in this bitmap? */ +uint64_t +xbitmap_hweight( + struct xbitmap *bitmap) +{ + struct xbitmap_range *bmr; + struct xbitmap_range *n; + uint64_t ret = 0; + + for_each_xbitmap_extent(bmr, n, bitmap) + ret += bmr->len; + + return ret; +} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index ae8ecbce6fa6..900646b72de1 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -6,31 +6,32 @@ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xfs_bitmap_range { +struct xbitmap_range { struct list_head list; uint64_t start; uint64_t len; }; -struct xfs_bitmap { +struct xbitmap { struct list_head list; }; -void xfs_bitmap_init(struct xfs_bitmap *bitmap); -void xfs_bitmap_destroy(struct xfs_bitmap *bitmap); +void xbitmap_init(struct xbitmap *bitmap); +void xbitmap_destroy(struct xbitmap *bitmap); -#define for_each_xfs_bitmap_extent(bex, n, bitmap) \ +#define for_each_xbitmap_extent(bex, n, bitmap) \ list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) -#define for_each_xfs_bitmap_block(b, bex, n, bitmap) \ +#define for_each_xbitmap_block(b, bex, n, bitmap) \ list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) \ - for ((b) = bex->start; (b) < bex->start + bex->len; (b)++) + for ((b) = (bex)->start; (b) < (bex)->start + (bex)->len; (b)++) -int xfs_bitmap_set(struct xfs_bitmap *bitmap, uint64_t start, uint64_t len); -int xfs_bitmap_disunion(struct xfs_bitmap *bitmap, struct xfs_bitmap *sub); -int xfs_bitmap_set_btcur_path(struct xfs_bitmap *bitmap, +int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); +int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); +int xbitmap_set_btcur_path(struct xbitmap *bitmap, struct xfs_btree_cur *cur); -int xfs_bitmap_set_btblocks(struct xfs_bitmap *bitmap, +int xbitmap_set_btblocks(struct xbitmap *bitmap, struct xfs_btree_cur *cur); +uint64_t xbitmap_hweight(struct xbitmap *bitmap); #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index fa6ea6407992..955302e7cdde 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -45,9 +45,27 @@ xchk_setup_inode_bmap( */ if (S_ISREG(VFS_I(sc->ip)->i_mode) && sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) { + struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + inode_dio_wait(VFS_I(sc->ip)); - error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping); - if (error) + + /* + * Try to flush all incore state to disk before we examine the + * space mappings for the data fork. Leave accumulated errors + * in the mapping for the writer threads to consume. 
+ * + * On ENOSPC or EIO writeback errors, we continue into the + * extent mapping checks because write failures do not + * necessarily imply anything about the correctness of the file + * metadata. The metadata and the file data could be on + * completely separate devices; a media failure might only + * affect a subset of the disk, etc. We can handle delalloc + * extents in the scrubber, so leaving them in memory is fine. + */ + error = filemap_fdatawrite(mapping); + if (!error) + error = filemap_fdatawait_keep_errors(mapping); + if (error && (error != -ENOSPC && error != -EIO)) goto out; } @@ -374,7 +392,7 @@ xchk_bmapbt_rec( struct xfs_bmbt_irec iext_irec; struct xfs_iext_cursor icur; struct xchk_bmap_info *info = bs->private; - struct xfs_inode *ip = bs->cur->bc_private.b.ip; + struct xfs_inode *ip = bs->cur->bc_ino.ip; struct xfs_buf *bp = NULL; struct xfs_btree_block *block; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork); @@ -501,7 +519,7 @@ xchk_bmap_check_rmap( xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, - cur->bc_private.a.agno, rec->rm_startblock)) + cur->bc_ag.agno, rec->rm_startblock)) xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); if (irec.br_blockcount > rec->rm_blockcount) @@ -566,8 +584,9 @@ xchk_bmap_check_rmaps( struct xfs_scrub *sc, int whichfork) { - loff_t size; + struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); xfs_agnumber_t agno; + bool zero_size; int error; if (!xfs_sb_version_hasrmapbt(&sc->mp->m_sb) || @@ -579,6 +598,8 @@ xchk_bmap_check_rmaps( if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK) return 0; + ASSERT(XFS_IFORK_PTR(sc->ip, whichfork) != NULL); + /* * Only do this for complex maps that are in btree format, or for * situations where we would seem to have a size but zero extents. @@ -586,19 +607,14 @@ xchk_bmap_check_rmaps( * to flag this bmap as corrupt if there are rmaps that need to be * reattached. */ - switch (whichfork) { - case XFS_DATA_FORK: - size = i_size_read(VFS_I(sc->ip)); - break; - case XFS_ATTR_FORK: - size = XFS_IFORK_Q(sc->ip); - break; - default: - size = 0; - break; - } - if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE && - (size == 0 || XFS_IFORK_NEXTENTS(sc->ip, whichfork) > 0)) + + if (whichfork == XFS_DATA_FORK) + zero_size = i_size_read(VFS_I(sc->ip)) == 0; + else + zero_size = false; + + if (ifp->if_format != XFS_DINODE_FMT_BTREE && + (zero_size || ifp->if_nextents > 0)) return 0; for (agno = 0; agno < sc->mp->m_sb.sb_agcount; agno++) { @@ -627,12 +643,14 @@ xchk_bmap( struct xchk_bmap_info info = { NULL }; struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = sc->ip; - struct xfs_ifork *ifp; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t endoff; struct xfs_iext_cursor icur; int error = 0; - ifp = XFS_IFORK_PTR(ip, whichfork); + /* Non-existent forks can be ignored. */ + if (!ifp) + goto out; info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip); info.whichfork = whichfork; @@ -641,9 +659,6 @@ xchk_bmap( switch (whichfork) { case XFS_COW_FORK: - /* Non-existent CoW forks are ignorable. */ - if (!ifp) - goto out; /* No CoW forks on non-reflink inodes/filesystems. 
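The new setup code flushes dirty pagecache before walking the data fork but deliberately keeps scrubbing on ENOSPC or EIO, because a failed data writeback implies nothing about metadata health. A rough userspace analogue using fsync(), purely illustrative:

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/*
	 * Flush file data before inspecting metadata, but treat ENOSPC/EIO
	 * as non-fatal for the inspection itself: the check can still run,
	 * and any error state left behind is the writer's problem.
	 */
	static int flush_before_check(int fd)
	{
		if (fsync(fd) == 0)
			return 0;
		if (errno == ENOSPC || errno == EIO)
			return 0;
		return -errno;
	}

	int main(void)
	{
		int fd = open("/tmp/scrub-demo", O_RDWR | O_CREAT, 0600);

		if (fd < 0)
			return 1;
		if (write(fd, "x", 1) != 1)
			perror("write");
		printf("flush_before_check: %d\n", flush_before_check(fd));
		close(fd);
		return 0;
	}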
*/ if (!xfs_is_reflink_inode(ip)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -651,8 +666,6 @@ xchk_bmap( } break; case XFS_ATTR_FORK: - if (!ifp) - goto out_check_rmap; if (!xfs_sb_version_hasattr(&mp->m_sb) && !xfs_sb_version_hasattr2(&mp->m_sb)) xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -663,7 +676,7 @@ xchk_bmap( } /* Check the fork values */ - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_UUID: case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_LOCAL: @@ -717,7 +730,6 @@ xchk_bmap( goto out; } -out_check_rmap: error = xchk_bmap_check_rmaps(sc, whichfork); if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error)) goto out; diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 97a15b6f2865..e56786f0a13c 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -219,19 +219,21 @@ xchk_da_btree_block_check_sibling( int direction, xfs_dablk_t sibling) { + struct xfs_da_state_path *path = &ds->state->path; + struct xfs_da_state_path *altpath = &ds->state->altpath; int retval; + int plevel; int error; - memcpy(&ds->state->altpath, &ds->state->path, - sizeof(ds->state->altpath)); + memcpy(altpath, path, sizeof(ds->state->altpath)); /* * If the pointer is null, we shouldn't be able to move the upper * level pointer anywhere. */ if (sibling == 0) { - error = xfs_da3_path_shift(ds->state, &ds->state->altpath, - direction, false, &retval); + error = xfs_da3_path_shift(ds->state, altpath, direction, + false, &retval); if (error == 0 && retval == 0) xchk_da_set_corrupt(ds, level); error = 0; @@ -239,27 +241,33 @@ xchk_da_btree_block_check_sibling( } /* Move the alternate cursor one block in the direction given. */ - error = xfs_da3_path_shift(ds->state, &ds->state->altpath, - direction, false, &retval); + error = xfs_da3_path_shift(ds->state, altpath, direction, false, + &retval); if (!xchk_da_process_error(ds, level, &error)) - return error; + goto out; if (retval) { xchk_da_set_corrupt(ds, level); - return error; + goto out; } - if (ds->state->altpath.blk[level].bp) - xchk_buffer_recheck(ds->sc, - ds->state->altpath.blk[level].bp); + if (altpath->blk[level].bp) + xchk_buffer_recheck(ds->sc, altpath->blk[level].bp); /* Compare upper level pointer to sibling pointer. */ - if (ds->state->altpath.blk[level].blkno != sibling) + if (altpath->blk[level].blkno != sibling) xchk_da_set_corrupt(ds, level); - if (ds->state->altpath.blk[level].bp) { - xfs_trans_brelse(ds->dargs.trans, - ds->state->altpath.blk[level].bp); - ds->state->altpath.blk[level].bp = NULL; - } + out: + /* Free all buffers in the altpath that aren't referenced from path. */ + for (plevel = 0; plevel < altpath->active; plevel++) { + if (altpath->blk[plevel].bp == NULL || + (plevel < path->active && + altpath->blk[plevel].bp == path->blk[plevel].bp)) + continue; + + xfs_trans_brelse(ds->dargs.trans, altpath->blk[plevel].bp); + altpath->blk[plevel].bp = NULL; + } + return error; } @@ -460,7 +468,7 @@ xchk_da_btree( int error; /* Skip short format data structures; no btree to scan. */ - if (!xfs_ifork_has_extents(sc->ip, whichfork)) + if (!xfs_ifork_has_extents(XFS_IFORK_PTR(sc->ip, whichfork))) return 0; /* Set up initial da state. 
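The reworked sibling check fixes a buffer leak: instead of releasing only the altpath buffer at the level being checked, it now walks every active altpath level and releases whatever the primary path does not also hold. A compact model of that release loop, with toy types and malloc/free standing in for buffer references:

	#include <stdlib.h>

	struct blk { void *bp; }; /* one path level holding a buffer ref */

	#define MAXLEVELS 5

	static void release_altpath(struct blk *path, int path_active,
				    struct blk *alt, int alt_active)
	{
		int plevel;

		for (plevel = 0; plevel < alt_active; plevel++) {
			if (alt[plevel].bp == NULL ||
			    (plevel < path_active &&
			     alt[plevel].bp == path[plevel].bp))
				continue; /* shared with path, which owns it */
			free(alt[plevel].bp); /* xfs_trans_brelse() stand-in */
			alt[plevel].bp = NULL;
		}
	}

	int main(void)
	{
		struct blk path[MAXLEVELS] = { { malloc(1) }, { malloc(1) } };
		struct blk alt[MAXLEVELS] = { { path[0].bp }, { malloc(1) } };

		release_altpath(path, 2, alt, 2); /* frees only alt[1].bp */
		free(path[0].bp);
		free(path[1].bp);
		return 0;
	}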
*/ @@ -468,9 +476,7 @@ xchk_da_btree( ds.dargs.whichfork = whichfork; ds.dargs.trans = sc->tp; ds.dargs.op_flags = XFS_DA_OP_OKNOENT; - ds.state = xfs_da_state_alloc(); - ds.state->args = &ds.dargs; - ds.state->mp = mp; + ds.state = xfs_da_state_alloc(&ds.dargs); ds.sc = sc; ds.private = private; if (whichfork == XFS_ATTR_FORK) { diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 266da4e4bde6..7c432997edad 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -155,6 +155,9 @@ xchk_dir_actor( xname.type = XFS_DIR3_FT_UNKNOWN; error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL); + /* ENOENT means the hash lookup failed and the dir is corrupt */ + if (error == -ENOENT) + error = -EFSCORRUPTED; if (!xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, &error)) goto out; @@ -500,7 +503,7 @@ xchk_directory_leaf1_bestfree( /* Read the free space block. */ error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - goto out; + return error; xchk_buffer_recheck(sc, bp); leaf = bp->b_addr; @@ -565,9 +568,10 @@ xchk_directory_leaf1_bestfree( xchk_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; + break; } out: + xfs_trans_brelse(sc->tp, bp); return error; } @@ -589,7 +593,7 @@ xchk_directory_free_bestfree( /* Read the free space block */ error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - goto out; + return error; xchk_buffer_recheck(sc, bp); if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { @@ -612,7 +616,7 @@ xchk_directory_free_bestfree( 0, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - break; + goto out; xchk_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); } @@ -620,6 +624,7 @@ xchk_directory_free_bestfree( if (freehdr.nused + stale != freehdr.nvalid) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); out: + xfs_trans_brelse(sc->tp, bp); return error; } @@ -630,7 +635,7 @@ xchk_directory_blocks( { struct xfs_bmbt_irec got; struct xfs_da_args args; - struct xfs_ifork *ifp; + struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; xfs_fileoff_t leaf_lblk; xfs_fileoff_t free_lblk; @@ -642,11 +647,10 @@ xchk_directory_blocks( int error; /* Ignore local format directories. */ - if (sc->ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - sc->ip->i_d.di_format != XFS_DINODE_FMT_BTREE) + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS && + ifp->if_format != XFS_DINODE_FMT_BTREE) return 0; - ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); lblk = XFS_B_TO_FSB(mp, XFS_DIR2_DATA_OFFSET); leaf_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_LEAF_OFFSET); free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 681758704fda..6517d67e8d51 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -104,7 +104,7 @@ xchk_iallocbt_chunk( xfs_extlen_t len) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t bno; bno = XFS_AGINO_TO_AGBNO(mp, agino); @@ -164,7 +164,7 @@ xchk_iallocbt_check_cluster_ifree( * the record, compute which fs inode we're talking about. 
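Both directory bestfree checkers gain a brelse at their single exit label, so the leaf or free-space buffer is released on every path rather than leaking when a loop bails out early. The shape of the fix, sketched with malloc/free as hypothetical stand-ins:

	#include <stdlib.h>

	/*
	 * Once the read succeeds we own the buffer, so every later exit
	 * goes through one release point; a failed read returns before we
	 * take ownership.
	 */
	static int check_bestfree(int simulate_error)
	{
		char *bp = malloc(64); /* xfs_dir3_leaf_read() stand-in */
		int error = 0;

		if (!bp)
			return -1; /* read failed: nothing to release */

		if (simulate_error)
			error = -1; /* per-entry checks may fail partway... */

		free(bp); /* ...but the brelse at out: always runs */
		return error;
	}

	int main(void)
	{
		check_bestfree(0);
		check_bestfree(1);
		return 0;
	}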
*/ agino = irec->ir_startino + irec_ino; - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.agno, agino); irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || @@ -215,7 +215,7 @@ xchk_iallocbt_check_cluster( struct xfs_dinode *dip; struct xfs_buf *cluster_bp; unsigned int nr_inodes; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t agbno; unsigned int cluster_index; uint16_t cluster_mask = 0; @@ -278,8 +278,7 @@ xchk_iallocbt_check_cluster( &XFS_RMAP_OINFO_INODES); /* Grab the inode cluster buffer. */ - error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, - 0, 0); + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, 0); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error)) return error; @@ -426,7 +425,7 @@ xchk_iallocbt_rec( struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agino_t agino; xfs_extlen_t len; int holecount; diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 5705adc43a75..855aa8bcab64 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -90,7 +90,7 @@ xchk_parent_count_parent_dentries( * if there is one. */ lock_mode = xfs_ilock_data_map_shared(parent); - if (parent->i_d.di_nextents > 0) + if (parent->i_df.if_nextents > 0) error = xfs_dir3_data_readahead(parent, 0, 0); xfs_iunlock(parent, lock_mode); if (error) diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 905a34558361..e34ca20ae8e4 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -18,17 +18,17 @@ #include "scrub/common.h" /* Convert a scrub type code to a DQ flag, or return 0 if error. */ -static inline uint +static inline xfs_dqtype_t xchk_quota_to_dqtype( struct xfs_scrub *sc) { switch (sc->sm->sm_type) { case XFS_SCRUB_TYPE_UQUOTA: - return XFS_DQ_USER; + return XFS_DQTYPE_USER; case XFS_SCRUB_TYPE_GQUOTA: - return XFS_DQ_GROUP; + return XFS_DQTYPE_GROUP; case XFS_SCRUB_TYPE_PQUOTA: - return XFS_DQ_PROJ; + return XFS_DQTYPE_PROJ; default: return 0; } @@ -40,7 +40,7 @@ xchk_setup_quota( struct xfs_scrub *sc, struct xfs_inode *ip) { - uint dqtype; + xfs_dqtype_t dqtype; int error; if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp)) @@ -73,26 +73,15 @@ struct xchk_quota_info { STATIC int xchk_quota_item( struct xfs_dquot *dq, - uint dqtype, + xfs_dqtype_t dqtype, void *priv) { struct xchk_quota_info *sqi = priv; struct xfs_scrub *sc = sqi->sc; struct xfs_mount *mp = sc->mp; - struct xfs_disk_dquot *d = &dq->q_core; struct xfs_quotainfo *qi = mp->m_quotainfo; xfs_fileoff_t offset; - unsigned long long bsoft; - unsigned long long isoft; - unsigned long long rsoft; - unsigned long long bhard; - unsigned long long ihard; - unsigned long long rhard; - unsigned long long bcount; - unsigned long long icount; - unsigned long long rcount; xfs_ino_t fs_icount; - xfs_dqid_t id = be32_to_cpu(d->d_id); int error = 0; if (xchk_should_terminate(sc, &error)) @@ -102,27 +91,11 @@ xchk_quota_item( * Except for the root dquot, the actual dquot we got must either have * the same or higher id as we saw before. 
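xchk_quota_to_dqtype() is a total mapping from scrub type codes to the new xfs_dqtype_t constants, returning 0 for anything that is not a quota scrub so callers can reject it up front. An illustrative model with stand-in enums, not the kernel's actual values:

	#include <stdio.h>

	enum scrub_type { SCRUB_UQUOTA, SCRUB_GQUOTA, SCRUB_PQUOTA, SCRUB_OTHER };
	enum dqtype { DQTYPE_NONE = 0, DQTYPE_USER, DQTYPE_GROUP, DQTYPE_PROJ };

	/* Zero means "not a quota scrub"; callers bail with ENOENT/EINVAL. */
	static enum dqtype scrub_to_dqtype(enum scrub_type t)
	{
		switch (t) {
		case SCRUB_UQUOTA:	return DQTYPE_USER;
		case SCRUB_GQUOTA:	return DQTYPE_GROUP;
		case SCRUB_PQUOTA:	return DQTYPE_PROJ;
		default:		return DQTYPE_NONE;
		}
	}

	int main(void)
	{
		printf("%d\n", scrub_to_dqtype(SCRUB_PQUOTA));
		return 0;
	}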
*/ - offset = id / qi->qi_dqperchunk; - if (id && id <= sqi->last_id) + offset = dq->q_id / qi->qi_dqperchunk; + if (dq->q_id && dq->q_id <= sqi->last_id) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - sqi->last_id = id; - - /* Did we get the dquot type we wanted? */ - if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES)) - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - - if (d->d_pad0 != cpu_to_be32(0) || d->d_pad != cpu_to_be16(0)) - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - - /* Check the limits. */ - bhard = be64_to_cpu(d->d_blk_hardlimit); - ihard = be64_to_cpu(d->d_ino_hardlimit); - rhard = be64_to_cpu(d->d_rtb_hardlimit); - - bsoft = be64_to_cpu(d->d_blk_softlimit); - isoft = be64_to_cpu(d->d_ino_softlimit); - rsoft = be64_to_cpu(d->d_rtb_softlimit); + sqi->last_id = dq->q_id; /* * Warn if the hard limits are larger than the fs. @@ -132,25 +105,22 @@ xchk_quota_item( * Complain about corruption if the soft limit is greater than * the hard limit. */ - if (bhard > mp->m_sb.sb_dblocks) + if (dq->q_blk.hardlimit > mp->m_sb.sb_dblocks) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (bsoft > bhard) + if (dq->q_blk.softlimit > dq->q_blk.hardlimit) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - if (ihard > M_IGEO(mp)->maxicount) + if (dq->q_ino.hardlimit > M_IGEO(mp)->maxicount) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (isoft > ihard) + if (dq->q_ino.softlimit > dq->q_ino.hardlimit) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - if (rhard > mp->m_sb.sb_rblocks) + if (dq->q_rtb.hardlimit > mp->m_sb.sb_rblocks) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (rsoft > rhard) + if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); /* Check the resource counts. */ - bcount = be64_to_cpu(d->d_bcount); - icount = be64_to_cpu(d->d_icount); - rcount = be64_to_cpu(d->d_rtbcount); fs_icount = percpu_counter_sum(&mp->m_icount); /* @@ -159,15 +129,15 @@ xchk_quota_item( * if there are no quota limits. */ if (xfs_sb_version_hasreflink(&mp->m_sb)) { - if (mp->m_sb.sb_dblocks < bcount) + if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); } else { - if (mp->m_sb.sb_dblocks < bcount) + if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } - if (icount > fs_icount || rcount > mp->m_sb.sb_rblocks) + if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); /* @@ -175,13 +145,22 @@ xchk_quota_item( * lower limit than the actual usage. However, we flag it for * admin review. 
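With the incore dquot now exposing per-resource {count, softlimit, hardlimit} triplets, each resource gets the same two checks: a hard limit beyond what the filesystem can hold is only a warning, while a soft limit above the hard limit is corruption. A hedged sketch of that rule, with field names assumed:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Modeled loosely on the incore resource triplet the patch adopts. */
	struct res {
		uint64_t count;
		uint64_t hardlimit;
		uint64_t softlimit;
	};

	/*
	 * A hard limit above fs capacity is merely suspicious (an admin may
	 * have set it before shrinking), but soft > hard is self-contradictory
	 * and therefore corruption.
	 */
	static void check_res(const struct res *r, uint64_t fs_max,
			      bool *warn, bool *corrupt)
	{
		if (r->hardlimit > fs_max)
			*warn = true;
		if (r->softlimit > r->hardlimit)
			*corrupt = true;
	}

	int main(void)
	{
		struct res blk = { .count = 10, .hardlimit = 100, .softlimit = 150 };
		bool warn = false, corrupt = false;

		check_res(&blk, 1000, &warn, &corrupt);
		printf("warn=%d corrupt=%d\n", warn, corrupt); /* warn=0 corrupt=1 */
		return 0;
	}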
*/ - if (id != 0 && bhard != 0 && bcount > bhard) + if (dq->q_id == 0) + goto out; + + if (dq->q_blk.hardlimit != 0 && + dq->q_blk.count > dq->q_blk.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (id != 0 && ihard != 0 && icount > ihard) + + if (dq->q_ino.hardlimit != 0 && + dq->q_ino.count > dq->q_ino.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (id != 0 && rhard != 0 && rcount > rhard) + + if (dq->q_rtb.hardlimit != 0 && + dq->q_rtb.count > dq->q_rtb.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); +out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return -EFSCORRUPTED; @@ -235,7 +214,7 @@ xchk_quota( struct xchk_quota_info sqi; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; - uint dqtype; + xfs_dqtype_t dqtype; int error = 0; dqtype = xchk_quota_to_dqtype(sc); diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 0cab11a5d390..beaeb6fa3119 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -336,7 +336,7 @@ xchk_refcountbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; xfs_agblock_t *cow_blocks = bs->private; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t bno; xfs_extlen_t len; xfs_nlink_t refcount; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index e489d7a8446a..25e86c71e7b9 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -208,8 +208,10 @@ xrep_calc_ag_resblks( /* Now grab the block counters from the AGF. */ error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp); if (!error) { - aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length); - freelen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_freeblks); + struct xfs_agf *agf = bp->b_addr; + + aglen = be32_to_cpu(agf->agf_length); + freelen = be32_to_cpu(agf->agf_freeblks); usedlen = aglen - freelen; xfs_buf_relse(bp); } @@ -434,10 +436,10 @@ xrep_init_btblock( int xrep_invalidate_blocks( struct xfs_scrub *sc, - struct xfs_bitmap *bitmap) + struct xbitmap *bitmap) { - struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; + struct xbitmap_range *bmr; + struct xbitmap_range *n; struct xfs_buf *bp; xfs_fsblock_t fsbno; @@ -449,7 +451,7 @@ xrep_invalidate_blocks( * because we never own those; and if we can't TRYLOCK the buffer we * assume it's owned by someone else. 
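Throughout the series, casting macros like XFS_BUF_TO_AGF() give way to reading the header straight from bp->b_addr, relying on C's implicit void pointer conversion. A trivial illustration of the pattern with made-up miniature types:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct buf { void *b_addr; };         /* made-up miniature of xfs_buf */
	struct agf { uint32_t agf_length; };  /* made-up on-disk header */

	int main(void)
	{
		struct buf bp = { .b_addr = calloc(1, sizeof(struct agf)) };
		struct agf *agf = bp.b_addr; /* was: XFS_BUF_TO_AGF(bp) */

		if (!agf)
			return 1;
		agf->agf_length = 42; /* header lives at the buffer start */
		printf("%u\n", agf->agf_length);
		free(bp.b_addr);
		return 0;
	}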
*/ - for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) { + for_each_xbitmap_block(fsbno, bmr, n, bitmap) { /* Skip AG headers and post-EOFS blocks */ if (!xfs_verify_fsbno(sc->mp, fsbno)) continue; @@ -595,18 +597,18 @@ out_free: int xrep_reap_extents( struct xfs_scrub *sc, - struct xfs_bitmap *bitmap, + struct xbitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type) { - struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; + struct xbitmap_range *bmr; + struct xbitmap_range *n; xfs_fsblock_t fsbno; int error = 0; ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb)); - for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) { + for_each_xbitmap_block(fsbno, bmr, n, bitmap) { ASSERT(sc->ip != NULL || XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.agno); trace_xrep_dispose_btree_extent(sc->mp, @@ -615,11 +617,9 @@ xrep_reap_extents( error = xrep_reap_block(sc, fsbno, oinfo, type); if (error) - goto out; + break; } -out: - xfs_bitmap_destroy(bitmap); return error; } @@ -879,7 +879,7 @@ xrep_find_ag_btree_roots( ri.sc = sc; ri.btree_info = btree_info; - ri.agf = XFS_BUF_TO_AGF(agf_bp); + ri.agf = agf_bp->b_addr; ri.agfl_bp = agfl_bp; for (fab = btree_info; fab->buf_ops; fab++) { ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG); @@ -899,11 +899,11 @@ xrep_find_ag_btree_roots( void xrep_force_quotacheck( struct xfs_scrub *sc, - uint dqtype) + xfs_dqtype_t type) { uint flag; - flag = xfs_quota_chkd_flag(dqtype); + flag = xfs_quota_chkd_flag(type); if (!(flag & sc->mp->m_qflags)) return; @@ -939,11 +939,11 @@ xrep_ino_dqattach( "inode %llu repair encountered quota error %d, quotacheck forced.", (unsigned long long)sc->ip->i_ino, error); if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot) - xrep_force_quotacheck(sc, XFS_DQ_USER); + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot) - xrep_force_quotacheck(sc, XFS_DQ_GROUP); + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot) - xrep_force_quotacheck(sc, XFS_DQ_PROJ); + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); /* fall through */ case -ESRCH: error = 0; diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index c3422403b169..fe77de01abe0 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -6,6 +6,8 @@ #ifndef __XFS_SCRUB_REPAIR_H__ #define __XFS_SCRUB_REPAIR_H__ +#include "xfs_quota_defs.h" + static inline int xrep_notsupported(struct xfs_scrub *sc) { return -EOPNOTSUPP; @@ -28,11 +30,11 @@ int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb, struct xfs_buf **bpp, xfs_btnum_t btnum, const struct xfs_buf_ops *ops); -struct xfs_bitmap; +struct xbitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); -int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xfs_bitmap *btlist); -int xrep_reap_extents(struct xfs_scrub *sc, struct xfs_bitmap *exlist, +int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist); +int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); struct xrep_find_ag_btree { @@ -49,7 +51,7 @@ struct xrep_find_ag_btree { int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); -void xrep_force_quotacheck(struct xfs_scrub *sc, uint dqtype); +void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); /* Metadata repairers */ diff --git a/fs/xfs/scrub/rmap.c 
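xrep_reap_extents() no longer destroys the bitmap it consumes; the caller that built it frees it on success and failure alike, which is why xrep_agfl() above now falls through to a single xbitmap_destroy(). A small ownership sketch, illustrative only:

	#include <stdlib.h>

	struct bitmap { int *data; }; /* toy stand-in for struct xbitmap */

	static int reap_extents(struct bitmap *bm)
	{
		(void)bm; /* dispose of each extent; may fail partway through */
		return 0;
	}

	static int repair(void)
	{
		struct bitmap bm = { .data = calloc(16, sizeof(int)) };
		int error;

		if (!bm.data)
			return -1;
		error = reap_extents(&bm);
		free(bm.data); /* single owner: freed on success and error */
		return error;
	}

	int main(void)
	{
		return repair() ? 1 : 0;
	}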
b/fs/xfs/scrub/rmap.c index 8d4cefd761c1..f4fcb4719f41 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -92,7 +92,7 @@ xchk_rmapbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_rmap_irec irec; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; bool non_inode; bool is_unwritten; bool is_bmbt; diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index c642bc206c41..76e4ffe0315b 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -13,6 +13,7 @@ #include "xfs_trans.h" #include "xfs_rtalloc.h" #include "xfs_inode.h" +#include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -58,6 +59,41 @@ xchk_rtbitmap_rec( return 0; } +/* Make sure the entire rtbitmap file is mapped with written extents. */ +STATIC int +xchk_rtbitmap_check_extents( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_bmbt_irec map; + xfs_rtblock_t off; + int nmap; + int error = 0; + + for (off = 0; off < mp->m_sb.sb_rbmblocks;) { + if (xchk_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + break; + + /* Make sure we have a written extent. */ + nmap = 1; + error = xfs_bmapi_read(mp->m_rbmip, off, + mp->m_sb.sb_rbmblocks - off, &map, &nmap, + XFS_DATA_FORK); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) + break; + + if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + break; + } + + off += map.br_blockcount; + } + + return error; +} + /* Scrub the realtime bitmap. */ int xchk_rtbitmap( @@ -65,11 +101,22 @@ xchk_rtbitmap( { int error; + /* Is the size of the rtbitmap correct? */ + if (sc->mp->m_rbmip->i_d.di_size != + XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) { + xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino); + return 0; + } + /* Invoke the fork scrubber. */ error = xchk_metadata_inode_forks(sc); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; + error = xchk_rtbitmap_check_extents(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index f1775bb19313..8ebf35b115ce 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -168,6 +168,7 @@ xchk_teardown( xfs_irele(sc->ip); sc->ip = NULL; } + sb_end_write(sc->mp->m_super); if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { @@ -490,6 +491,14 @@ xfs_scrub_metadata( sc.ops = &meta_scrub_ops[sm->sm_type]; sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); retry_op: + /* + * If freeze runs concurrently with a scrub, the freeze can be delayed + * indefinitely as we walk the filesystem and iterate over metadata + * buffers. Freeze quiesces the log (which waits for the buffer LRU to + * be emptied) and that won't happen while checking is running. + */ + sb_start_write(mp->m_super); + /* Set up for the operation. 
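xchk_rtbitmap_check_extents() walks the bitmap file from block zero, demanding that each lookup return a written mapping starting exactly at the current offset until all sb_rbmblocks are covered. A userspace model of that coverage check over a precomputed extent table, with names and types assumed:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct ext {
		uint64_t off;	/* file offset, in blocks */
		uint64_t count;	/* length, in blocks */
		bool written;	/* neither a hole nor unwritten */
	};

	/*
	 * Advance by br_blockcount per lookup; a gap, hole or unwritten
	 * extent anywhere in [0, nblocks) is corruption.
	 */
	static bool covers_written(const struct ext *map, int n, uint64_t nblocks)
	{
		uint64_t off = 0;
		int i;

		for (i = 0; i < n && off < nblocks; i++) {
			if (map[i].off != off || !map[i].written)
				return false;
			off += map[i].count;
		}
		return off >= nblocks;
	}

	int main(void)
	{
		struct ext map[] = { {0, 4, true}, {4, 4, true} };

		printf("%s\n", covers_written(map, 2, 8) ? "clean" : "corrupt");
		return 0;
	}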
*/ error = sc.ops->setup(&sc, ip); if (error) diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 9eaab2eb5ed3..2c6c248be823 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -24,9 +24,9 @@ xchk_btree_cur_fsbno( return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn); else if (level == cur->bc_nlevels - 1 && cur->bc_flags & XFS_BTREE_LONG_PTRS) - return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_private.b.ip->i_ino); + return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) - return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, 0); + return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, 0); return NULLFSBLOCK; } diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 096203119934..e46f5cef90da 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -379,7 +379,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->ino = sc->ip->i_ino; - __entry->whichfork = cur->bc_private.b.whichfork; + __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; __entry->btnum = cur->bc_btnum; __entry->level = level; @@ -459,7 +459,7 @@ TRACE_EVENT(xchk_ifork_btree_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->ino = sc->ip->i_ino; - __entry->whichfork = cur->bc_private.b.whichfork; + __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; __entry->btnum = cur->bc_btnum; __entry->level = level; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index cd743fad8478..d4c687b5cd06 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -14,6 +14,8 @@ #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_acl.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include <linux/posix_acl_xattr.h> @@ -67,10 +69,12 @@ xfs_acl_from_disk( switch (acl_e->e_tag) { case ACL_USER: - acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id)); + acl_e->e_uid = make_kuid(&init_user_ns, + be32_to_cpu(ace->ae_id)); break; case ACL_GROUP: - acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id)); + acl_e->e_gid = make_kgid(&init_user_ns, + be32_to_cpu(ace->ae_id)); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: @@ -103,10 +107,12 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) ace->ae_tag = cpu_to_be32(acl_e->e_tag); switch (acl_e->e_tag) { case ACL_USER: - ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid)); + ace->ae_id = cpu_to_be32( + from_kuid(&init_user_ns, acl_e->e_uid)); break; case ACL_GROUP: - ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid)); + ace->ae_id = cpu_to_be32( + from_kgid(&init_user_ns, acl_e->e_gid)); break; default: ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID); @@ -120,102 +126,86 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) struct posix_acl * xfs_get_acl(struct inode *inode, int type) { - struct xfs_inode *ip = XFS_I(inode); - struct posix_acl *acl = NULL; - struct xfs_acl *xfs_acl = NULL; - unsigned char *ea_name; - int error; - int len; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct posix_acl *acl = NULL; + struct xfs_da_args args = { + .dp = ip, + .attr_filter = XFS_ATTR_ROOT, + .valuelen = XFS_ACL_MAX_SIZE(mp), + }; + int error; trace_xfs_get_acl(ip); switch (type) { case ACL_TYPE_ACCESS: - ea_name = SGI_ACL_FILE; + args.name = SGI_ACL_FILE; break; case ACL_TYPE_DEFAULT: - ea_name = SGI_ACL_DEFAULT; + args.name = 
SGI_ACL_DEFAULT; break; default: BUG(); } + args.namelen = strlen(args.name); /* - * If we have a cached ACLs value just return it, not need to - * go out to the disk. + * If the attribute doesn't exist make sure we have a negative cache + * entry, for any other error assume it is transient. */ - len = XFS_ACL_MAX_SIZE(ip->i_mount); - error = xfs_attr_get(ip, ea_name, strlen(ea_name), - (unsigned char **)&xfs_acl, &len, - ATTR_ALLOC | ATTR_ROOT); - if (error) { - /* - * If the attribute doesn't exist make sure we have a negative - * cache entry, for any other error assume it is transient. - */ - if (error != -ENOATTR) - acl = ERR_PTR(error); - } else { - acl = xfs_acl_from_disk(ip->i_mount, xfs_acl, len, - XFS_ACL_MAX_ENTRIES(ip->i_mount)); - kmem_free(xfs_acl); + error = xfs_attr_get(&args); + if (!error) { + acl = xfs_acl_from_disk(mp, args.value, args.valuelen, + XFS_ACL_MAX_ENTRIES(mp)); + } else if (error != -ENOATTR) { + acl = ERR_PTR(error); } + + kmem_free(args.value); return acl; } int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct xfs_inode *ip = XFS_I(inode); - unsigned char *ea_name; - int error; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_da_args args = { + .dp = ip, + .attr_filter = XFS_ATTR_ROOT, + }; + int error; switch (type) { case ACL_TYPE_ACCESS: - ea_name = SGI_ACL_FILE; + args.name = SGI_ACL_FILE; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; - ea_name = SGI_ACL_DEFAULT; + args.name = SGI_ACL_DEFAULT; break; default: return -EINVAL; } + args.namelen = strlen(args.name); if (acl) { - struct xfs_acl *xfs_acl; - int len = XFS_ACL_MAX_SIZE(ip->i_mount); - - xfs_acl = kmem_zalloc_large(len, 0); - if (!xfs_acl) + args.valuelen = XFS_ACL_SIZE(acl->a_count); + args.value = kmem_zalloc_large(args.valuelen, 0); + if (!args.value) return -ENOMEM; - - xfs_acl_to_disk(xfs_acl, acl); - - /* subtract away the unused acl entries */ - len -= sizeof(struct xfs_acl_entry) * - (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); - - error = xfs_attr_set(ip, ea_name, strlen(ea_name), - (unsigned char *)xfs_acl, len, ATTR_ROOT); - - kmem_free(xfs_acl); - } else { - /* - * A NULL ACL argument means we want to remove the ACL. - */ - error = xfs_attr_remove(ip, ea_name, - strlen(ea_name), - ATTR_ROOT); - - /* - * If the attribute didn't exist to start with that's fine. - */ - if (error == -ENOATTR) - error = 0; + xfs_acl_to_disk(args.value, acl); } + error = xfs_attr_set(&args); + kmem_free(args.value); + + /* + * If the attribute didn't exist to start with that's fine. + */ + if (!acl && error == -ENOATTR) + error = 0; if (!error) set_cached_acl(inode, type, acl); return error; @@ -275,3 +265,19 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; } + +/* + * Invalidate any cached ACLs if the user has bypassed the ACL interface. + * We don't validate the content whatsoever so it is caller responsibility to + * provide data in valid format and ensure i_mode is consistent. 
+ */ +void +xfs_forget_acl( + struct inode *inode, + const char *name) +{ + if (!strcmp(name, SGI_ACL_FILE)) + forget_cached_acl(inode, ACL_TYPE_ACCESS); + else if (!strcmp(name, SGI_ACL_DEFAULT)) + forget_cached_acl(inode, ACL_TYPE_DEFAULT); +} diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 94615e34bc86..c042c0868016 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -13,14 +13,16 @@ struct posix_acl; extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +void xfs_forget_acl(struct inode *inode, const char *name); #else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { return NULL; } # define xfs_set_acl NULL +static inline void xfs_forget_acl(struct inode *inode, const char *name) +{ +} #endif /* CONFIG_XFS_POSIX_ACL */ -extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags); - #endif /* __XFS_ACL_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 58e937be24ce..b35611882ff9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -382,7 +382,7 @@ xfs_map_blocks( */ retry: xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); /* @@ -539,7 +539,7 @@ xfs_discard_page( if (XFS_FORCED_SHUTDOWN(mp)) goto out_invalidate; - xfs_alert(mp, + xfs_alert_ratelimited(mp, "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", page, ip->i_ino, offset); @@ -621,14 +621,11 @@ xfs_vm_readpage( return iomap_readpage(page, &xfs_read_iomap_ops); } -STATIC int -xfs_vm_readpages( - struct file *unused, - struct address_space *mapping, - struct list_head *pages, - unsigned nr_pages) +STATIC void +xfs_vm_readahead( + struct readahead_control *rac) { - return iomap_readpages(mapping, pages, nr_pages, &xfs_read_iomap_ops); + iomap_readahead(rac, &xfs_read_iomap_ops); } static int @@ -644,7 +641,7 @@ xfs_iomap_swapfile_activate( const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, - .readpages = xfs_vm_readpages, + .readahead = xfs_vm_readahead, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, .set_page_dirty = iomap_set_page_dirty, diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index bbfa6ba84dcd..bfad669e6b2f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -145,8 +145,8 @@ xfs_attr3_node_inactive( * Since this code is recursive (gasp!) we must protect ourselves. */ if (level > XFS_DA_NODE_MAXDEPTH) { + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - xfs_buf_corruption_error(bp); return -EFSCORRUPTED; } @@ -194,7 +194,7 @@ xfs_attr3_node_inactive( error = xfs_attr3_leaf_inactive(trans, dp, child_bp); break; default: - xfs_buf_corruption_error(child_bp); + xfs_buf_mark_corrupt(child_bp); xfs_trans_brelse(*trans, child_bp); error = -EFSCORRUPTED; break; @@ -289,7 +289,7 @@ xfs_attr3_root_inactive( break; default: error = -EFSCORRUPTED; - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); break; } @@ -367,7 +367,7 @@ xfs_attr_inactive( * removal below. 
*/ if (xfs_inode_hasattr(dp) && - dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { + dp->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out_cancel; @@ -388,8 +388,11 @@ out_cancel: xfs_trans_cancel(trans); out_destroy_fork: /* kill the in-core attr fork before we drop the inode lock */ - if (dp->i_afp) - xfs_idestroy_fork(dp, XFS_ATTR_FORK); + if (dp->i_afp) { + xfs_idestroy_fork(dp->i_afp); + kmem_cache_free(xfs_ifork_zone, dp->i_afp); + dp->i_afp = NULL; + } if (lock_mode) xfs_iunlock(dp, lock_mode); return error; diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index d37743bdf274..50f922cad91a 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -44,7 +44,7 @@ xfs_attr_shortform_compare(const void *a, const void *b) /* * Copy out entries of shortform attribute lists for attr_list(). * Shortform attribute lists are not stored in hashval sorted order. - * If the output buffer is not large enough to hold them all, then we + * If the output buffer is not large enough to hold them all, then * we have to calculate each entries' hashvalue and sort them before * we can begin returning them to the user. */ @@ -52,24 +52,19 @@ static int xfs_attr_shortform_list( struct xfs_attr_list_context *context) { - struct attrlist_cursor_kern *cursor; + struct xfs_attrlist_cursor_kern *cursor = &context->cursor; + struct xfs_inode *dp = context->dp; struct xfs_attr_sf_sort *sbuf, *sbp; struct xfs_attr_shortform *sf; struct xfs_attr_sf_entry *sfe; - struct xfs_inode *dp; int sbsize, nsbuf, count, i; int error = 0; - ASSERT(context != NULL); - dp = context->dp; - ASSERT(dp != NULL); ASSERT(dp->i_afp != NULL); sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) return 0; - cursor = context->cursor; - ASSERT(cursor != NULL); trace_xfs_attr_list_sf(context); @@ -205,7 +200,7 @@ out: STATIC int xfs_attr_node_list_lookup( struct xfs_attr_list_context *context, - struct attrlist_cursor_kern *cursor, + struct xfs_attrlist_cursor_kern *cursor, struct xfs_buf **pbp) { struct xfs_da3_icnode_hdr nodehdr; @@ -279,7 +274,7 @@ xfs_attr_node_list_lookup( return 0; out_corruptbuf: - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(tp, bp); return -EFSCORRUPTED; } @@ -288,8 +283,8 @@ STATIC int xfs_attr_node_list( struct xfs_attr_list_context *context) { + struct xfs_attrlist_cursor_kern *cursor = &context->cursor; struct xfs_attr3_icleaf_hdr leafhdr; - struct attrlist_cursor_kern *cursor; struct xfs_attr_leafblock *leaf; struct xfs_da_intnode *node; struct xfs_buf *bp; @@ -299,7 +294,6 @@ xfs_attr_node_list( trace_xfs_attr_node_list(context); - cursor = context->cursor; cursor->initted = 1; /* @@ -394,7 +388,7 @@ xfs_attr3_leaf_list_int( struct xfs_buf *bp, struct xfs_attr_list_context *context) { - struct attrlist_cursor_kern *cursor; + struct xfs_attrlist_cursor_kern *cursor = &context->cursor; struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; @@ -408,7 +402,6 @@ xfs_attr3_leaf_list_int( xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); - cursor = context->cursor; cursor->initted = 1; /* @@ -452,8 +445,8 @@ xfs_attr3_leaf_list_int( } if ((entry->flags & XFS_ATTR_INCOMPLETE) && - !(context->flags & ATTR_INCOMPLETE)) - continue; /* skip incomplete entries */ + !context->allow_incomplete) + continue; if (entry->flags & XFS_ATTR_LOCAL) { xfs_attr_leaf_name_local_t *name_loc; 
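As the comment notes, shortform attribute entries are stored in insertion order, so a resumable hash-ordered listing must copy them into a scratch array and sort by hashval first, which is what the sbuf machinery does. A minimal qsort() sketch of that step, with a toy entry layout:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct sf_ent {
		char name[8];
		uint32_t hash;
	};

	/* Ascending hashval: the order a cursor-based listing hands back. */
	static int cmp_hash(const void *a, const void *b)
	{
		const struct sf_ent *ea = a, *eb = b;

		return (ea->hash > eb->hash) - (ea->hash < eb->hash);
	}

	int main(void)
	{
		struct sf_ent ents[] = {
			{ "beta",  0x30 }, { "alpha", 0x10 }, { "gamma", 0x20 },
		};
		int i;

		/* Sort a copy so the cursor can resume at any hashval. */
		qsort(ents, 3, sizeof(ents[0]), cmp_hash);
		for (i = 0; i < 3; i++)
			printf("%08x %s\n", ents[i].hash, ents[i].name);
		return 0;
	}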
@@ -488,14 +481,15 @@ xfs_attr3_leaf_list_int( * Copy out attribute entries for attr_list(), for leaf attribute lists. */ STATIC int -xfs_attr_leaf_list(xfs_attr_list_context_t *context) +xfs_attr_leaf_list( + struct xfs_attr_list_context *context) { - int error; - struct xfs_buf *bp; + struct xfs_buf *bp; + int error; trace_xfs_attr_leaf_list(context); - context->cursor->blkno = 0; + context->cursor.blkno = 0; error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp); if (error) return error; @@ -506,7 +500,7 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) } int -xfs_attr_list_int_ilocked( +xfs_attr_list_ilocked( struct xfs_attr_list_context *context) { struct xfs_inode *dp = context->dp; @@ -518,20 +512,20 @@ xfs_attr_list_int_ilocked( */ if (!xfs_inode_hasattr(dp)) return 0; - else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_list(context); - else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) return xfs_attr_leaf_list(context); return xfs_attr_node_list(context); } int -xfs_attr_list_int( - xfs_attr_list_context_t *context) +xfs_attr_list( + struct xfs_attr_list_context *context) { - int error; - xfs_inode_t *dp = context->dp; - uint lock_mode; + struct xfs_inode *dp = context->dp; + uint lock_mode; + int error; XFS_STATS_INC(dp->i_mount, xs_attr_list); @@ -539,130 +533,7 @@ xfs_attr_list_int( return -EIO; lock_mode = xfs_ilock_attr_map_shared(dp); - error = xfs_attr_list_int_ilocked(context); + error = xfs_attr_list_ilocked(context); xfs_iunlock(dp, lock_mode); return error; } - -#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ - (((struct attrlist_ent *) 0)->a_name - (char *) 0) -#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ - ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(uint32_t)-1) \ - & ~(sizeof(uint32_t)-1)) - -/* - * Format an attribute and copy it out to the user's buffer. - * Take care to check values and protect against them changing later, - * we may be reading them directly out of a user buffer. - */ -STATIC void -xfs_attr_put_listent( - xfs_attr_list_context_t *context, - int flags, - unsigned char *name, - int namelen, - int valuelen) -{ - struct attrlist *alist = (struct attrlist *)context->alist; - attrlist_ent_t *aep; - int arraytop; - - ASSERT(!context->seen_enough); - ASSERT(!(context->flags & ATTR_KERNOVAL)); - ASSERT(context->count >= 0); - ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); - ASSERT(context->firstu >= sizeof(*alist)); - ASSERT(context->firstu <= context->bufsize); - - /* - * Only list entries in the right namespace. - */ - if (((context->flags & ATTR_SECURE) == 0) != - ((flags & XFS_ATTR_SECURE) == 0)) - return; - if (((context->flags & ATTR_ROOT) == 0) != - ((flags & XFS_ATTR_ROOT) == 0)) - return; - - arraytop = sizeof(*alist) + - context->count * sizeof(alist->al_offset[0]); - context->firstu -= ATTR_ENTSIZE(namelen); - if (context->firstu < arraytop) { - trace_xfs_attr_list_full(context); - alist->al_more = 1; - context->seen_enough = 1; - return; - } - - aep = (attrlist_ent_t *)&context->alist[context->firstu]; - aep->a_valuelen = valuelen; - memcpy(aep->a_name, name, namelen); - aep->a_name[namelen] = 0; - alist->al_offset[context->count++] = context->firstu; - alist->al_count = context->count; - trace_xfs_attr_list_add(context); - return; -} - -/* - * Generate a list of extended attribute names and optionally - * also value lengths. 
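xfs_attr_list_ilocked() now dispatches with early returns: local-format forks list straight from the inode, a single-block fork is leaf format, and anything larger is a node tree. A stand-in model of that dispatch, where one_block plays the role of xfs_bmap_one_block():

	#include <stdio.h>

	enum fork_format { FMT_LOCAL, FMT_EXTENTS, FMT_BTREE };

	static int list_shortform(void) { return 1; }
	static int list_leaf(void)      { return 2; }
	static int list_node(void)      { return 3; }

	/* Flat early returns instead of the old if/else-if chain. */
	static int attr_list(enum fork_format fmt, int one_block)
	{
		if (fmt == FMT_LOCAL)
			return list_shortform();
		if (one_block)
			return list_leaf();
		return list_node();
	}

	int main(void)
	{
		printf("%d\n", attr_list(FMT_EXTENTS, 1)); /* leaf path */
		return 0;
	}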
Positive return value follows the XFS - * convention of being an error, zero or negative return code - * is the length of the buffer returned (negated), indicating - * success. - */ -int -xfs_attr_list( - xfs_inode_t *dp, - char *buffer, - int bufsize, - int flags, - attrlist_cursor_kern_t *cursor) -{ - xfs_attr_list_context_t context; - struct attrlist *alist; - int error; - - /* - * Validate the cursor. - */ - if (cursor->pad1 || cursor->pad2) - return -EINVAL; - if ((cursor->initted == 0) && - (cursor->hashval || cursor->blkno || cursor->offset)) - return -EINVAL; - - /* Only internal consumers can retrieve incomplete attrs. */ - if (flags & ATTR_INCOMPLETE) - return -EINVAL; - - /* - * Check for a properly aligned buffer. - */ - if (((long)buffer) & (sizeof(int)-1)) - return -EFAULT; - if (flags & ATTR_KERNOVAL) - bufsize = 0; - - /* - * Initialize the output buffer. - */ - memset(&context, 0, sizeof(context)); - context.dp = dp; - context.cursor = cursor; - context.resynch = 1; - context.flags = flags; - context.alist = buffer; - context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ - context.firstu = context.bufsize; - context.put_listent = xfs_attr_put_listent; - - alist = (struct attrlist *)context.alist; - alist->al_count = 0; - alist->al_more = 0; - alist->al_offset[0] = context.bufsize; - - error = xfs_attr_list_int(&context); - ASSERT(error <= 0); - return error; -} diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index ee6f4229cebc..ec3691372e7c 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -22,16 +22,20 @@ #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" #include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" kmem_zone_t *xfs_bui_zone; kmem_zone_t *xfs_bud_zone; +static const struct xfs_item_ops xfs_bui_item_ops; + static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_bui_log_item, bui_item); } -void +STATIC void xfs_bui_item_free( struct xfs_bui_log_item *buip) { @@ -45,13 +49,13 @@ xfs_bui_item_free( * committed vs unpin operations in bulk insert operations. Hence the reference * count to ensure only the last caller frees the BUI. */ -void +STATIC void xfs_bui_release( struct xfs_bui_log_item *buip) { ASSERT(atomic_read(&buip->bui_refcount) > 0); if (atomic_dec_and_test(&buip->bui_refcount)) { - xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); xfs_bui_item_free(buip); } } @@ -124,24 +128,17 @@ xfs_bui_item_release( xfs_bui_release(BUI_ITEM(lip)); } -static const struct xfs_item_ops xfs_bui_item_ops = { - .iop_size = xfs_bui_item_size, - .iop_format = xfs_bui_item_format, - .iop_unpin = xfs_bui_item_unpin, - .iop_release = xfs_bui_item_release, -}; - /* * Allocate and initialize an bui item with the given number of extents. 
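The BUI carries two references, one for the logged intent and one for the done item (or the AIL during recovery), and xfs_bui_release() frees the item when the last one drops. The same lifetime expressed with C11 atomics, illustrative only:

	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct intent {
		atomic_int refcount;
		/* ... log item payload ... */
	};

	/* Whoever drops the count to zero frees, as in xfs_bui_release(). */
	static void intent_release(struct intent *ip)
	{
		if (atomic_fetch_sub(&ip->refcount, 1) == 1)
			free(ip);
	}

	int main(void)
	{
		struct intent *ip = malloc(sizeof(*ip));

		if (!ip)
			return 1;
		atomic_init(&ip->refcount, 2);
		intent_release(ip);	/* commit path drops one ref */
		intent_release(ip);	/* done/abort path drops the last */
		printf("released\n");
		return 0;
	}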
*/ -struct xfs_bui_log_item * +STATIC struct xfs_bui_log_item * xfs_bui_init( struct xfs_mount *mp) { struct xfs_bui_log_item *buip; - buip = kmem_zone_zalloc(xfs_bui_zone, 0); + buip = kmem_cache_zalloc(xfs_bui_zone, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; @@ -218,7 +215,7 @@ xfs_trans_get_bud( { struct xfs_bud_log_item *budp; - budp = kmem_zone_zalloc(xfs_bud_zone, 0); + budp = kmem_cache_zalloc(xfs_bud_zone, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops); budp->bud_buip = buip; @@ -278,27 +275,6 @@ xfs_bmap_update_diff_items( return ba->bi_owner->i_ino - bb->bi_owner->i_ino; } -/* Get an BUI. */ -STATIC void * -xfs_bmap_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_bui_log_item *buip; - - ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); - ASSERT(tp != NULL); - - buip = xfs_bui_init(tp->t_mountp); - ASSERT(buip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &buip->bui_item); - return buip; -} - /* Set the map extent flags for this mapping. */ static void xfs_trans_set_bmap_flags( @@ -326,16 +302,12 @@ xfs_trans_set_bmap_flags( STATIC void xfs_bmap_update_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_bui_log_item *buip, + struct xfs_bmap_intent *bmap) { - struct xfs_bui_log_item *buip = intent; - struct xfs_bmap_intent *bmap; uint next_extent; struct xfs_map_extent *map; - bmap = container_of(item, struct xfs_bmap_intent, bi_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); @@ -355,23 +327,44 @@ xfs_bmap_update_log_item( bmap->bi_bmap.br_state); } +static struct xfs_log_item * +xfs_bmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_bui_log_item *buip = xfs_bui_init(mp); + struct xfs_bmap_intent *bmap; + + ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); + + xfs_trans_add_item(tp, &buip->bui_item); + if (sort) + list_sort(mp, items, xfs_bmap_update_diff_items); + list_for_each_entry(bmap, items, bi_list) + xfs_bmap_update_log_item(tp, buip, bmap); + return &buip->bui_item; +} + /* Get an BUD so we can process all the deferred rmap updates. */ -STATIC void * +static struct xfs_log_item * xfs_bmap_update_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_bud(tp, intent); + return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item; } /* Process a deferred rmap update. */ STATIC int xfs_bmap_update_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_bmap_intent *bmap; xfs_filblks_t count; @@ -379,7 +372,7 @@ xfs_bmap_update_finish_item( bmap = container_of(item, struct xfs_bmap_intent, bi_list); count = bmap->bi_bmap.br_blockcount; - error = xfs_trans_log_finish_bmap_update(tp, done_item, + error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bmap->bi_type, bmap->bi_owner, bmap->bi_whichfork, bmap->bi_bmap.br_startoff, @@ -398,9 +391,9 @@ xfs_bmap_update_finish_item( /* Abort all pending BUIs. 
*/ STATIC void xfs_bmap_update_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_bui_release(intent); + xfs_bui_release(BUI_ITEM(intent)); } /* Cancel a deferred rmap update. */ @@ -416,10 +409,8 @@ xfs_bmap_update_cancel_item( const struct xfs_defer_op_type xfs_bmap_update_defer_type = { .max_items = XFS_BUI_MAX_FAST_EXTENTS, - .diff_items = xfs_bmap_update_diff_items, .create_intent = xfs_bmap_update_create_intent, .abort_intent = xfs_bmap_update_abort_intent, - .log_item = xfs_bmap_update_log_item, .create_done = xfs_bmap_update_create_done, .finish_item = xfs_bmap_update_finish_item, .cancel_item = xfs_bmap_update_cancel_item, @@ -429,32 +420,30 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = { * Process a bmap update intent item that was recovered from the log. * We need to update some inode's bmbt. */ -int -xfs_bui_recover( - struct xfs_trans *parent_tp, - struct xfs_bui_log_item *buip) +STATIC int +xfs_bui_item_recover( + struct xfs_log_item *lip, + struct xfs_trans *parent_tp) { - int error = 0; - unsigned int bui_type; + struct xfs_bmbt_irec irec; + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + struct xfs_trans *tp; + struct xfs_inode *ip = NULL; + struct xfs_mount *mp = parent_tp->t_mountp; struct xfs_map_extent *bmap; + struct xfs_bud_log_item *budp; xfs_fsblock_t startblock_fsb; xfs_fsblock_t inode_fsb; xfs_filblks_t count; - bool op_ok; - struct xfs_bud_log_item *budp; + xfs_exntst_t state; enum xfs_bmap_intent_type type; + bool op_ok; + unsigned int bui_type; int whichfork; - xfs_exntst_t state; - struct xfs_trans *tp; - struct xfs_inode *ip = NULL; - struct xfs_bmbt_irec irec; - struct xfs_mount *mp = parent_tp->t_mountp; - - ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); + int error = 0; /* Only one mapping operation per BUI... */ if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); return -EFSCORRUPTED; } @@ -488,7 +477,6 @@ xfs_bui_recover( * This will pull the BUI from the AIL and * free the memory associated with it. */ - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); return -EFSCORRUPTED; } @@ -546,7 +534,6 @@ xfs_bui_recover( xfs_bmap_unmap_extent(tp, ip, &irec); } - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_defer_move(parent_tp, tp); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -563,3 +550,121 @@ err_inode: } return error; } + +STATIC bool +xfs_bui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return BUI_ITEM(lip)->bui_format.bui_id == intent_id; +} + +static const struct xfs_item_ops xfs_bui_item_ops = { + .iop_size = xfs_bui_item_size, + .iop_format = xfs_bui_item_format, + .iop_unpin = xfs_bui_item_unpin, + .iop_release = xfs_bui_item_release, + .iop_recover = xfs_bui_item_recover, + .iop_match = xfs_bui_item_match, +}; + +/* + * Copy an BUI format buffer from the given buf, and into the destination + * BUI format structure. The BUI/BUD items were designed not to need any + * special alignment handling. 
+ */ +static int +xfs_bui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_bui_log_format *dst_bui_fmt) +{ + struct xfs_bui_log_format *src_bui_fmt; + uint len; + + src_bui_fmt = buf->i_addr; + len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); + + if (buf->i_len == len) { + memcpy(dst_bui_fmt, src_bui_fmt, len); + return 0; + } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent bmap update + * item from the bui format structure which was logged on disk. + * It allocates an in-core bui, copies the extents from the format + * structure into it, and adds the bui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_bui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_bui_log_item *buip; + struct xfs_bui_log_format *bui_formatp; + + bui_formatp = item->ri_buf[0].i_addr; + + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + buip = xfs_bui_init(mp); + error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); + if (error) { + xfs_bui_item_free(buip); + return error; + } + atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn); + xfs_bui_release(buip); + return 0; +} + +const struct xlog_recover_item_ops xlog_bui_item_ops = { + .item_type = XFS_LI_BUI, + .commit_pass2 = xlog_recover_bui_commit_pass2, +}; + +/* + * This routine is called when an BUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding BUI if it + * was still in the log. To do this it searches the AIL for the BUI with an id + * equal to that in the BUD format structure. If we find it we drop the BUD + * reference, which removes the BUI from the AIL and frees it. + */ +STATIC int +xlog_recover_bud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_bud_log_format *bud_formatp; + + bud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_BUI, bud_formatp->bud_bui_id); + return 0; +} + +const struct xlog_recover_item_ops xlog_bud_item_ops = { + .item_type = XFS_LI_BUD, + .commit_pass2 = xlog_recover_bud_commit_pass2, +}; diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index ad479cc73de8..b9be62f8bd52 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -33,11 +33,6 @@ struct kmem_zone; #define XFS_BUI_MAX_FAST_EXTENTS 1 /* - * Define BUI flag bits. Manipulated by set/clear/test_bit operators. - */ -#define XFS_BUI_RECOVERED 1 - -/* * This is the "bmap update intent" log item. It is used to log the fact that * some reverse mappings need to change. It is used in conjunction with the * "bmap update done" log item described below. 
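The pass-2 handlers above pair intent (BUI) and done (BUD) items by a shared id: recovery inserts each recovered BUI into the AIL, and a later BUD cancels it through xlog_recover_release_intent(), which finds the intent via ->iop_match. A toy model of that id-based lookup, using illustrative types rather than the kernel's structures:

    #include <stdint.h>
    #include <stdio.h>

    /* A done item carries the id of the intent it completes; recovery
     * walks the outstanding intents and releases the matching one. */
    struct intent {
        uint64_t id;
        struct intent *next;
    };

    static struct intent *find_intent(struct intent *head, uint64_t intent_id)
    {
        for (struct intent *it = head; it; it = it->next)
            if (it->id == intent_id)
                return it;      /* caller drops its reference */
        return NULL;            /* intent was already completed */
    }

    int main(void)
    {
        struct intent b = { .id = 7, .next = 0 };
        struct intent a = { .id = 3, .next = &b };

        printf("%s\n", find_intent(&a, 7) ? "found" : "missing");
        return 0;
    }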
@@ -49,7 +44,6 @@ struct xfs_bui_log_item { struct xfs_log_item bui_item; atomic_t bui_refcount; atomic_t bui_next_extent; - unsigned long bui_flags; /* misc flags */ struct xfs_bui_log_format bui_format; }; @@ -74,9 +68,4 @@ struct xfs_bud_log_item { extern struct kmem_zone *xfs_bui_zone; extern struct kmem_zone *xfs_bud_zone; -struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *); -void xfs_bui_item_free(struct xfs_bui_log_item *); -void xfs_bui_release(struct xfs_bui_log_item *); -int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip); - #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index e62fb5216341..73cafc843cd7 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -126,7 +126,7 @@ xfs_bmap_rtalloc( * pick an extent that will space things out in the rt area. */ if (ap->eof && ap->offset == 0) { - xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ + xfs_rtblock_t rtx; /* realtime extent no */ error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); if (error) @@ -223,7 +223,7 @@ xfs_bmap_count_blocks( if (!ifp) return 0; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_BTREE: if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); @@ -449,7 +449,7 @@ xfs_getbmap( break; } - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: break; @@ -1062,7 +1062,6 @@ xfs_collapse_file_space( int error; xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len); xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); - uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); bool done = false; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); @@ -1078,32 +1077,34 @@ xfs_collapse_file_space( if (error) return error; - while (!error && !done) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, - &tp); - if (error) - break; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); + if (error) + return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, - ip->i_gdquot, ip->i_pdquot, resblks, 0, - XFS_QMOPT_RES_REGBLKS); - if (error) - goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + while (!done) { error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb, &done); if (error) goto out_trans_cancel; + if (done) + break; - error = xfs_trans_commit(tp); + /* finish any deferred frees and roll the transaction */ + error = xfs_defer_finish(&tp); + if (error) + goto out_trans_cancel; } + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1146,35 +1147,41 @@ xfs_insert_file_space( if (error) return error; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + /* * The extent shifting code works on extent granularity. So, if stop_fsb * is not the starting block of extent, we need to split the extent at * stop_fsb. 
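The xfs_collapse_file_space() rework in the hunk above replaces an allocate/commit transaction per extent shift with a single transaction that is rolled between shifts via xfs_defer_finish(). The control flow reduces to the loop sketched below; the helper functions are stand-ins for the kernel calls, not real APIs.

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for one pass of extent shifting; reports completion. */
    static int shift_some_extents(bool *done)
    {
        static int passes = 3;

        *done = (--passes == 0);
        return 0;
    }

    /* Stand-in for finishing deferred work and rolling the transaction. */
    static int roll_transaction(void) { return 0; }

    int main(void)
    {
        bool done = false;
        int error = 0;

        /* trans alloc + ilock + ijoin happen once, up front */
        while (!done) {
            error = shift_some_extents(&done);
            if (error || done)
                break;
            error = roll_transaction();  /* keeps log space bounded */
            if (error)
                break;
        }
        /* commit on success, cancel on error; unlock either way */
        printf("finished, error=%d\n", error);
        return error;
    }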
*/ - error = xfs_bmap_split_extent(ip, stop_fsb); + error = xfs_bmap_split_extent(tp, ip, stop_fsb); if (error) - return error; + goto out_trans_cancel; - while (!error && !done) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, - &tp); + do { + error = xfs_trans_roll_inode(&tp, ip); if (error) - break; + goto out_trans_cancel; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb, &done, stop_fsb); if (error) goto out_trans_cancel; + } while (!done); - error = xfs_trans_commit(tp); - } - + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1203,17 +1210,26 @@ xfs_swap_extents_check_format( struct xfs_inode *ip, /* target inode */ struct xfs_inode *tip) /* tmp inode */ { + struct xfs_ifork *ifp = &ip->i_df; + struct xfs_ifork *tifp = &tip->i_df; + + /* User/group/project quota ids must match if quotas are enforced. */ + if (XFS_IS_QUOTA_ON(ip->i_mount) && + (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) || + !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) || + ip->i_d.di_projid != tip->i_d.di_projid)) + return -EINVAL; /* Should never get a local format */ - if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || - tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) + if (ifp->if_format == XFS_DINODE_FMT_LOCAL || + tifp->if_format == XFS_DINODE_FMT_LOCAL) return -EINVAL; /* * if the target inode has less extents that then temporary inode then * why did userspace call us? */ - if (ip->i_d.di_nextents < tip->i_d.di_nextents) + if (ifp->if_nextents < tifp->if_nextents) return -EINVAL; /* @@ -1228,20 +1244,18 @@ xfs_swap_extents_check_format( * form then we will end up with the target inode in the wrong format * as we already know there are less extents in the temp inode. */ - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && + tifp->if_format == XFS_DINODE_FMT_BTREE) return -EINVAL; /* Check temp in extent form to max in target */ - if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + if (tifp->if_format == XFS_DINODE_FMT_EXTENTS && + tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return -EINVAL; /* Check target in extent form to max in temp */ - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && + ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return -EINVAL; /* @@ -1253,22 +1267,20 @@ xfs_swap_extents_check_format( * (a common defrag case) which will occur when the temp inode is in * extent format... 
*/ - if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (tifp->if_format == XFS_DINODE_FMT_BTREE) { if (XFS_IFORK_Q(ip) && - XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) + XFS_BMAP_BMDR_SPACE(tifp->if_broot) > XFS_IFORK_BOFF(ip)) return -EINVAL; - if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return -EINVAL; } /* Reciprocal target->temp btree format checks */ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { if (XFS_IFORK_Q(tip) && XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) return -EINVAL; - if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return -EINVAL; } @@ -1420,15 +1432,15 @@ xfs_swap_extent_forks( /* * Count the number of extended attribute blocks */ - if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && - (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + if (XFS_IFORK_Q(ip) && ip->i_afp->if_nextents > 0 && + ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk, &aforkblks); if (error) return error; } - if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && - (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + if (XFS_IFORK_Q(tip) && tip->i_afp->if_nextents > 0 && + tip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk, &taforkblks); if (error) @@ -1442,12 +1454,12 @@ xfs_swap_extent_forks( * event of a crash. Set the owner change log flags now and leave the * bmbt scan as the last step. */ - if (ip->i_d.di_version == 3 && - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) - (*target_log_flags) |= XFS_ILOG_DOWNER; - if (tip->i_d.di_version == 3 && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) - (*src_log_flags) |= XFS_ILOG_DOWNER; + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE) + (*target_log_flags) |= XFS_ILOG_DOWNER; + if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE) + (*src_log_flags) |= XFS_ILOG_DOWNER; + } /* * Swap the data forks of the inodes @@ -1461,9 +1473,6 @@ xfs_swap_extent_forks( ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; - swap(ip->i_d.di_nextents, tip->i_d.di_nextents); - swap(ip->i_d.di_format, tip->i_d.di_format); - /* * The extents in the source inode could still contain speculative * preallocation beyond EOF (e.g. 
the file is open but not modified @@ -1477,24 +1486,24 @@ xfs_swap_extent_forks( tip->i_delayed_blks = ip->i_delayed_blks; ip->i_delayed_blks = 0; - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - ASSERT(ip->i_d.di_version < 3 || + ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || (*src_log_flags & XFS_ILOG_DOWNER)); (*src_log_flags) |= XFS_ILOG_DBROOT; break; } - switch (tip->i_d.di_format) { + switch (tip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: (*target_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: (*target_log_flags) |= XFS_ILOG_DBROOT; - ASSERT(tip->i_d.di_version < 3 || + ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || (*target_log_flags & XFS_ILOG_DOWNER)); break; } @@ -1558,6 +1567,7 @@ xfs_swap_extents( int lock_flags; uint64_t f; int resblks = 0; + unsigned int flags = 0; /* * Lock the inodes against other IO, page faults and truncate to @@ -1599,7 +1609,7 @@ xfs_swap_extents( if (xfs_inode_has_cow_data(tip)) { error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true); if (error) - return error; + goto out_unlock; } /* @@ -1608,9 +1618,9 @@ xfs_swap_extents( * performed with log redo items! */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { - int w = XFS_DATA_FORK; - uint32_t ipnext = XFS_IFORK_NEXTENTS(ip, w); - uint32_t tipnext = XFS_IFORK_NEXTENTS(tip, w); + int w = XFS_DATA_FORK; + uint32_t ipnext = ip->i_df.if_nextents; + uint32_t tipnext = tip->i_df.if_nextents; /* * Conceptually this shouldn't affect the shape of either bmbt, @@ -1621,17 +1631,16 @@ xfs_swap_extents( resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w); /* - * Handle the corner case where either inode might straddle the - * btree format boundary. If so, the inode could bounce between - * btree <-> extent format on unmap -> remap cycles, freeing and - * allocating a bmapbt block each time. + * If either inode straddles a bmapbt block allocation boundary, + * the rmapbt algorithm triggers repeated allocs and frees as + * extents are remapped. This can exhaust the block reservation + * prematurely and cause shutdown. Return freed blocks to the + * transaction reservation to counter this behavior. */ - if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1)) - resblks += XFS_IFORK_MAXEXT(ip, w); - if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1)) - resblks += XFS_IFORK_MAXEXT(tip, w); + flags |= XFS_TRANS_RES_FDBLKS; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags, + &tp); if (error) goto out_unlock; @@ -1710,10 +1719,11 @@ xfs_swap_extents( /* Swap the cow forks. 
*/ if (xfs_sb_version_hasreflink(&mp->m_sb)) { - ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS); - ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS); + ASSERT(!ip->i_cowfp || + ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS); + ASSERT(!tip->i_cowfp || + tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS); - swap(ip->i_cnextents, tip->i_cnextents); swap(ip->i_cowfp, tip->i_cowfp); if (ip->i_cowfp && ip->i_cowfp->if_bytes) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 217e4f82a44a..d4cdcb6fb2fe 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -14,6 +14,9 @@ #include "xfs_mount.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_log_recover.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" #include "xfs_errortag.h" #include "xfs_error.h" @@ -211,9 +214,7 @@ _xfs_buf_alloc( int i; *bpp = NULL; - bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); - if (unlikely(!bp)) - return -ENOMEM; + bp = kmem_cache_zalloc(xfs_buf_zone, GFP_NOFS | __GFP_NOFAIL); /* * We don't want certain flags to appear in b_flags unless they are @@ -327,6 +328,9 @@ xfs_buf_free( __free_page(page); } + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += + bp->b_page_count; } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); @@ -474,7 +478,7 @@ _xfs_buf_map_pages( nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); + -1); if (bp->b_addr) break; vm_unmap_aliases(); @@ -652,7 +656,6 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - ASSERT(bp->b_iodone == NULL); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; bp->b_ops = NULL; } @@ -727,8 +730,9 @@ found: if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { - xfs_warn(target->bt_mount, - "%s: failed to map pages\n", __func__); + xfs_warn_ratelimited(target->bt_mount, + "%s: failed to map %u pages", __func__, + bp->b_page_count); xfs_buf_relse(bp); return error; } @@ -1187,21 +1191,39 @@ xfs_buf_ioend( if (!bp->b_error && bp->b_io_error) xfs_buf_ioerror(bp, bp->b_io_error); - /* Only validate buffers that were read without errors */ - if (read && !bp->b_error && bp->b_ops) { - ASSERT(!bp->b_iodone); - bp->b_ops->verify_read(bp); + if (read) { + if (!bp->b_error && bp->b_ops) + bp->b_ops->verify_read(bp); + if (!bp->b_error) + bp->b_flags |= XBF_DONE; + xfs_buf_ioend_finish(bp); + return; } - if (!bp->b_error) + if (!bp->b_error) { + bp->b_flags &= ~XBF_WRITE_FAIL; bp->b_flags |= XBF_DONE; + } - if (bp->b_iodone) - (*(bp->b_iodone))(bp); - else if (bp->b_flags & XBF_ASYNC) - xfs_buf_relse(bp); - else - complete(&bp->b_iowait); + /* + * If this is a log recovery buffer, we aren't doing transactional IO + * yet so we need to let it handle IO completions.
+ */ + if (bp->b_flags & _XBF_LOGRECOVERY) { + xlog_recover_iodone(bp); + return; + } + + if (bp->b_flags & _XBF_INODES) { + xfs_buf_inode_iodone(bp); + return; + } + + if (bp->b_flags & _XBF_DQUOTS) { + xfs_buf_dquot_iodone(bp); + return; + } + xfs_buf_iodone(bp); } static void @@ -1238,10 +1260,26 @@ xfs_buf_ioerror_alert( struct xfs_buf *bp, xfs_failaddr_t func) { - xfs_alert(bp->b_mount, -"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", - func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, - -bp->b_error); + xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error", + "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", + func, (uint64_t)XFS_BUF_ADDR(bp), + bp->b_length, -bp->b_error); +} + +/* + * To simulate an I/O failure, the buffer must be locked and held with at least + * three references. The LRU reference is dropped by the stale call. The buf + * item reference is dropped via ioend processing. The third reference is owned + * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC. + */ +void +xfs_buf_ioend_fail( + struct xfs_buf *bp) +{ + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_ioend(bp); } int @@ -1254,7 +1292,7 @@ xfs_bwrite( bp->b_flags |= XBF_WRITE; bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | - XBF_WRITE_FAIL | XBF_DONE); + XBF_DONE); error = xfs_buf_submit(bp); if (error) @@ -1268,6 +1306,11 @@ xfs_buf_bio_end_io( { struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private; + if (!bio->bi_status && + (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && + XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) + bio->bi_status = BLK_STS_IOERR; + /* * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. @@ -1476,10 +1519,7 @@ __xfs_buf_submit( /* on shutdown we stale and complete the buffer immediately */ if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { - xfs_buf_ioerror(bp, -EIO); - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioend(bp); + xfs_buf_ioend_fail(bp); return -EIO; } @@ -1573,6 +1613,28 @@ xfs_buf_zero( } /* + * Log a message about and stale a buffer that a caller has decided is corrupt. + * + * This function should be called for the kinds of metadata corruption that + * cannot be detected by a verifier, such as incorrect inter-block relationship + * data. Do /not/ call this function from a verifier function. + * + * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will + * be marked stale, but b_error will not be set. The caller is responsible for + * releasing the buffer or fixing it. + */ +void +__xfs_buf_mark_corrupt( + struct xfs_buf *bp, + xfs_failaddr_t fa) +{ + ASSERT(bp->b_flags & XBF_DONE); + + xfs_buf_corruption_error(bp, fa); + xfs_buf_stale(bp); +} + +/* + * Handling of buffer targets (buftargs).
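The xfs_buf_ioend() hunk above replaces the stored b_iodone function pointer with a dispatch on buffer type flags that are set when the buffer is dirtied. A compact sketch of that dispatch style, with made-up flag values:

    #include <stdio.h>

    /* Illustrative flag values, mirroring the shape of the new
     * _XBF_LOGRECOVERY / _XBF_INODES / _XBF_DQUOTS flags. */
    #define BUF_INODES      (1u << 16)
    #define BUF_DQUOTS      (1u << 17)
    #define BUF_LOGRECOVERY (1u << 18)

    static void buf_ioend(unsigned int flags)
    {
        if (flags & BUF_LOGRECOVERY) {
            puts("log recovery completion");
            return;
        }
        if (flags & BUF_INODES) {
            puts("inode buffer completion");
            return;
        }
        if (flags & BUF_DQUOTS) {
            puts("dquot buffer completion");
            return;
        }
        puts("plain dirty buffer completion");
    }

    int main(void)
    {
        buf_ioend(BUF_INODES);
        return 0;
    }

The design choice is that static, type-based dispatch removes the attach/detach bookkeeping the old per-buffer callback pointer required.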
*/ @@ -1616,7 +1678,8 @@ xfs_wait_buftarg( struct xfs_buftarg *btp) { LIST_HEAD(dispose); - int loop = 0; + int loop = 0; + bool write_fail = false; /* * First wait on the buftarg I/O count for all in-flight buffers to be @@ -1644,17 +1707,29 @@ xfs_wait_buftarg( bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); if (bp->b_flags & XBF_WRITE_FAIL) { - xfs_alert(btp->bt_mount, + write_fail = true; + xfs_buf_alert_ratelimited(bp, + "XFS: Corruption Alert", "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", (long long)bp->b_bn); - xfs_alert(btp->bt_mount, -"Please run xfs_repair to determine the extent of the problem."); } xfs_buf_rele(bp); } if (loop++ != 0) delay(100); } + + /* + * If one or more failed buffers were freed, that means dirty metadata + * was thrown away. This should only ever happen after I/O completion + * handling has elevated I/O error(s) to permanent failures and shuts + * down the fs. + */ + if (write_fail) { + ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount)); + xfs_alert(btp->bt_mount, + "Please run xfs_repair to determine the extent of the problem."); + } } static enum lru_status @@ -1787,6 +1862,13 @@ xfs_alloc_buftarg( btp->bt_bdev = bdev; btp->bt_daxdev = dax_dev; + /* + * Buffer IO error rate limiting. Limit it to no more than 10 messages + * per 30 seconds so as to not spam logs too much on repeated errors. + */ + ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, + DEFAULT_RATELIMIT_BURST); + if (xfs_setsize_buftarg_early(btp, bdev)) goto error_free; @@ -1957,7 +2039,7 @@ xfs_buf_delwri_submit_buffers( * synchronously. Otherwise, drop the buffer from the delwri * queue and submit async. */ - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL); + bp->b_flags &= ~_XBF_DELWRI_Q; bp->b_flags |= XBF_WRITE; if (wait_list) { bp->b_flags &= ~XBF_ASYNC; @@ -2091,9 +2173,11 @@ xfs_buf_delwri_pushbuf( int __init xfs_buf_init(void) { - xfs_buf_zone = kmem_cache_create("xfs_buf", - sizeof(struct xfs_buf), 0, - SLAB_HWCACHE_ALIGN, NULL); + xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, + SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD, + NULL); if (!xfs_buf_zone) goto out; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index d79a1fe5d738..755b652e695a 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -18,6 +18,7 @@ /* * Base types */ +struct xfs_buf; #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) @@ -30,15 +31,20 @@ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ #define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */ -/* flags used only as arguments to access routines */ -#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ +/* buffer type flags for write callbacks */ +#define _XBF_INODES (1 << 16)/* inode buffer */ +#define _XBF_DQUOTS (1 << 17)/* dquot buffer */ +#define _XBF_LOGRECOVERY (1 << 18)/* log recovery buffer */ /* flags used only internally */ #define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +/* flags used only as arguments to access routines */ +#define XBF_TRYLOCK (1 << 30)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1 << 31)/* do not map the buffer */ + typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ @@ -50,12 +56,15 @@ typedef unsigned int xfs_buf_flags_t; { XBF_DONE, "DONE" }, \ { XBF_STALE, 
"STALE" }, \ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ - { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ - { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ + { _XBF_INODES, "INODES" }, \ + { _XBF_DQUOTS, "DQUOTS" }, \ + { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ - { _XBF_DELWRI_Q, "DELWRI_Q" } - + { _XBF_DELWRI_Q, "DELWRI_Q" }, \ + /* The following interface flags should never be set */ \ + { XBF_TRYLOCK, "TRYLOCK" }, \ + { XBF_UNMAPPED, "UNMAPPED" } /* * Internal state flags. @@ -91,12 +100,9 @@ typedef struct xfs_buftarg { struct list_lru bt_lru; struct percpu_counter bt_io_count; + struct ratelimit_state bt_ioerror_rl; } xfs_buftarg_t; -struct xfs_buf; -typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); - - #define XB_PAGES 2 struct xfs_buf_map { @@ -149,7 +155,6 @@ typedef struct xfs_buf { xfs_buftarg_t *b_target; /* buffer target (device) */ void *b_addr; /* virtual address of buffer */ struct work_struct b_ioend_work; - xfs_buf_iodone_t b_iodone; /* I/O completion function */ struct completion b_iowait; /* queue for I/O waiters */ struct xfs_buf_log_item *b_log_item; struct list_head b_li_list; /* Log items list head */ @@ -256,13 +261,28 @@ extern void xfs_buf_unlock(xfs_buf_t *); #define xfs_buf_islocked(bp) \ ((bp)->b_sema.count <= 0) +static inline void xfs_buf_relse(xfs_buf_t *bp) +{ + xfs_buf_unlock(bp); + xfs_buf_rele(bp); +} + /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); extern void xfs_buf_ioend(struct xfs_buf *bp); +static inline void xfs_buf_ioend_finish(struct xfs_buf *bp) +{ + if (bp->b_flags & XBF_ASYNC) + xfs_buf_relse(bp); + else + complete(&bp->b_iowait); +} + extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); +void xfs_buf_ioend_fail(struct xfs_buf *); extern int __xfs_buf_submit(struct xfs_buf *bp, bool); static inline int xfs_buf_submit(struct xfs_buf *bp) @@ -272,6 +292,8 @@ static inline int xfs_buf_submit(struct xfs_buf *bp) } void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); +void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); +#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) /* Buffer Utility Routines */ extern void *xfs_buf_offset(struct xfs_buf *, size_t); @@ -320,12 +342,6 @@ static inline int xfs_buf_ispinned(struct xfs_buf *bp) return atomic_read(&bp->b_pin_count); } -static inline void xfs_buf_relse(xfs_buf_t *bp) -{ - xfs_buf_unlock(bp); - xfs_buf_rele(bp); -} - static inline int xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset) { diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 663810e6cd59..408d1b572d3f 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -12,8 +12,13 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_trans.h" -#include "xfs_buf_item.h" #include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_quota.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" #include "xfs_trace.h" #include "xfs_log.h" @@ -25,7 +30,7 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } -STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); +static void xfs_buf_item_done(struct xfs_buf *bp); /* Is this log iovec plausibly 
large enough to contain the buffer log format? */ bool @@ -122,7 +127,7 @@ xfs_buf_item_size_segment( * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged * in a single iovec. * - * Discontiguous buffers need a format structure per region that that is being + * Discontiguous buffers need a format structure per region that is being * logged. This makes the changes in the buffer appear to log recovery as though * they came from separate buffers, just like would occur if multiple buffers * were used instead of a single discontiguous buffer. This enables @@ -345,7 +350,7 @@ xfs_buf_item_format( * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) || + if (xfs_sb_version_has_v3inode(&lip->li_mountp->m_sb) || !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; @@ -410,7 +415,6 @@ xfs_buf_item_unpin( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); xfs_buf_t *bp = bip->bli_buf; - struct xfs_ail *ailp = lip->li_ailp; int stale = bip->bli_flags & XFS_BLI_STALE; int freed; @@ -452,58 +456,33 @@ xfs_buf_item_unpin( } /* - * If we get called here because of an IO error, we may - * or may not have the item on the AIL. xfs_trans_ail_delete() - * will take care of that situation. - * xfs_trans_ail_delete() drops the AIL lock. + * If we get called here because of an IO error, we may or may + * not have the item on the AIL. xfs_trans_ail_delete() will + * take care of that situation. xfs_trans_ail_delete() drops + * the AIL lock. */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { - xfs_buf_do_callbacks(bp); - bp->b_log_item = NULL; - list_del_init(&bp->b_li_list); - bp->b_iodone = NULL; + xfs_buf_item_done(bp); + xfs_iflush_done(bp); + ASSERT(list_empty(&bp->b_li_list)); } else { - spin_lock(&ailp->ail_lock); - xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); ASSERT(bp->b_log_item == NULL); } xfs_buf_relse(bp); } else if (freed && remove) { /* - * There are currently two references to the buffer - the active - * LRU reference and the buf log item. What we are about to do - * here - simulate a failed IO completion - requires 3 - * references. - * - * The LRU reference is removed by the xfs_buf_stale() call. The - * buf item reference is removed by the xfs_buf_iodone() - * callback that is run by xfs_buf_do_callbacks() during ioend - * processing (via the bp->b_iodone callback), and then finally - * the ioend processing will drop the IO reference if the buffer - * is marked XBF_ASYNC. - * - * Hence we need to take an additional reference here so that IO - * completion processing doesn't free the buffer prematurely. + * The buffer must be locked and held by the caller to simulate + * an async I/O failure. */ xfs_buf_lock(bp); xfs_buf_hold(bp); bp->b_flags |= XBF_ASYNC; - xfs_buf_ioerror(bp, -EIO); - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioend(bp); + xfs_buf_ioend_fail(bp); } } -/* - * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30 - * seconds so as to not spam logs too much on repeated detection of the same - * buffer being bad.. - */ - -static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); - STATIC uint xfs_buf_item_push( struct xfs_log_item *lip, @@ -533,11 +512,10 @@ xfs_buf_item_push( trace_xfs_buf_item_push(bip); /* has a previous flush failed due to IO errors? 
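The hunk above drops the global DEFINE_RATELIMIT_STATE in favour of the per-buftarg bt_ioerror_rl set up earlier in this series, which allows roughly 10 messages per 30 seconds. A userspace model of that interval/burst throttle, simplified from the kernel's ratelimit_state:

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    /* Allow at most `burst` messages per `interval` seconds; drop the rest. */
    struct ratelimit {
        time_t window_start;
        int interval;       /* seconds */
        int burst;          /* messages allowed per window */
        int printed;
    };

    static bool ratelimit_ok(struct ratelimit *rl)
    {
        time_t now = time(NULL);

        if (now - rl->window_start >= rl->interval) {
            rl->window_start = now;
            rl->printed = 0;    /* new window, reset the budget */
        }
        return rl->printed++ < rl->burst;
    }

    int main(void)
    {
        struct ratelimit rl = { .window_start = 0, .interval = 30, .burst = 10 };

        for (int i = 0; i < 25; i++)
            if (ratelimit_ok(&rl))
                printf("alert %d\n", i);   /* only the first 10 print */
        return 0;
    }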
*/ - if ((bp->b_flags & XBF_WRITE_FAIL) && - ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) { - xfs_warn(bp->b_mount, -"Failing async write on buffer block 0x%llx. Retrying async write.", - (long long)bp->b_bn); + if (bp->b_flags & XBF_WRITE_FAIL) { + xfs_buf_alert_ratelimited(bp, "XFS: Failing async write", + "Failing async write on buffer block 0x%llx. Retrying async write.", + (long long)bp->b_bn); } if (!xfs_buf_delwri_queue(bp, buffer_list)) @@ -584,7 +562,7 @@ xfs_buf_item_put( * state. */ if (aborted) - xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(lip, 0); xfs_buf_item_relse(bip->bli_buf); return true; } @@ -760,7 +738,7 @@ xfs_buf_item_init( return 0; } - bip = kmem_zone_zalloc(xfs_buf_item_zone, 0); + bip = kmem_cache_zalloc(xfs_buf_item_zone, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; @@ -962,11 +940,7 @@ xfs_buf_item_free( } /* - * This is called when the buf log item is no longer needed. It should - * free the buf log item associated with the given buffer and clear - * the buffer's pointer to the buf log item. If there are no more - * items in the list, clear the b_iodone field of the buffer (see - * xfs_buf_attach_iodone() below). + * xfs_buf_item_relse() is called when the buf log item is no longer needed. */ void xfs_buf_item_relse( @@ -978,137 +952,28 @@ xfs_buf_item_relse( ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); bp->b_log_item = NULL; - if (list_empty(&bp->b_li_list)) - bp->b_iodone = NULL; - xfs_buf_rele(bp); xfs_buf_item_free(bip); } - -/* - * Add the given log item with its callback to the list of callbacks - * to be called when the buffer's I/O completes. If it is not set - * already, set the buffer's b_iodone() routine to be - * xfs_buf_iodone_callbacks() and link the log item into the list of - * items rooted at b_li_list. - */ -void -xfs_buf_attach_iodone( - struct xfs_buf *bp, - void (*cb)(struct xfs_buf *, struct xfs_log_item *), - struct xfs_log_item *lip) -{ - ASSERT(xfs_buf_islocked(bp)); - - lip->li_cb = cb; - list_add_tail(&lip->li_bio_list, &bp->b_li_list); - - ASSERT(bp->b_iodone == NULL || - bp->b_iodone == xfs_buf_iodone_callbacks); - bp->b_iodone = xfs_buf_iodone_callbacks; -} - -/* - * We can have many callbacks on a buffer. Running the callbacks individually - * can cause a lot of contention on the AIL lock, so we allow for a single - * callback to be able to scan the remaining items in bp->b_li_list for other - * items of the same type and callback to be processed in the first call. - * - * As a result, the loop walking the callback list below will also modify the - * list. it removes the first item from the list and then runs the callback. - * The loop then restarts from the new first item int the list. This allows the - * callback to scan and modify the list attached to the buffer and we don't - * have to care about maintaining a next item pointer. - */ -STATIC void -xfs_buf_do_callbacks( - struct xfs_buf *bp) -{ - struct xfs_buf_log_item *blip = bp->b_log_item; - struct xfs_log_item *lip; - - /* If there is a buf_log_item attached, run its callback */ - if (blip) { - lip = &blip->bli_item; - lip->li_cb(bp, lip); - } - - while (!list_empty(&bp->b_li_list)) { - lip = list_first_entry(&bp->b_li_list, struct xfs_log_item, - li_bio_list); - - /* - * Remove the item from the list, so we don't have any - * confusion if the item is added to another buf. 
- * Don't touch the log item after calling its - * callback, because it could have freed itself. - */ - list_del_init(&lip->li_bio_list); - lip->li_cb(bp, lip); - } -} - /* - * Invoke the error state callback for each log item affected by the failed I/O. - * - * If a metadata buffer write fails with a non-permanent error, the buffer is - * eventually resubmitted and so the completion callbacks are not run. The error - * state may need to be propagated to the log items attached to the buffer, - * however, so the next AIL push of the item knows hot to handle it correctly. + * Decide if we're going to retry the write after a failure, and prepare + * the buffer for retrying the write. */ -STATIC void -xfs_buf_do_callbacks_fail( - struct xfs_buf *bp) -{ - struct xfs_log_item *lip; - struct xfs_ail *ailp; - - /* - * Buffer log item errors are handled directly by xfs_buf_item_push() - * and xfs_buf_iodone_callback_error, and they have no IO error - * callbacks. Check only for items in b_li_list. - */ - if (list_empty(&bp->b_li_list)) - return; - - lip = list_first_entry(&bp->b_li_list, struct xfs_log_item, - li_bio_list); - ailp = lip->li_ailp; - spin_lock(&ailp->ail_lock); - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (lip->li_ops->iop_error) - lip->li_ops->iop_error(lip, bp); - } - spin_unlock(&ailp->ail_lock); -} - static bool -xfs_buf_iodone_callback_error( +xfs_buf_ioerror_fail_without_retry( struct xfs_buf *bp) { - struct xfs_buf_log_item *bip = bp->b_log_item; - struct xfs_log_item *lip; - struct xfs_mount *mp; + struct xfs_mount *mp = bp->b_mount; static ulong lasttime; static xfs_buftarg_t *lasttarg; - struct xfs_error_cfg *cfg; - - /* - * The failed buffer might not have a buf_log_item attached or the - * log_item list might be empty. Get the mp from the available - * xfs_log_item - */ - lip = list_first_entry_or_null(&bp->b_li_list, struct xfs_log_item, - li_bio_list); - mp = lip ? lip->li_mountp : bip->bli_item.li_mountp; /* * If we've already decided to shutdown the filesystem because of * I/O errors, there's no point in giving this a retry. */ if (XFS_FORCED_SHUTDOWN(mp)) - goto out_stale; + return true; if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { @@ -1119,171 +984,240 @@ xfs_buf_iodone_callback_error( /* synchronous writes will have callers process the error */ if (!(bp->b_flags & XBF_ASYNC)) - goto out_stale; - - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - ASSERT(bp->b_iodone != NULL); - - cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + return true; + return false; +} - /* - * If the write was asynchronous then no one will be looking for the - * error. If this is the first failure of this type, clear the error - * state and write the buffer out again. This means we always retry an - * async write failure at least once, but we also need to set the buffer - * up to behave correctly now for repeated failures. 
- */ - if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) || - bp->b_last_error != bp->b_error) { - bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); - bp->b_last_error = bp->b_error; - if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && - !bp->b_first_retry_time) - bp->b_first_retry_time = jiffies; +static bool +xfs_buf_ioerror_retry( + struct xfs_buf *bp, + struct xfs_error_cfg *cfg) +{ + if ((bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) && + bp->b_last_error == bp->b_error) + return false; - xfs_buf_ioerror(bp, 0); - xfs_buf_submit(bp); - return true; - } + bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); + bp->b_last_error = bp->b_error; + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + !bp->b_first_retry_time) + bp->b_first_retry_time = jiffies; + return true; +} - /* - * Repeated failure on an async write. Take action according to the - * error configuration we have been set up to use. - */ +/* + * Account for this latest trip around the retry handler, and decide if + * we've failed enough times to constitute a permanent failure. + */ +static bool +xfs_buf_ioerror_permanent( + struct xfs_buf *bp, + struct xfs_error_cfg *cfg) +{ + struct xfs_mount *mp = bp->b_mount; if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && ++bp->b_retries > cfg->max_retries) - goto permanent_error; + return true; if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) - goto permanent_error; + return true; /* At unmount we may treat errors differently */ if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) - goto permanent_error; + return true; - /* - * Still a transient error, run IO completion failure callbacks and let - * the higher layers retry the buffer. - */ - xfs_buf_do_callbacks_fail(bp); - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return true; + return false; +} + +/* + * On a sync write or shutdown we just want to stale the buffer and let the + * caller handle the error in bp->b_error appropriately. + * + * If the write was asynchronous then no one will be looking for the error. If + * this is the first failure of this type, clear the error state and write the + * buffer out again. This means we always retry an async write failure at least + * once, but we also need to set the buffer up to behave correctly now for + * repeated failures. + * + * If we get repeated async write failures, then we take action according to the + * error configuration we have been set up to use. + * + * Multi-state return value: + * + * XBF_IOERROR_FINISH: clear IO error retry state and run callback completions + * XBF_IOERROR_DONE: resubmitted immediately, do not run any completions + * XBF_IOERROR_FAIL: transient error, run failure callback completions and then + * release the buffer + */ +enum { + XBF_IOERROR_FINISH, + XBF_IOERROR_DONE, + XBF_IOERROR_FAIL, +}; + +static int +xfs_buf_iodone_error( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_error_cfg *cfg; + + if (xfs_buf_ioerror_fail_without_retry(bp)) + goto out_stale; + + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + if (xfs_buf_ioerror_retry(bp, cfg)) { + xfs_buf_ioerror(bp, 0); + xfs_buf_submit(bp); + return XBF_IOERROR_DONE; + } /* * Permanent error - we need to trigger a shutdown if we haven't already * to indicate that inconsistency will result from this action. 
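The refactor above splits the old monolithic error callback into small predicates plus a classifier that returns one of three verdicts (FINISH, DONE, FAIL). The decision order can be sketched as below; the thresholds and parameter names are illustrative, not the kernel's error configuration API.

    #include <stdbool.h>
    #include <stdio.h>

    enum io_verdict { IO_FINISH, IO_DONE, IO_FAIL };

    /* A failed write is retried immediately (DONE), held for a later
     * retry (FAIL), or given up on and staled (FINISH). */
    static enum io_verdict classify_write_error(bool shutdown, bool sync_write,
                                                int retries, int max_retries,
                                                bool first_failure)
    {
        if (shutdown || sync_write)
            return IO_FINISH;   /* caller sees the error; stale the buffer */
        if (first_failure)
            return IO_DONE;     /* clear the error and resubmit at once */
        if (retries > max_retries)
            return IO_FINISH;   /* permanent failure: shut down and stale */
        return IO_FAIL;         /* transient: flag the items, retry later */
    }

    int main(void)
    {
        printf("%d\n", classify_write_error(false, false, 1, 2, true));
        return 0;
    }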
*/ -permanent_error: - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_buf_ioerror_permanent(bp, cfg)) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + goto out_stale; + } + + /* Still considered a transient error. Caller will schedule retries. */ + return XBF_IOERROR_FAIL; + out_stale: xfs_buf_stale(bp); bp->b_flags |= XBF_DONE; trace_xfs_buf_error_relse(bp, _RET_IP_); - return false; + return XBF_IOERROR_FINISH; } -/* - * This is the iodone() function for buffers which have had callbacks attached - * to them by xfs_buf_attach_iodone(). We need to iterate the items on the - * callback list, mark the buffer as having no more callbacks and then push the - * buffer through IO completion processing. - */ -void -xfs_buf_iodone_callbacks( +static void +xfs_buf_item_done( struct xfs_buf *bp) { - /* - * If there is an error, process it. Some errors require us - * to run callbacks after failure processing is done so we - * detect that and take appropriate action. - */ - if (bp->b_error && xfs_buf_iodone_callback_error(bp)) + struct xfs_buf_log_item *bip = bp->b_log_item; + + if (!bip) return; /* - * Successful IO or permanent error. Either way, we can clear the - * retry state here in preparation for the next error that may occur. + * If we are forcibly shutting down, this may well be off the AIL + * already. That's because we simulate the log-committed callbacks to + * unpin these buffers. Or we may never have put this item on AIL + * because of the transaction was aborted forcibly. + * xfs_trans_ail_delete() takes care of these. + * + * Either way, AIL is useless if we're forcing a shutdown. */ + xfs_trans_ail_delete(&bip->bli_item, SHUTDOWN_CORRUPT_INCORE); + bp->b_log_item = NULL; + xfs_buf_item_free(bip); + xfs_buf_rele(bp); +} + +static inline void +xfs_buf_clear_ioerror_retry_state( + struct xfs_buf *bp) +{ bp->b_last_error = 0; bp->b_retries = 0; bp->b_first_retry_time = 0; - - xfs_buf_do_callbacks(bp); - bp->b_log_item = NULL; - list_del_init(&bp->b_li_list); - bp->b_iodone = NULL; - xfs_buf_ioend(bp); } /* - * This is the iodone() function for buffers which have been - * logged. It is called when they are eventually flushed out. - * It should remove the buf item from the AIL, and free the buf item. - * It is called by xfs_buf_iodone_callbacks() above which will take - * care of cleaning up the buffer itself. + * Inode buffer iodone callback function. */ void -xfs_buf_iodone( - struct xfs_buf *bp, - struct xfs_log_item *lip) +xfs_buf_inode_iodone( + struct xfs_buf *bp) { - struct xfs_ail *ailp = lip->li_ailp; + if (bp->b_error) { + struct xfs_log_item *lip; + int ret = xfs_buf_iodone_error(bp); + + if (ret == XBF_IOERROR_FINISH) + goto finish_iodone; + if (ret == XBF_IOERROR_DONE) + return; + ASSERT(ret == XBF_IOERROR_FAIL); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { + set_bit(XFS_LI_FAILED, &lip->li_flags); + } + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return; + } - ASSERT(BUF_ITEM(lip)->bli_buf == bp); +finish_iodone: + xfs_buf_clear_ioerror_retry_state(bp); + xfs_buf_item_done(bp); + xfs_iflush_done(bp); + xfs_buf_ioend_finish(bp); +} - xfs_buf_rele(bp); +/* + * Dquot buffer iodone callback function. 
+ */ +void +xfs_buf_dquot_iodone( + struct xfs_buf *bp) +{ + if (bp->b_error) { + struct xfs_log_item *lip; + int ret = xfs_buf_iodone_error(bp); + + if (ret == XBF_IOERROR_FINISH) + goto finish_iodone; + if (ret == XBF_IOERROR_DONE) + return; + ASSERT(ret == XBF_IOERROR_FAIL); + spin_lock(&bp->b_mount->m_ail->ail_lock); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { + xfs_set_li_failed(lip, bp); + } + spin_unlock(&bp->b_mount->m_ail->ail_lock); + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return; + } - /* - * If we are forcibly shutting down, this may well be - * off the AIL already. That's because we simulate the - * log-committed callbacks to unpin these buffers. Or we may never - * have put this item on AIL because of the transaction was - * aborted forcibly. xfs_trans_ail_delete() takes care of these. - * - * Either way, AIL is useless if we're forcing a shutdown. - */ - spin_lock(&ailp->ail_lock); - xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); - xfs_buf_item_free(BUF_ITEM(lip)); +finish_iodone: + xfs_buf_clear_ioerror_retry_state(bp); + /* a newly allocated dquot buffer might have a log item attached */ + xfs_buf_item_done(bp); + xfs_dquot_done(bp); + xfs_buf_ioend_finish(bp); } /* - * Requeue a failed buffer for writeback. + * Dirty buffer iodone callback function. * - * We clear the log item failed state here as well, but we have to be careful - * about reference counts because the only active reference counts on the buffer - * may be the failed log items. Hence if we clear the log item failed state - * before queuing the buffer for IO we can release all active references to - * the buffer and free it, leading to use after free problems in - * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which - * order we process them in - the buffer is locked, and we own the buffer list - * so nothing on them is going to change while we are performing this action. - * - * Hence we can safely queue the buffer for IO before we clear the failed log - * item state, therefore always having an active reference to the buffer and - * avoiding the transient zero-reference state that leads to use-after-free. - * - * Return true if the buffer was added to the buffer list, false if it was - * already on the buffer list. + * Note that for things like remote attribute buffers, there may not be a buffer + * log item here, so processing the buffer log item must remain optional.
*/ -bool -xfs_buf_resubmit_failed_buffers( - struct xfs_buf *bp, - struct list_head *buffer_list) +void +xfs_buf_iodone( + struct xfs_buf *bp) { - struct xfs_log_item *lip; - bool ret; - - ret = xfs_buf_delwri_queue(bp, buffer_list); - - /* - * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this - * function already have it acquired - */ - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) - xfs_clear_li_failed(lip); + if (bp->b_error) { + int ret = xfs_buf_iodone_error(bp); + + if (ret == XBF_IOERROR_FINISH) + goto finish_iodone; + if (ret == XBF_IOERROR_DONE) + return; + ASSERT(ret == XBF_IOERROR_FAIL); + ASSERT(list_empty(&bp->b_li_list)); + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return; + } - return ret; +finish_iodone: + xfs_buf_clear_ioerror_retry_state(bp); + xfs_buf_item_done(bp); + xfs_buf_ioend_finish(bp); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 30114b510332..23507cbb4c41 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -54,13 +54,9 @@ void xfs_buf_item_relse(struct xfs_buf *); bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); -void xfs_buf_attach_iodone(struct xfs_buf *, - void(*)(struct xfs_buf *, struct xfs_log_item *), - struct xfs_log_item *); -void xfs_buf_iodone_callbacks(struct xfs_buf *); -void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); -bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, - struct list_head *); +void xfs_buf_inode_iodone(struct xfs_buf *); +void xfs_buf_dquot_iodone(struct xfs_buf *); +void xfs_buf_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c new file mode 100644 index 000000000000..8f0457d67d77 --- /dev/null +++ b/fs/xfs/xfs_buf_item_recover.c @@ -0,0 +1,982 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_error.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_quota.h" + +/* + * This structure is used during recovery to record the buf log items which + * have been canceled and should not be replayed. + */ +struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct list_head bc_list; +}; + +static struct xfs_buf_cancel * +xlog_find_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len) +{ + struct list_head *bucket; + struct xfs_buf_cancel *bcp; + + if (!log->l_buf_cancel_table) + return NULL; + + bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) + return bcp; + } + + return NULL; +} + +static bool +xlog_add_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len) +{ + struct xfs_buf_cancel *bcp; + + /* + * If we find an existing cancel record, this indicates that the buffer + * was cancelled multiple times. 
To ensure that during pass 2 we keep + * the record in the table until we reach its last occurrence in the + * log, a reference count is kept to tell how many times we expect to + * see this record during the second pass. + */ + bcp = xlog_find_buffer_cancelled(log, blkno, len); + if (bcp) { + bcp->bc_refcount++; + return false; + } + + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); + bcp->bc_blkno = blkno; + bcp->bc_len = len; + bcp->bc_refcount = 1; + list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno)); + return true; +} + +/* + * Check if there is an entry for blkno, len in the buffer cancel record table. + */ +bool +xlog_is_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len) +{ + return xlog_find_buffer_cancelled(log, blkno, len) != NULL; +} + +/* + * Check if there is an entry for blkno, len in the buffer cancel record table, + * and decrement the reference count on it if there is one. + * + * Remove the cancel record once the refcount hits zero, so that if the same + * buffer is re-used again after its last cancellation we actually replay the + * changes made at that point. + */ +static bool +xlog_put_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len) +{ + struct xfs_buf_cancel *bcp; + + bcp = xlog_find_buffer_cancelled(log, blkno, len); + if (!bcp) { + ASSERT(0); + return false; + } + + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } + return true; +} + +/* log buffer item recovery */ + +/* + * Sort buffer items for log recovery. Most buffer items should end up on the + * buffer list and are recovered first, with the following exceptions: + * + * 1. XFS_BLF_CANCEL buffers must be processed last because some log items + * might depend on the incore cancellation record, and replaying a cancelled + * buffer item can remove the incore record. + * + * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that + * we replay di_next_unlinked only after flushing the inode 'free' state + * to the inode buffer. + * + * See xlog_recover_reorder_trans for more details. + */ +STATIC enum xlog_recover_reorder +xlog_recover_buf_reorder( + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + + if (buf_f->blf_flags & XFS_BLF_CANCEL) + return XLOG_REORDER_CANCEL_LIST; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + return XLOG_REORDER_INODE_BUFFER_LIST; + return XLOG_REORDER_BUFFER_LIST; +} + +STATIC void +xlog_recover_buf_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + + xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL); +} + +/* + * Build up the table of buf cancel records so that we don't replay cancelled + * data in the second pass.
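The cancel-record helpers above key records by (blkno, len) in hashed bucket lists and refcount duplicates, so pass 2 keeps a record until its last occurrence in the log. A self-contained model of add/put, using one flat list in place of hash buckets:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct cancel_rec {
        uint64_t blkno;
        unsigned int len;
        int refcount;
        struct cancel_rec *next;
    };

    static struct cancel_rec *table;

    static struct cancel_rec *find_rec(uint64_t blkno, unsigned int len)
    {
        for (struct cancel_rec *r = table; r; r = r->next)
            if (r->blkno == blkno && r->len == len)
                return r;
        return NULL;
    }

    static void add_cancelled(uint64_t blkno, unsigned int len)
    {
        struct cancel_rec *r = find_rec(blkno, len);

        if (r) {
            r->refcount++;      /* cancelled again later in the log */
            return;
        }
        r = malloc(sizeof(*r));
        r->blkno = blkno;
        r->len = len;
        r->refcount = 1;
        r->next = table;
        table = r;
    }

    static bool put_cancelled(uint64_t blkno, unsigned int len)
    {
        struct cancel_rec *r = find_rec(blkno, len);

        if (!r)
            return false;
        if (--r->refcount == 0) {
            /* unlink so later reuse of the range is replayed */
            struct cancel_rec **pp = &table;

            while (*pp != r)
                pp = &(*pp)->next;
            *pp = r->next;
            free(r);
        }
        return true;
    }

    int main(void)
    {
        add_cancelled(64, 8);
        add_cancelled(64, 8);
        printf("%d %d %d\n", put_cancelled(64, 8),
               put_cancelled(64, 8), put_cancelled(64, 8));  /* 1 1 0 */
        return 0;
    }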
+ */ +static int +xlog_recover_buf_commit_pass1( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr; + + if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { + xfs_err(log->l_mp, "bad buffer log item size (%d)", + item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + if (!(bf->blf_flags & XFS_BLF_CANCEL)) + trace_xfs_log_recover_buf_not_cancel(log, bf); + else if (xlog_add_buffer_cancelled(log, bf->blf_blkno, bf->blf_len)) + trace_xfs_log_recover_buf_cancel_add(log, bf); + else + trace_xfs_log_recover_buf_cancel_ref_inc(log, bf); + return 0; +} + +/* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. Magic numbers are in a + * few places: + * the first 16 bits of the buffer (inode buffer, dquot buffer), + * the first 32 bits of the buffer (most blocks), + * inside a struct xfs_da_blkinfo at the start of the buffer. + */ +static void +xlog_recover_validate_buf_type( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f, + xfs_lsn_t current_lsn) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + uint32_t magic32; + uint16_t magic16; + uint16_t magicda; + char *warnmsg = NULL; + + /* + * We can only do post recovery validation on items on CRC enabled + * filesystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); + switch (xfs_blft_from_flags(buf_f)) { + case XFS_BLFT_BTREE_BUF: + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTB_MAGIC: + bp->b_ops = &xfs_bnobt_buf_ops; + break; + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_cntbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_FIBT_CRC_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_finobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + case XFS_RMAP_CRC_MAGIC: + bp->b_ops = &xfs_rmapbt_buf_ops; + break; + case XFS_REFC_CRC_MAGIC: + bp->b_ops = &xfs_refcountbt_buf_ops; + break; + default: + warnmsg = "Bad btree block magic!"; + break; + } + break; + case XFS_BLFT_AGF_BUF: + if (magic32 != XFS_AGF_MAGIC) { + warnmsg = "Bad AGF block magic!"; + break; + } + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: + if (magic32 != XFS_AGFL_MAGIC) { + warnmsg = "Bad AGFL block magic!"; + break; + } + bp->b_ops = &xfs_agfl_buf_ops; + break; + case XFS_BLFT_AGI_BUF: + if (magic32 != XFS_AGI_MAGIC) { + warnmsg = "Bad AGI block magic!"; + break; + } + bp->b_ops = &xfs_agi_buf_ops; + break; + case XFS_BLFT_UDQUOT_BUF: + case XFS_BLFT_PDQUOT_BUF: + case XFS_BLFT_GDQUOT_BUF: +#ifdef CONFIG_XFS_QUOTA + if (magic16 != XFS_DQUOT_MAGIC) { + warnmsg = "Bad DQUOT block magic!"; + break; + } + bp->b_ops = &xfs_dquot_buf_ops; +#else + xfs_alert(mp, + "Trying to recover dquots without QUOTA support built in!"); + ASSERT(0); +#endif + break; + case XFS_BLFT_DINO_BUF: + if (magic16 != XFS_DINODE_MAGIC) { + warnmsg = "Bad INODE block magic!"; + break; + } +
bp->b_ops = &xfs_inode_buf_ops; + break; + case XFS_BLFT_SYMLINK_BUF: + if (magic32 != XFS_SYMLINK_MAGIC) { + warnmsg = "Bad symlink block magic!"; + break; + } + bp->b_ops = &xfs_symlink_buf_ops; + break; + case XFS_BLFT_DIR_BLOCK_BUF: + if (magic32 != XFS_DIR2_BLOCK_MAGIC && + magic32 != XFS_DIR3_BLOCK_MAGIC) { + warnmsg = "Bad dir block magic!"; + break; + } + bp->b_ops = &xfs_dir3_block_buf_ops; + break; + case XFS_BLFT_DIR_DATA_BUF: + if (magic32 != XFS_DIR2_DATA_MAGIC && + magic32 != XFS_DIR3_DATA_MAGIC) { + warnmsg = "Bad dir data magic!"; + break; + } + bp->b_ops = &xfs_dir3_data_buf_ops; + break; + case XFS_BLFT_DIR_FREE_BUF: + if (magic32 != XFS_DIR2_FREE_MAGIC && + magic32 != XFS_DIR3_FREE_MAGIC) { + warnmsg = "Bad dir3 free magic!"; + break; + } + bp->b_ops = &xfs_dir3_free_buf_ops; + break; + case XFS_BLFT_DIR_LEAF1_BUF: + if (magicda != XFS_DIR2_LEAF1_MAGIC && + magicda != XFS_DIR3_LEAF1_MAGIC) { + warnmsg = "Bad dir leaf1 magic!"; + break; + } + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + break; + case XFS_BLFT_DIR_LEAFN_BUF: + if (magicda != XFS_DIR2_LEAFN_MAGIC && + magicda != XFS_DIR3_LEAFN_MAGIC) { + warnmsg = "Bad dir leafn magic!"; + break; + } + bp->b_ops = &xfs_dir3_leafn_buf_ops; + break; + case XFS_BLFT_DA_NODE_BUF: + if (magicda != XFS_DA_NODE_MAGIC && + magicda != XFS_DA3_NODE_MAGIC) { + warnmsg = "Bad da node magic!"; + break; + } + bp->b_ops = &xfs_da3_node_buf_ops; + break; + case XFS_BLFT_ATTR_LEAF_BUF: + if (magicda != XFS_ATTR_LEAF_MAGIC && + magicda != XFS_ATTR3_LEAF_MAGIC) { + warnmsg = "Bad attr leaf magic!"; + break; + } + bp->b_ops = &xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + warnmsg = "Bad attr remote magic!"; + break; + } + bp->b_ops = &xfs_attr3_rmt_buf_ops; + break; + case XFS_BLFT_SB_BUF: + if (magic32 != XFS_SB_MAGIC) { + warnmsg = "Bad SB block magic!"; + break; + } + bp->b_ops = &xfs_sb_buf_ops; + break; +#ifdef CONFIG_XFS_RT + case XFS_BLFT_RTBITMAP_BUF: + case XFS_BLFT_RTSUMMARY_BUF: + /* no magic numbers for verification of RT buffers */ + bp->b_ops = &xfs_rtbuf_ops; + break; +#endif /* CONFIG_XFS_RT */ + default: + xfs_warn(mp, "Unknown buffer type %d!", + xfs_blft_from_flags(buf_f)); + break; + } + + /* + * Nothing else to do in the case of a NULL current LSN as this means + * the buffer is more recent than the change in the log and will be + * skipped. + */ + if (current_lsn == NULLCOMMITLSN) + return; + + if (warnmsg) { + xfs_warn(mp, warnmsg); + ASSERT(0); + } + + /* + * We must update the metadata LSN of the buffer as it is written out to + * ensure that older transactions never replay over this one and corrupt + * the buffer. This can occur if log recovery is interrupted at some + * point after the current transaction completes, at which point a + * subsequent mount starts recovery from the beginning. + * + * Write verifiers update the metadata LSN from log items attached to + * the buffer. Therefore, initialize a bli purely to carry the LSN to + * the verifier. We'll clean it up in our ->iodone() callback. + */ + if (bp->b_ops) { + struct xfs_buf_log_item *bip; + + bp->b_flags |= _XBF_LOGRECOVERY; + xfs_buf_item_init(bp, mp); + bip = bp->b_log_item; + bip->bli_item.li_lsn = current_lsn; + } +} + +/* + * Perform a 'normal' buffer recovery. Each logged region of the + * buffer should be copied over the corresponding region in the + * given buffer. The bitmap in the buf log format structure indicates + * where to place the logged data. 
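+ * + * Each bit in blf_data_map covers one XFS_BLF_CHUNK (128 bytes) of the + * buffer; xfs_next_bit() and xfs_contig_bits() below walk the runs of set + * bits, and each run is copied in from the matching log vector.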
+ */ +STATIC void +xlog_recover_do_reg_buffer( + struct xfs_mount *mp, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f, + xfs_lsn_t current_lsn) +{ + int i; + int bit; + int nbits; + xfs_failaddr_t fa; + const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); + + trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + + bit = 0; + i = 1; /* 0 is the buf format structure */ + while (1) { + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + if (bit == -1) + break; + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + ASSERT(nbits > 0); + ASSERT(item->ri_buf[i].i_addr != NULL); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); + ASSERT(BBTOB(bp->b_length) >= + ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); + + /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + + /* + * Do a sanity check if this is a dquot buffer. Just checking + * the first dquot in the buffer should do. XXX: This is + * probably a good thing to do for other buf types also. + */ + fa = NULL; + if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + if (item->ri_buf[i].i_addr == NULL) { + xfs_alert(mp, + "XFS: NULL dquot in %s.", __func__); + goto next; + } + if (item->ri_buf[i].i_len < size_disk_dquot) { + xfs_alert(mp, + "XFS: dquot too small (%d) in %s.", + item->ri_buf[i].i_len, __func__); + goto next; + } + fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1); + if (fa) { + xfs_alert(mp, + "dquot corrupt at %pS trying to replay into block 0x%llx", + fa, bp->b_bn); + goto next; + } + } + + memcpy(xfs_buf_offset(bp, + (uint)bit << XFS_BLF_SHIFT), /* dest */ + item->ri_buf[i].i_addr, /* source */ + nbits<<XFS_BLF_SHIFT); /* length */ + next: + i++; + bit += nbits; + } + + /* Shouldn't be any more regions */ + ASSERT(i == item->ri_total); + + xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); +} + +/* + * Perform a dquot buffer recovery. + * Simple algorithm: if we have found a QUOTAOFF log item of the same type + * (i.e. USR or GRP), then just toss this buffer away; don't recover it. + * Else, treat it as a regular buffer and do recovery. + * + * Return false if the buffer was tossed and true if we recovered the buffer to + * indicate to the caller if the buffer needs writing. + */ +STATIC bool +xlog_recover_do_dquot_buffer( + struct xfs_mount *mp, + struct xlog *log, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) +{ + uint type; + + trace_xfs_log_recover_buf_dquot_buf(log, buf_f); + + /* + * Filesystems are required to send in quota flags at mount time.
+ */ + if (!mp->m_qflags) + return false; + + type = 0; + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) + type |= XFS_DQTYPE_USER; + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) + type |= XFS_DQTYPE_PROJ; + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) + type |= XFS_DQTYPE_GROUP; + /* + * This type of quota was turned off, so ignore this buffer + */ + if (log->l_quotaoffs_flag & type) + return false; + + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); + return true; +} + +/* + * Perform recovery for a buffer full of inodes. In these buffers, the only + * data which should be recovered is that which corresponds to the + * di_next_unlinked pointers in the on disk inode structures. The rest of the + * data for the inodes is always logged through the inodes themselves rather + * than the inode buffer and is recovered in xlog_recover_inode_pass2(). + * + * The only time when buffers full of inodes are fully recovered is when the + * buffer is full of newly allocated inodes. In this case the buffer will + * not be marked as an inode buffer and so will be sent to + * xlog_recover_do_reg_buffer() above during recovery. + */ +STATIC int +xlog_recover_do_inode_buffer( + struct xfs_mount *mp, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) +{ + int i; + int item_index = 0; + int bit = 0; + int nbits = 0; + int reg_buf_offset = 0; + int reg_buf_bytes = 0; + int next_unlinked_offset; + int inodes_per_buf; + xfs_agino_t *logged_nextp; + xfs_agino_t *buffer_nextp; + + trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + + /* + * Post recovery validation only works properly on CRC enabled + * filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; + + inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; + for (i = 0; i < inodes_per_buf; i++) { + next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + + offsetof(xfs_dinode_t, di_next_unlinked); + + while (next_unlinked_offset >= + (reg_buf_offset + reg_buf_bytes)) { + /* + * The next di_next_unlinked field is beyond + * the current logged region. Find the next + * logged region that contains or is beyond + * the current di_next_unlinked field. + */ + bit += nbits; + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + + /* + * If there are no more logged regions in the + * buffer, then we're done. + */ + if (bit == -1) + return 0; + + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + ASSERT(nbits > 0); + reg_buf_offset = bit << XFS_BLF_SHIFT; + reg_buf_bytes = nbits << XFS_BLF_SHIFT; + item_index++; + } + + /* + * If the current logged region starts after the current + * di_next_unlinked field, then move on to the next + * di_next_unlinked field. + */ + if (next_unlinked_offset < reg_buf_offset) + continue; + + ASSERT(item->ri_buf[item_index].i_addr != NULL); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); + ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); + + /* + * The current logged region contains a copy of the + * current di_next_unlinked field. Extract its value + * and copy it to the buffer copy. + */ + logged_nextp = item->ri_buf[item_index].i_addr + + next_unlinked_offset - reg_buf_offset; + if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { + xfs_alert(mp, + "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT").
" + "Trying to replay bad (0) inode di_next_unlinked field.", + item, bp); + return -EFSCORRUPTED; + } + + buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); + *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + + } + + return 0; +} + +/* + * V5 filesystems know the age of the buffer on disk being recovered. We can + * have newer objects on disk than we are replaying, and so for these cases we + * don't want to replay the current change as that will make the buffer contents + * temporarily invalid on disk. + * + * The magic number might not match the buffer type we are going to recover + * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence + * extract the LSN of the existing object in the buffer based on it's current + * magic number. If we don't recognise the magic number in the buffer, then + * return a LSN of -1 so that the caller knows it was an unrecognised block and + * so can recover the buffer. + * + * Note: we cannot rely solely on magic number matches to determine that the + * buffer has a valid LSN - we also need to verify that it belongs to this + * filesystem, so we need to extract the object's LSN and compare it to that + * which we read from the superblock. If the UUIDs don't match, then we've got a + * stale metadata block from an old filesystem instance that we need to recover + * over the top of. + */ +static xfs_lsn_t +xlog_recover_get_buf_lsn( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + uint32_t magic32; + uint16_t magic16; + uint16_t magicda; + void *blk = bp->b_addr; + uuid_t *uuid; + xfs_lsn_t lsn = -1; + + /* v4 filesystems always recover immediately */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto recover_immediately; + + magic32 = be32_to_cpu(*(__be32 *)blk); + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + case XFS_RMAP_CRC_MAGIC: + case XFS_REFC_CRC_MAGIC: + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); + uuid = &btb->bb_u.s.bb_uuid; + break; + } + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); + uuid = &btb->bb_u.l.bb_uuid; + break; + } + case XFS_AGF_MAGIC: + lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); + uuid = &((struct xfs_agf *)blk)->agf_uuid; + break; + case XFS_AGFL_MAGIC: + lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); + uuid = &((struct xfs_agfl *)blk)->agfl_uuid; + break; + case XFS_AGI_MAGIC: + lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); + uuid = &((struct xfs_agi *)blk)->agi_uuid; + break; + case XFS_SYMLINK_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); + uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; + break; + case XFS_DIR3_BLOCK_MAGIC: + case XFS_DIR3_DATA_MAGIC: + case XFS_DIR3_FREE_MAGIC: + lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); + uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; + break; + case XFS_ATTR3_RMT_MAGIC: + /* + * Remote attr blocks are written synchronously, rather than + * being logged. That means they do not contain a valid LSN + * (i.e. 
transactionally ordered) in them, and hence any time we + * see a buffer to replay over the top of a remote attribute + * block we should simply do so. + */ + goto recover_immediately; + case XFS_SB_MAGIC: + /* + * superblock uuids are magic. We may or may not have a + * sb_meta_uuid on disk, but it will be set in the in-core + * superblock. We set the uuid pointer for verification + * according to the superblock feature mask to ensure we check + * the relevant UUID in the superblock. + */ + lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); + if (xfs_sb_version_hasmetauuid(&mp->m_sb)) + uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; + else + uuid = &((struct xfs_dsb *)blk)->sb_uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); + switch (magicda) { + case XFS_DIR3_LEAF1_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + case XFS_DA3_NODE_MAGIC: + lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + /* + * We do individual object checks on dquot and inode buffers as they + * have their own individual LSN records. Also, we could have a stale + * buffer here, so we have to at least recognise these buffer types. + * + * A noted complexity here is inode unlinked list processing - it logs + * the inode directly in the buffer, but we don't know which inodes have + * been modified, and there is no global buffer LSN. Hence we need to + * recover all inode buffer types immediately. This problem will be + * fixed by logical logging of the unlinked list modifications. + */ + magic16 = be16_to_cpu(*(__be16 *)blk); + switch (magic16) { + case XFS_DQUOT_MAGIC: + case XFS_DINODE_MAGIC: + goto recover_immediately; + default: + break; + } + + /* unknown buffer contents, recover immediately */ + +recover_immediately: + return (xfs_lsn_t)-1; + +} + +/* + * This routine replays a modification made to a buffer at runtime. + * There are actually two types of buffer, regular and inode, which + * are handled differently. Inode buffers differ in that we only + * recover a specific set of data from them, namely + * the inode di_next_unlinked fields. This is because all other inode + * data is actually logged via inode records and any data we replay + * here which overlaps that may be stale. + * + * When meta-data buffers are freed at run time we log a buffer item + * with the XFS_BLF_CANCEL bit set to indicate that previous copies + * of the buffer in the log should not be replayed at recovery time. + * This is so that if the blocks covered by the buffer are reused for + * file data before we crash we don't end up replaying old, freed + * meta-data into a user's file. + * + * To handle the cancellation of buffer log items, we make two passes + * over the log during recovery. During the first we build a table of + * those buffers which have been cancelled, and during the second we + * only replay those buffers which do not have corresponding cancel + * records in the table. See xlog_recover_buf_commit_pass[1,2] + * for more details on the implementation of the table of cancel records.
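+ * + * In outline, matching the code below: an XFS_BLF_CANCEL item drops one + * reference from the cancel table and is otherwise skipped, while a normal + * buffer item is skipped entirely if the table still holds a record covering + * its blkno/len range.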
+ */ +STATIC int +xlog_recover_buf_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp; + int error; + uint buf_flags; + xfs_lsn_t lsn; + + /* + * In this pass we want to recover only those buffers which have + * not been cancelled and are not cancellation buffers themselves. + */ + if (buf_f->blf_flags & XFS_BLF_CANCEL) { + if (xlog_put_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len)) + goto cancelled; + } else { + + if (xlog_is_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len)) + goto cancelled; + } + + trace_xfs_log_recover_buf_recover(log, buf_f); + + buf_flags = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + buf_flags |= XBF_UNMAPPED; + + error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags, &bp, NULL); + if (error) + return error; + + /* + * Recover the buffer only if we get an LSN from it and it's less than + * the lsn of the transaction we are replaying. + * + * Note that we have to be extremely careful of readahead here. + * Readahead does not attach verifiers to the buffers, so if we don't + * actually do any replay after readahead because the LSN we found + * in the buffer is more recent than the current transaction, then we + * need to attach the verifier directly. Failure to do so means + * future recovery actions (e.g. EFI and unlinked list recovery) can + * operate on the buffers without ever getting the verifier attached. This + * can lead to blocks on disk having the correct content but a stale + * CRC. + * + * It is safe to assume these clean buffers are currently up to date. + * If the buffer is dirtied by a later transaction being replayed, then + * the verifier will be reset to match whatever recovery turns that + * buffer into. + */ + lsn = xlog_recover_get_buf_lsn(mp, bp); + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_buf_skip(log, buf_f); + xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); + goto out_release; + } + + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); + if (error) + goto out_release; + } else if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + bool dirty; + + dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); + if (!dirty) + goto out_release; + } else { + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); + } + + /* + * Perform delayed write on the buffer. Asynchronous writes will be + * slower when taking into account all the buffers to be flushed. + * + * Also make sure that only inode buffers with good sizes stay in + * the buffer cache. The kernel moves inodes in buffers of 1 block + * or inode_cluster_size bytes, whichever is bigger. The inode + * buffers in the log can be a different size if the log was generated + * by an older kernel using unclustered inode buffers or a newer kernel + * running with a different inode cluster size. Regardless, if + * the inode buffer size isn't max(blocksize, inode_cluster_size) + * for *our* value of inode_cluster_size, then we need to keep + * the buffer out of the buffer cache so that the buffer won't + * overlap with future reads of those inodes.
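+ * + * (Illustrative numbers, not from the patch: with 4k filesystem blocks and + * an 8k inode_cluster_size, an old log may contain 4k inode buffers; the + * check below writes those out synchronously and marks them stale rather + * than caching them, since a cached 4k buffer would overlap the 8k buffers + * used for later inode reads.)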
+ */ + if (XFS_DINODE_MAGIC == + be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && + (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) { + xfs_buf_stale(bp); + error = xfs_bwrite(bp); + } else { + ASSERT(bp->b_mount == mp); + bp->b_flags |= _XBF_LOGRECOVERY; + xfs_buf_delwri_queue(bp, buffer_list); + } + +out_release: + xfs_buf_relse(bp); + return error; +cancelled: + trace_xfs_log_recover_buf_cancel(log, buf_f); + return 0; +} + +const struct xlog_recover_item_ops xlog_buf_item_ops = { + .item_type = XFS_LI_BUF, + .reorder = xlog_recover_buf_reorder, + .ra_pass2 = xlog_recover_buf_ra_pass2, + .commit_pass1 = xlog_recover_buf_commit_pass1, + .commit_pass2 = xlog_recover_buf_commit_pass2, +}; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 0d3b640cf1cc..66deddd5e296 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -147,7 +147,7 @@ xfs_dir2_block_getdents( xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; int lock_mode; - unsigned int offset; + unsigned int offset, next_offset; unsigned int end; /* @@ -173,9 +173,10 @@ xfs_dir2_block_getdents( * Loop over the data portion of the block. * Each object is a real entry (dep) or an unused one (dup). */ - offset = geo->data_entry_offset; end = xfs_dir3_data_end_offset(geo, bp->b_addr); - while (offset < end) { + for (offset = geo->data_entry_offset; + offset < end; + offset = next_offset) { struct xfs_dir2_data_unused *dup = bp->b_addr + offset; struct xfs_dir2_data_entry *dep = bp->b_addr + offset; uint8_t filetype; @@ -184,14 +185,15 @@ xfs_dir2_block_getdents( * Unused, skip it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - offset += be16_to_cpu(dup->length); + next_offset = offset + be16_to_cpu(dup->length); continue; } /* * Bump pointer for the next iteration. */ - offset += xfs_dir2_data_entsize(dp->i_mount, dep->namelen); + next_offset = offset + + xfs_dir2_data_entsize(dp->i_mount, dep->namelen); /* * The entry is before the desired starting point, skip it. @@ -522,7 +524,7 @@ xfs_readdir( args.geo = dp->i_mount->m_dir_geo; args.trans = tp; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); else if ((rval = xfs_dir2_isblock(&args, &v))) ; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 0b8350e84d28..f979d0d7e6cd 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -31,6 +31,7 @@ xfs_trim_extents( struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; struct xfs_buf *agbp; + struct xfs_agf *agf; struct xfs_perag *pag; int error; int i; @@ -47,14 +48,14 @@ xfs_trim_extents( error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); if (error) goto out_put_perag; + agf = agbp->b_addr; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); /* * Look up the longest btree in the AGF and start with it. 
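* The by-count btree (the XFS_BTNUM_CNT cursor above) is keyed by extent * length, so a >= lookup starting from agf_longest positions the cursor on * the largest free extent in the AG.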
*/ - error = xfs_alloc_lookup_ge(cur, 0, - be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); + error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i); if (error) goto out_del_cursor; @@ -75,7 +76,7 @@ xfs_trim_extents( error = -EFSCORRUPTED; goto out_del_cursor; } - ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); + ASSERT(flen <= be32_to_cpu(agf->agf_longest)); /* * use daddr format for all range/len calculations as that is diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index d223e1ae90a6..bcd73b9c2994 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -23,6 +23,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_bmap_btree.h" +#include "xfs_error.h" /* * Lock order: @@ -66,39 +67,61 @@ xfs_qm_dqdestroy( */ void xfs_qm_adjust_dqlimits( - struct xfs_mount *mp, struct xfs_dquot *dq) { + struct xfs_mount *mp = dq->q_mount; struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_disk_dquot *d = &dq->q_core; struct xfs_def_quota *defq; int prealloc = 0; - ASSERT(d->d_id); - defq = xfs_get_defquota(dq, q); + ASSERT(dq->q_id); + defq = xfs_get_defquota(q, xfs_dquot_type(dq)); - if (defq->bsoftlimit && !d->d_blk_softlimit) { - d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit); + if (!dq->q_blk.softlimit) { + dq->q_blk.softlimit = defq->blk.soft; prealloc = 1; } - if (defq->bhardlimit && !d->d_blk_hardlimit) { - d->d_blk_hardlimit = cpu_to_be64(defq->bhardlimit); + if (!dq->q_blk.hardlimit) { + dq->q_blk.hardlimit = defq->blk.hard; prealloc = 1; } - if (defq->isoftlimit && !d->d_ino_softlimit) - d->d_ino_softlimit = cpu_to_be64(defq->isoftlimit); - if (defq->ihardlimit && !d->d_ino_hardlimit) - d->d_ino_hardlimit = cpu_to_be64(defq->ihardlimit); - if (defq->rtbsoftlimit && !d->d_rtb_softlimit) - d->d_rtb_softlimit = cpu_to_be64(defq->rtbsoftlimit); - if (defq->rtbhardlimit && !d->d_rtb_hardlimit) - d->d_rtb_hardlimit = cpu_to_be64(defq->rtbhardlimit); + if (!dq->q_ino.softlimit) + dq->q_ino.softlimit = defq->ino.soft; + if (!dq->q_ino.hardlimit) + dq->q_ino.hardlimit = defq->ino.hard; + if (!dq->q_rtb.softlimit) + dq->q_rtb.softlimit = defq->rtb.soft; + if (!dq->q_rtb.hardlimit) + dq->q_rtb.hardlimit = defq->rtb.hard; if (prealloc) xfs_dquot_set_prealloc_limits(dq); } /* + * Determine if this quota counter is over either limit and set the quota + * timers as appropriate. + */ +static inline void +xfs_qm_adjust_res_timer( + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim) +{ + ASSERT(res->hardlimit == 0 || res->softlimit <= res->hardlimit); + + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (res->timer == 0) + res->timer = ktime_get_real_seconds() + qlim->time; + } else { + if (res->timer == 0) + res->warnings = 0; + else + res->timer = 0; + } +} + +/* * Check the limits and timers of a dquot and start or reset timers * if necessary. 
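* For example, when q_blk.count first exceeds q_blk.softlimit, * xfs_qm_adjust_res_timer() above arms q_blk.timer at "now + qlim->time"; * once usage falls back under the limits the timer is cleared again.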
* This gets called even when quota enforcement is OFF, which makes our @@ -113,91 +136,18 @@ xfs_qm_adjust_dqlimits( */ void xfs_qm_adjust_dqtimers( - struct xfs_mount *mp, - struct xfs_disk_dquot *d) + struct xfs_dquot *dq) { - ASSERT(d->d_id); - -#ifdef DEBUG - if (d->d_blk_hardlimit) - ASSERT(be64_to_cpu(d->d_blk_softlimit) <= - be64_to_cpu(d->d_blk_hardlimit)); - if (d->d_ino_hardlimit) - ASSERT(be64_to_cpu(d->d_ino_softlimit) <= - be64_to_cpu(d->d_ino_hardlimit)); - if (d->d_rtb_hardlimit) - ASSERT(be64_to_cpu(d->d_rtb_softlimit) <= - be64_to_cpu(d->d_rtb_hardlimit)); -#endif - - if (!d->d_btimer) { - if ((d->d_blk_softlimit && - (be64_to_cpu(d->d_bcount) > - be64_to_cpu(d->d_blk_softlimit))) || - (d->d_blk_hardlimit && - (be64_to_cpu(d->d_bcount) > - be64_to_cpu(d->d_blk_hardlimit)))) { - d->d_btimer = cpu_to_be32(ktime_get_real_seconds() + - mp->m_quotainfo->qi_btimelimit); - } else { - d->d_bwarns = 0; - } - } else { - if ((!d->d_blk_softlimit || - (be64_to_cpu(d->d_bcount) <= - be64_to_cpu(d->d_blk_softlimit))) && - (!d->d_blk_hardlimit || - (be64_to_cpu(d->d_bcount) <= - be64_to_cpu(d->d_blk_hardlimit)))) { - d->d_btimer = 0; - } - } + struct xfs_mount *mp = dq->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_def_quota *defq; - if (!d->d_itimer) { - if ((d->d_ino_softlimit && - (be64_to_cpu(d->d_icount) > - be64_to_cpu(d->d_ino_softlimit))) || - (d->d_ino_hardlimit && - (be64_to_cpu(d->d_icount) > - be64_to_cpu(d->d_ino_hardlimit)))) { - d->d_itimer = cpu_to_be32(ktime_get_real_seconds() + - mp->m_quotainfo->qi_itimelimit); - } else { - d->d_iwarns = 0; - } - } else { - if ((!d->d_ino_softlimit || - (be64_to_cpu(d->d_icount) <= - be64_to_cpu(d->d_ino_softlimit))) && - (!d->d_ino_hardlimit || - (be64_to_cpu(d->d_icount) <= - be64_to_cpu(d->d_ino_hardlimit)))) { - d->d_itimer = 0; - } - } + ASSERT(dq->q_id); + defq = xfs_get_defquota(qi, xfs_dquot_type(dq)); - if (!d->d_rtbtimer) { - if ((d->d_rtb_softlimit && - (be64_to_cpu(d->d_rtbcount) > - be64_to_cpu(d->d_rtb_softlimit))) || - (d->d_rtb_hardlimit && - (be64_to_cpu(d->d_rtbcount) > - be64_to_cpu(d->d_rtb_hardlimit)))) { - d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() + - mp->m_quotainfo->qi_rtbtimelimit); - } else { - d->d_rtbwarns = 0; - } - } else { - if ((!d->d_rtb_softlimit || - (be64_to_cpu(d->d_rtbcount) <= - be64_to_cpu(d->d_rtb_softlimit))) && - (!d->d_rtb_hardlimit || - (be64_to_cpu(d->d_rtbcount) <= - be64_to_cpu(d->d_rtb_hardlimit)))) { - d->d_rtbtimer = 0; - } - } + xfs_qm_adjust_res_timer(&dq->q_blk, &defq->blk); + xfs_qm_adjust_res_timer(&dq->q_ino, &defq->ino); + xfs_qm_adjust_res_timer(&dq->q_rtb, &defq->rtb); } /* @@ -205,20 +155,40 @@ xfs_qm_adjust_dqtimers( */ STATIC void xfs_qm_init_dquot_blk( - xfs_trans_t *tp, - xfs_mount_t *mp, - xfs_dqid_t id, - uint type, - xfs_buf_t *bp) + struct xfs_trans *tp, + struct xfs_mount *mp, + xfs_dqid_t id, + xfs_dqtype_t type, + struct xfs_buf *bp) { struct xfs_quotainfo *q = mp->m_quotainfo; - xfs_dqblk_t *d; - xfs_dqid_t curid; - int i; + struct xfs_dqblk *d; + xfs_dqid_t curid; + unsigned int qflag; + unsigned int blftype; + int i; ASSERT(tp); ASSERT(xfs_buf_islocked(bp)); + switch (type) { + case XFS_DQTYPE_USER: + qflag = XFS_UQUOTA_CHKD; + blftype = XFS_BLF_UDQUOT_BUF; + break; + case XFS_DQTYPE_PROJ: + qflag = XFS_PQUOTA_CHKD; + blftype = XFS_BLF_PDQUOT_BUF; + break; + case XFS_DQTYPE_GROUP: + qflag = XFS_GQUOTA_CHKD; + blftype = XFS_BLF_GDQUOT_BUF; + break; + default: + ASSERT(0); + return; + } + d = bp->b_addr; /* @@ -230,7 +200,7 @@ 
xfs_qm_init_dquot_blk( d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); - d->dd_diskdq.d_flags = type; + d->dd_diskdq.d_type = type; if (xfs_sb_version_hascrc(&mp->m_sb)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), @@ -238,11 +208,28 @@ xfs_qm_init_dquot_blk( } } - xfs_trans_dquot_buf(tp, bp, - (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : - ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : - XFS_BLF_GDQUOT_BUF))); - xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); + xfs_trans_dquot_buf(tp, bp, blftype); + + /* + * quotacheck uses delayed writes to update all the dquots on disk in an + * efficient manner instead of logging the individual dquot changes as + * they are made. However if we log the buffer allocated here and crash + * after quotacheck while the logged initialisation is still in the + * active region of the log, log recovery can replay the dquot buffer + * initialisation over the top of the checked dquots and corrupt quota + * accounting. + * + * To avoid this problem, quotacheck cannot log the initialised buffer. + * We must still dirty the buffer and write it back before the + * allocation transaction clears the log. Therefore, mark the buffer as + * ordered instead of logging it directly. This is safe for quotacheck + * because it detects and repairs allocated but uninitialized dquot blocks + * in the quota inodes. + */ + if (!(mp->m_qflags & qflag)) + xfs_trans_ordered_buf(tp, bp); + else + xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } /* @@ -255,8 +242,8 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) { uint64_t space; - dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); - dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit); + dqp->q_prealloc_hi_wmark = dqp->q_blk.hardlimit; + dqp->q_prealloc_lo_wmark = dqp->q_blk.softlimit; if (!dqp->q_prealloc_lo_wmark) { dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; do_div(dqp->q_prealloc_lo_wmark, 100); @@ -286,14 +273,15 @@ xfs_dquot_disk_alloc( struct xfs_trans *tp = *tpp; struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp; - struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); + xfs_dqtype_t qtype = xfs_dquot_type(dqp); + struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); int nmaps = 1; int error; trace_xfs_dqalloc(dqp); xfs_ilock(quotip, XFS_ILOCK_EXCL); - if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + if (!xfs_this_quota_on(dqp->q_mount, qtype)) { /* * Return if this type of quota was turned off while we didn't * hold the inode lock @@ -330,8 +318,7 @@ xfs_dquot_disk_alloc( * Make a chunk of dquots out of this buffer and log * the entire thing. */ - xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), - dqp->dq_flags & XFS_DQ_ALLTYPES, bp); + xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp); xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* @@ -378,13 +365,14 @@ xfs_dquot_disk_read( { struct xfs_bmbt_irec map; struct xfs_buf *bp; - struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); + xfs_dqtype_t qtype = xfs_dquot_type(dqp); + struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); uint lock_mode; int nmaps = 1; int error; lock_mode = xfs_ilock_data_map_shared(quotip); - if (!xfs_this_quota_on(mp, dqp->dq_flags)) { + if (!xfs_this_quota_on(mp, qtype)) { /* * Return if this type of quota was turned off while we * didn't hold the quota inode lock.
@@ -436,14 +424,14 @@ STATIC struct xfs_dquot * xfs_dquot_alloc( struct xfs_mount *mp, xfs_dqid_t id, - uint type) + xfs_dqtype_t type) { struct xfs_dquot *dqp; - dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0); + dqp = kmem_cache_zalloc(xfs_qm_dqzone, GFP_KERNEL | __GFP_NOFAIL); - dqp->dq_flags = type; - dqp->q_core.d_id = cpu_to_be32(id); + dqp->q_type = type; + dqp->q_id = id; dqp->q_mount = mp; INIT_LIST_HEAD(&dqp->q_lru); mutex_init(&dqp->q_qlock); @@ -468,13 +456,13 @@ xfs_dquot_alloc( * quotas. */ switch (type) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: /* uses the default lock class */ break; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class); break; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class); break; default: @@ -489,26 +477,91 @@ xfs_dquot_alloc( } /* Copy the in-core quota fields in from the on-disk buffer. */ -STATIC void +STATIC int xfs_dquot_from_disk( struct xfs_dquot *dqp, struct xfs_buf *bp) { struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; + /* + * Ensure that we got the type and ID we were looking for. + * Everything else was checked by the dquot buffer verifier. + */ + if ((ddqp->d_type & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) || + be32_to_cpu(ddqp->d_id) != dqp->q_id) { + xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR, + "Metadata corruption detected at %pS, quota %u", + __this_address, dqp->q_id); + xfs_alert(bp->b_mount, "Unmount and run xfs_repair"); + return -EFSCORRUPTED; + } + /* copy everything from disk dquot to the incore dquot */ - memcpy(&dqp->q_core, ddqp, sizeof(struct xfs_disk_dquot)); + dqp->q_type = ddqp->d_type; + dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); + dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit); + dqp->q_ino.hardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); + dqp->q_ino.softlimit = be64_to_cpu(ddqp->d_ino_softlimit); + dqp->q_rtb.hardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); + dqp->q_rtb.softlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + + dqp->q_blk.count = be64_to_cpu(ddqp->d_bcount); + dqp->q_ino.count = be64_to_cpu(ddqp->d_icount); + dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount); + + dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns); + dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns); + dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns); + + dqp->q_blk.timer = be32_to_cpu(ddqp->d_btimer); + dqp->q_ino.timer = be32_to_cpu(ddqp->d_itimer); + dqp->q_rtb.timer = be32_to_cpu(ddqp->d_rtbtimer); /* * Reservation counters are defined as reservation plus current usage * to avoid having to add every time. */ - dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); - dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); - dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); + dqp->q_blk.reserved = dqp->q_blk.count; + dqp->q_ino.reserved = dqp->q_ino.count; + dqp->q_rtb.reserved = dqp->q_rtb.count; /* initialize the dquot speculative prealloc thresholds */ xfs_dquot_set_prealloc_limits(dqp); + return 0; +} + +/* Copy the in-core quota fields into the on-disk buffer. 
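+ * This is the inverse of xfs_dquot_from_disk() above: each limit, counter, + * warning count and timer is converted back to its big-endian on-disk form.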
*/ +void +xfs_dquot_to_disk( + struct xfs_disk_dquot *ddqp, + struct xfs_dquot *dqp) +{ + ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + ddqp->d_version = XFS_DQUOT_VERSION; + ddqp->d_type = dqp->q_type; + ddqp->d_id = cpu_to_be32(dqp->q_id); + ddqp->d_pad0 = 0; + ddqp->d_pad = 0; + + ddqp->d_blk_hardlimit = cpu_to_be64(dqp->q_blk.hardlimit); + ddqp->d_blk_softlimit = cpu_to_be64(dqp->q_blk.softlimit); + ddqp->d_ino_hardlimit = cpu_to_be64(dqp->q_ino.hardlimit); + ddqp->d_ino_softlimit = cpu_to_be64(dqp->q_ino.softlimit); + ddqp->d_rtb_hardlimit = cpu_to_be64(dqp->q_rtb.hardlimit); + ddqp->d_rtb_softlimit = cpu_to_be64(dqp->q_rtb.softlimit); + + ddqp->d_bcount = cpu_to_be64(dqp->q_blk.count); + ddqp->d_icount = cpu_to_be64(dqp->q_ino.count); + ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count); + + ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings); + ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings); + ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings); + + ddqp->d_btimer = cpu_to_be32(dqp->q_blk.timer); + ddqp->d_itimer = cpu_to_be32(dqp->q_ino.timer); + ddqp->d_rtbtimer = cpu_to_be32(dqp->q_rtb.timer); } /* Allocate and initialize the dquot buffer for this in-core dquot. */ @@ -557,7 +610,7 @@ static int xfs_qm_dqread( struct xfs_mount *mp, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **dqpp) { @@ -582,9 +635,11 @@ xfs_qm_dqread( * further. */ ASSERT(xfs_buf_islocked(bp)); - xfs_dquot_from_disk(dqp, bp); - + error = xfs_dquot_from_disk(dqp, bp); xfs_buf_relse(bp); + if (error) + goto err; + *dqpp = dqp; return error; @@ -603,7 +658,7 @@ err: static int xfs_dq_get_next_id( struct xfs_mount *mp, - uint type, + xfs_dqtype_t type, xfs_dqid_t *id) { struct xfs_inode *quotip = xfs_quota_inode(mp, type); @@ -671,7 +726,7 @@ restart: } xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_FREEING) { + if (dqp->q_flags & XFS_DQFLAG_FREEING) { xfs_dqunlock(dqp); mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_freeing(dqp); @@ -727,21 +782,21 @@ xfs_qm_dqget_cache_insert( static int xfs_qm_dqget_checks( struct xfs_mount *mp, - uint type) + xfs_dqtype_t type) { if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp))) return -ESRCH; switch (type) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: if (!XFS_IS_UQUOTA_ON(mp)) return -ESRCH; return 0; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: if (!XFS_IS_GQUOTA_ON(mp)) return -ESRCH; return 0; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: if (!XFS_IS_PQUOTA_ON(mp)) return -ESRCH; return 0; @@ -752,14 +807,14 @@ xfs_qm_dqget_checks( } /* - * Given the file system, id, and type (UDQUOT/GDQUOT), return a a locked + * Given the file system, id, and type (UDQUOT/GDQUOT), return a locked * dquot, doing an allocation (if requested) as needed. 
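* Cached dquots are kept in a per-mount tree under qi_tree_lock; the lookup * path above retries when it races with a dquot that is being freed * (XFS_DQFLAG_FREEING).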
*/ int xfs_qm_dqget( struct xfs_mount *mp, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **O_dqpp) { @@ -809,7 +864,7 @@ int xfs_qm_dqget_uncached( struct xfs_mount *mp, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, struct xfs_dquot **dqpp) { int error; @@ -825,14 +880,14 @@ xfs_qm_dqget_uncached( xfs_dqid_t xfs_qm_id_for_quotatype( struct xfs_inode *ip, - uint type) + xfs_dqtype_t type) { switch (type) { - case XFS_DQ_USER: - return ip->i_d.di_uid; - case XFS_DQ_GROUP: - return ip->i_d.di_gid; - case XFS_DQ_PROJ: + case XFS_DQTYPE_USER: + return i_uid_read(VFS_I(ip)); + case XFS_DQTYPE_GROUP: + return i_gid_read(VFS_I(ip)); + case XFS_DQTYPE_PROJ: return ip->i_d.di_projid; } ASSERT(0); @@ -847,7 +902,7 @@ xfs_qm_id_for_quotatype( int xfs_qm_dqget_inode( struct xfs_inode *ip, - uint type, + xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **O_dqpp) { @@ -933,7 +988,7 @@ int xfs_qm_dqget_next( struct xfs_mount *mp, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, struct xfs_dquot **dqpp) { struct xfs_dquot *dqp; @@ -1013,14 +1068,14 @@ xfs_qm_dqrele( * from the AIL if it has not been re-logged, and unlocking the dquot's * flush lock. This behavior is very similar to that of inodes. */ -STATIC void +static void xfs_qm_dqflush_done( - struct xfs_buf *bp, struct xfs_log_item *lip) { struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; struct xfs_dquot *dqp = qip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; + xfs_lsn_t tail_lsn; /* * We only want to pull the item from the AIL if its @@ -1034,16 +1089,13 @@ xfs_qm_dqflush_done( ((lip->li_lsn == qip->qli_flush_lsn) || test_bit(XFS_LI_FAILED, &lip->li_flags))) { - /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->ail_lock); + xfs_clear_li_failed(lip); if (lip->li_lsn == qip->qli_flush_lsn) { - xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); + /* xfs_ail_update_finish() drops the AIL lock */ + tail_lsn = xfs_ail_delete_one(ailp, lip); + xfs_ail_update_finish(ailp, tail_lsn); } else { - /* - * Clear the failed state since we are about to drop the - * flush lock - */ - xfs_clear_li_failed(lip); spin_unlock(&ailp->ail_lock); } } @@ -1054,6 +1106,48 @@ xfs_qm_dqflush_done( xfs_dqfunlock(dqp); } +void +xfs_dquot_done( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip, *n; + + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { + list_del_init(&lip->li_bio_list); + xfs_qm_dqflush_done(lip); + } +} + +/* Check incore dquot for errors before we flush. */ +static xfs_failaddr_t +xfs_qm_dqflush_check( + struct xfs_dquot *dqp) +{ + xfs_dqtype_t type = xfs_dquot_type(dqp); + + if (type != XFS_DQTYPE_USER && + type != XFS_DQTYPE_GROUP && + type != XFS_DQTYPE_PROJ) + return __this_address; + + if (dqp->q_id == 0) + return NULL; + + if (dqp->q_blk.softlimit && dqp->q_blk.count > dqp->q_blk.softlimit && + !dqp->q_blk.timer) + return __this_address; + + if (dqp->q_ino.softlimit && dqp->q_ino.count > dqp->q_ino.softlimit && + !dqp->q_ino.timer) + return __this_address; + + if (dqp->q_rtb.softlimit && dqp->q_rtb.count > dqp->q_rtb.softlimit && + !dqp->q_rtb.timer) + return __this_address; + + return NULL; +} + /* * Write a modified dquot to disk. * The dquot must be locked and the flush lock held by the caller.
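* On success the locked buffer is returned to the caller via *bpp; the * flush lock is only released once buffer I/O completion has run * xfs_qm_dqflush_done() for the attached log item.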
@@ -1068,9 +1162,9 @@ xfs_qm_dqflush( struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; + struct xfs_log_item *lip = &dqp->q_logitem.qli_item; struct xfs_buf *bp; - struct xfs_dqblk *dqb; - struct xfs_disk_dquot *ddqp; + struct xfs_dqblk *dqblk; xfs_failaddr_t fa; int error; @@ -1084,58 +1178,33 @@ xfs_qm_dqflush( xfs_qm_dqunpin_wait(dqp); /* - * This may have been unpinned because the filesystem is shutting - * down forcibly. If that's the case we must not write this dquot - * to disk, because the log record didn't make it to disk. - * - * We also have to remove the log item from the AIL in this case, - * as we wait for an emptry AIL as part of the unmount process. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - struct xfs_log_item *lip = &dqp->q_logitem.qli_item; - dqp->dq_flags &= ~XFS_DQ_DIRTY; - - xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE); - - error = -EIO; - goto out_unlock; - } - - /* * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp, - &xfs_dquot_buf_ops); - if (error) + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, + &bp, &xfs_dquot_buf_ops); + if (error == -EAGAIN) goto out_unlock; + if (error) + goto out_abort; - /* - * Calculate the location of the dquot inside the buffer. - */ - dqb = bp->b_addr + dqp->q_bufoffset; - ddqp = &dqb->dd_diskdq; - - /* - * A simple sanity check in case we got a corrupted dquot. - */ - fa = xfs_dqblk_verify(mp, dqb, be32_to_cpu(ddqp->d_id), 0); + fa = xfs_qm_dqflush_check(dqp); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", - be32_to_cpu(ddqp->d_id), fa); + dqp->q_id, fa); xfs_buf_relse(bp); - xfs_dqfunlock(dqp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return -EFSCORRUPTED; + error = -EFSCORRUPTED; + goto out_abort; } - /* This is the only portion of data that needs to persist */ - memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot)); + /* Flush the incore dquot to the ondisk buffer. */ + dqblk = bp->b_addr + dqp->q_bufoffset; + xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp); /* * Clear the dirty field and remember the flush lsn for later use. */ - dqp->dq_flags &= ~XFS_DQ_DIRTY; + dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, &dqp->q_logitem.qli_item.li_lsn); @@ -1150,17 +1219,17 @@ xfs_qm_dqflush( * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_sb_version_hascrc(&mp->m_sb)) { - dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); - xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), + dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } /* - * Attach an iodone routine so that we can remove this dquot from the - * AIL and release the flush lock once the dquot is synced to disk. + * Attach the dquot to the buffer so that we can remove this dquot from + * the AIL and release the flush lock once the dquot is synced to disk. 
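+ * Marking the buffer _XBF_DQUOTS makes buffer I/O completion walk b_li_list + * and call xfs_qm_dqflush_done() for each dquot log item attached there + * (see xfs_dquot_done() above).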
*/ - xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, - &dqp->q_logitem.qli_item); + bp->b_flags |= _XBF_DQUOTS; + list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list); /* * If the buffer is pinned then push on the log so we won't @@ -1175,9 +1244,13 @@ xfs_qm_dqflush( *bpp = bp; return 0; +out_abort: + dqp->q_flags &= ~XFS_DQFLAG_DIRTY; + xfs_trans_ail_delete(lip, 0); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); out_unlock: xfs_dqfunlock(dqp); - return -EIO; + return error; } /* @@ -1193,8 +1266,7 @@ xfs_dqlock2( { if (d1 && d2) { ASSERT(d1 != d2); - if (be32_to_cpu(d1->q_core.d_id) > - be32_to_cpu(d2->q_core.d_id)) { + if (d1->q_id > d2->q_id) { mutex_lock(&d2->q_qlock); mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED); } else { @@ -1246,7 +1318,7 @@ xfs_qm_exit(void) int xfs_qm_dqiterate( struct xfs_mount *mp, - uint dqtype, + xfs_dqtype_t type, xfs_qm_dqiterate_fn iter_fn, void *priv) { @@ -1255,16 +1327,15 @@ xfs_qm_dqiterate( int error; do { - error = xfs_qm_dqget_next(mp, id, dqtype, &dq); + error = xfs_qm_dqget_next(mp, id, type, &dq); if (error == -ENOENT) return 0; if (error) return error; - error = iter_fn(dq, dqtype, priv); - id = be32_to_cpu(dq->q_core.d_id); + error = iter_fn(dq, type, priv); + id = dq->q_id; xfs_qm_dqput(dq); - id++; } while (error == 0 && id != 0); return error; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index fe3e46df604b..282a65da93c7 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -27,26 +27,53 @@ enum { XFS_QLOWSP_MAX }; +struct xfs_dquot_res { + /* Total resources allocated and reserved. */ + xfs_qcnt_t reserved; + + /* Total resources allocated. */ + xfs_qcnt_t count; + + /* Absolute and preferred limits. */ + xfs_qcnt_t hardlimit; + xfs_qcnt_t softlimit; + + /* + * For root dquots, this is the default grace period, in seconds. + * Otherwise, this is when the quota grace period expires, + * in seconds since the Unix epoch. + */ + time64_t timer; + + /* + * For root dquots, this is the maximum number of warnings that will + * be issued for this quota type. Otherwise, this is the number of + * warnings issued against this quota. Note that none of this is + * implemented. 
+ */ + xfs_qwarncnt_t warnings; +}; + /* * The incore dquot structure */ struct xfs_dquot { - uint dq_flags; struct list_head q_lru; struct xfs_mount *q_mount; + xfs_dqtype_t q_type; + uint16_t q_flags; + xfs_dqid_t q_id; uint q_nrefs; - xfs_daddr_t q_blkno; int q_bufoffset; + xfs_daddr_t q_blkno; xfs_fileoff_t q_fileoffset; - struct xfs_disk_dquot q_core; + struct xfs_dquot_res q_blk; /* regular blocks */ + struct xfs_dquot_res q_ino; /* inodes */ + struct xfs_dquot_res q_rtb; /* realtime blocks */ + struct xfs_dq_logitem q_logitem; - /* total regular nblks used+reserved */ - xfs_qcnt_t q_res_bcount; - /* total inos allocd+reserved */ - xfs_qcnt_t q_res_icount; - /* total realtime blks used+reserved */ - xfs_qcnt_t q_res_rtbcount; + xfs_qcnt_t q_prealloc_lo_wmark; xfs_qcnt_t q_prealloc_hi_wmark; int64_t q_low_space[XFS_QLOWSP_MAX]; @@ -101,34 +128,59 @@ static inline void xfs_dqunlock(struct xfs_dquot *dqp) mutex_unlock(&dqp->q_qlock); } -static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) +static inline int +xfs_dquot_type(const struct xfs_dquot *dqp) { - switch (type & XFS_DQ_ALLTYPES) { - case XFS_DQ_USER: + return dqp->q_type & XFS_DQTYPE_REC_MASK; +} + +static inline int xfs_this_quota_on(struct xfs_mount *mp, xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: return XFS_IS_UQUOTA_ON(mp); - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return XFS_IS_GQUOTA_ON(mp); - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return XFS_IS_PQUOTA_ON(mp); default: return 0; } } -static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type) +static inline struct xfs_dquot *xfs_inode_dquot( + struct xfs_inode *ip, + xfs_dqtype_t type) { - switch (type & XFS_DQ_ALLTYPES) { - case XFS_DQ_USER: + switch (type) { + case XFS_DQTYPE_USER: return ip->i_udquot; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return ip->i_gdquot; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return ip->i_pdquot; default: return NULL; } } +/* Decide if the dquot's limits are actually being enforced. */ +static inline bool +xfs_dquot_is_enforced( + const struct xfs_dquot *dqp) +{ + switch (xfs_dquot_type(dqp)) { + case XFS_DQTYPE_USER: + return XFS_IS_UQUOTA_ENFORCED(dqp->q_mount); + case XFS_DQTYPE_GROUP: + return XFS_IS_GQUOTA_ENFORCED(dqp->q_mount); + case XFS_DQTYPE_PROJ: + return XFS_IS_PQUOTA_ENFORCED(dqp->q_mount); + } + ASSERT(0); + return false; +} + /* * Check whether a dquot is under low free space conditions. We assume the quota * is enabled and enforced. 
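* "Low" here means that less than q_low_space[XFS_QLOWSP_1_PCNT] remains * between the blocks already reserved (q_blk.reserved) and the block hard * limit, as computed in the body below.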
@@ -137,38 +189,35 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) { int64_t freesp; - freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount; + freesp = dqp->q_blk.hardlimit - dqp->q_blk.reserved; if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT]) return true; return false; } +void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); + #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) -#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) -#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) -#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) -#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) +#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) void xfs_qm_dqdestroy(struct xfs_dquot *dqp); int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); -void xfs_qm_adjust_dqtimers(struct xfs_mount *mp, - struct xfs_disk_dquot *d); -void xfs_qm_adjust_dqlimits(struct xfs_mount *mp, - struct xfs_dquot *d); -xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, uint type); +void xfs_qm_adjust_dqtimers(struct xfs_dquot *d); +void xfs_qm_adjust_dqlimits(struct xfs_dquot *d); +xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, + xfs_dqtype_t type); int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, - uint type, bool can_alloc, - struct xfs_dquot **dqpp); -int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type, - bool can_alloc, - struct xfs_dquot **dqpp); + xfs_dqtype_t type, bool can_alloc, + struct xfs_dquot **dqpp); +int xfs_qm_dqget_inode(struct xfs_inode *ip, xfs_dqtype_t type, + bool can_alloc, struct xfs_dquot **dqpp); int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, - uint type, struct xfs_dquot **dqpp); + xfs_dqtype_t type, struct xfs_dquot **dqpp); int xfs_qm_dqget_uncached(struct xfs_mount *mp, - xfs_dqid_t id, uint type, - struct xfs_dquot **dqpp); + xfs_dqid_t id, xfs_dqtype_t type, + struct xfs_dquot **dqpp); void xfs_qm_dqput(struct xfs_dquot *dqp); void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); @@ -183,9 +232,9 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } -typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, uint dqtype, - void *priv); -int xfs_qm_dqiterate(struct xfs_mount *mp, uint dqtype, +typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, + xfs_dqtype_t type, void *priv); +int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type, xfs_qm_dqiterate_fn iter_fn, void *priv); #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index d60647d7197b..8c1fdf37ee8f 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -45,6 +45,7 @@ xfs_qm_dquot_logitem_format( struct xfs_log_item *lip, struct xfs_log_vec *lv) { + struct xfs_disk_dquot ddq; struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); struct xfs_log_iovec *vecp = NULL; struct xfs_dq_logformat *qlf; @@ -52,14 +53,15 @@ xfs_qm_dquot_logitem_format( qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT); qlf->qlf_type = XFS_LI_DQUOT; qlf->qlf_size = 2; - qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id); + qlf->qlf_id = qlip->qli_dquot->q_id; qlf->qlf_blkno = qlip->qli_dquot->q_blkno; qlf->qlf_len = 1; qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset; xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat)); - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, - &qlip->qli_dquot->q_core, + xfs_dquot_to_disk(&ddq, qlip->qli_dquot); + + 
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, &ddq, sizeof(struct xfs_disk_dquot)); } @@ -113,23 +115,6 @@ xfs_qm_dqunpin_wait( wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } -/* - * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer - * have been failed during writeback - * - * this informs the AIL that the dquot is already flush locked on the next push, - * and acquires a hold on the buffer to ensure that it isn't reclaimed before - * dirty data makes it to disk. - */ -STATIC void -xfs_dquot_item_error( - struct xfs_log_item *lip, - struct xfs_buf *bp) -{ - ASSERT(!completion_done(&DQUOT_ITEM(lip)->qli_dquot->q_flush)); - xfs_set_li_failed(lip, bp); -} - STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, @@ -145,21 +130,6 @@ xfs_qm_dquot_logitem_push( if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; - /* - * The buffer containing this item failed to be written back - * previously. Resubmit the buffer for IO - */ - if (test_bit(XFS_LI_FAILED, &lip->li_flags)) { - if (!xfs_buf_trylock(bp)) - return XFS_ITEM_LOCKED; - - if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list)) - rval = XFS_ITEM_FLUSHING; - - xfs_buf_unlock(bp); - return rval; - } - if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; @@ -189,7 +159,8 @@ xfs_qm_dquot_logitem_push( if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } + } else if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; spin_lock(&lip->li_ailp->ail_lock); out_unlock: @@ -230,7 +201,6 @@ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_release = xfs_qm_dquot_logitem_release, .iop_committing = xfs_qm_dquot_logitem_committing, .iop_push = xfs_qm_dquot_logitem_push, - .iop_error = xfs_dquot_item_error }; /* @@ -307,36 +277,62 @@ xfs_qm_qoffend_logitem_committed( { struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; - struct xfs_ail *ailp = qfs->qql_item.li_ailp; - /* - * Delete the qoff-start logitem from the AIL. - * xfs_trans_ail_delete() drops the AIL lock. - */ - spin_lock(&ailp->ail_lock); - xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); + xfs_qm_qoff_logitem_relse(qfs); - kmem_free(qfs->qql_item.li_lv_shadow); kmem_free(lip->li_lv_shadow); - kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; } +STATIC void +xfs_qm_qoff_logitem_release( + struct xfs_log_item *lip) +{ + struct xfs_qoff_logitem *qoff = QOFF_ITEM(lip); + + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { + if (qoff->qql_start_lip) + xfs_qm_qoff_logitem_relse(qoff->qql_start_lip); + xfs_qm_qoff_logitem_relse(qoff); + } +} + static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_committed = xfs_qm_qoffend_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, + .iop_release = xfs_qm_qoff_logitem_release, }; static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_push = xfs_qm_qoff_logitem_push, + .iop_release = xfs_qm_qoff_logitem_release, }; /* + * Delete the quotaoff intent from the AIL and free it. On success, + * this should only be called for the start item. It can be used + * either on shutdown or abort.
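+ * + * The caller must guarantee that the item is either in the AIL, marked + * aborted, or sits on an already shut down filesystem, as the ASSERT below + * verifies.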
+ */ +void +xfs_qm_qoff_logitem_relse( + struct xfs_qoff_logitem *qoff) +{ + struct xfs_log_item *lip = &qoff->qql_item; + + ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) || + test_bit(XFS_LI_ABORTED, &lip->li_flags) || + XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_trans_ail_delete(lip, 0); + kmem_free(lip->li_lv_shadow); + kmem_free(qoff); +} + +/* * Allocate and initialize a quotaoff item of the correct quota type(s). */ struct xfs_qoff_logitem * diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 3bb19e556ade..2b86a43d7ce2 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -28,6 +28,7 @@ void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp); struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp, struct xfs_qoff_logitem *start, uint flags); +void xfs_qm_qoff_logitem_relse(struct xfs_qoff_logitem *); struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp, struct xfs_qoff_logitem *startqoff, uint flags); diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c new file mode 100644 index 000000000000..5875c7e1bd28 --- /dev/null +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" + +STATIC void +xlog_recover_dquot_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_disk_dquot *recddq; + struct xfs_dq_logformat *dq_f; + uint type; + + if (mp->m_qflags == 0) + return; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) + return; + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) + return; + + type = recddq->d_type & XFS_DQTYPE_REC_MASK; + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return; + + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + ASSERT(dq_f->qlf_len == 1); + + xlog_buf_readahead(log, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), + &xfs_dquot_buf_ra_ops); +} + +/* + * Recover a dquot record + */ +STATIC int +xlog_recover_dquot_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp; + struct xfs_disk_dquot *ddq, *recddq; + struct xfs_dq_logformat *dq_f; + xfs_failaddr_t fa; + int error; + uint type; + + /* + * Filesystems are required to send in quota flags at mount time. + */ + if (mp->m_qflags == 0) + return 0; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) { + xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); + return -EFSCORRUPTED; + } + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) { + xfs_alert(log->l_mp, "dquot too small (%d) in %s.", + item->ri_buf[1].i_len, __func__); + return -EFSCORRUPTED; + } + + /* + * This type of quota was turned off, so ignore this record. + */ + type = recddq->d_type & XFS_DQTYPE_REC_MASK; + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return 0; + + /* + * At this point we know that quota was _not_ turned off.
+ * Since the mount flags are not indicating to us otherwise, this + * must mean that quota is on, and the dquot needs to be replayed. + * Remember that we may not have fully recovered the superblock yet, + * so we can't do the usual trick of looking at the SB quota bits. + * + * The other possibility, of course, is that the quota subsystem was + * removed since the last mount - ENOSYS. + */ + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id); + if (fa) { + xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", + dq_f->qlf_id, fa); + return -EFSCORRUPTED; + } + ASSERT(dq_f->qlf_len == 1); + + /* + * At this point we are assuming that the dquots have been allocated + * and hence the buffer has valid dquots stamped in it. It should, + * therefore, pass verifier validation. If the dquot is bad, then + * we'll return an error here, so we don't need to specifically check + * the dquot in the buffer after the verifier has run. + */ + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + &xfs_dquot_buf_ops); + if (error) + return error; + + ASSERT(bp); + ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); + + /* + * If the dquot has an LSN in it, recover the dquot only if it's less + * than the lsn of the transaction we are replaying. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; + xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + goto out_release; + } + } + + memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + ASSERT(dq_f->qlf_size == 2); + ASSERT(bp->b_mount == mp); + bp->b_flags |= _XBF_LOGRECOVERY; + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: + xfs_buf_relse(bp); + return 0; +} + +const struct xlog_recover_item_ops xlog_dquot_item_ops = { + .item_type = XFS_LI_DQUOT, + .ra_pass2 = xlog_recover_dquot_ra_pass2, + .commit_pass2 = xlog_recover_dquot_commit_pass2, +}; + +/* + * Recover QUOTAOFF records. We simply make a note of it in the xlog + * structure, so that we know not to do any dquot item or dquot buffer recovery + * of that type. + */ +STATIC int +xlog_recover_quotaoff_commit_pass1( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].i_addr; + ASSERT(qoff_f); + + /* + * The logitem format's flag tells us if this was user quotaoff, + * group/project quotaoff or both.
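The flag-noting step that closes this comment is pass 1; pass 2 (xlog_recover_dquot_commit_pass2 above) is the consumer. The interplay can be modelled in a few lines of user-space C. The sketch below uses invented flag names as stand-ins for XFS_DQTYPE_* and log->l_quotaoffs_flag; it is an illustration of the two-pass idea, not kernel code:

#include <stdio.h>

/* Illustrative stand-ins for the quota-type bits. */
#define Q_USER  (1u << 0)
#define Q_PROJ  (1u << 1)
#define Q_GROUP (1u << 2)

static unsigned int quotaoffs_flag;	/* models log->l_quotaoffs_flag */

/* Pass 1: remember which quota types were switched off. */
static void pass1_quotaoff(unsigned int qf_flags)
{
	quotaoffs_flag |= qf_flags;
}

/* Pass 2: replay a dquot only if its type was not switched off. */
static void pass2_dquot(unsigned int type)
{
	if (quotaoffs_flag & type)
		printf("type %u: quotaoff seen, skipping record\n", type);
	else
		printf("type %u: replaying record\n", type);
}

int main(void)
{
	pass1_quotaoff(Q_GROUP);
	pass2_dquot(Q_USER);	/* replayed */
	pass2_dquot(Q_GROUP);	/* skipped */
	return 0;
}

Because all quotaoff records are handled in pass 1 and all dquot records in pass 2, the flag word is always fully populated before any dquot is considered.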
+ */ + if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQTYPE_USER; + if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQTYPE_PROJ; + if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQTYPE_GROUP; + + return 0; +} + +const struct xlog_recover_item_ops xlog_quotaoff_item_ops = { + .item_type = XFS_LI_QUOTAOFF, + .commit_pass1 = xlog_recover_quotaoff_commit_pass1, + /* nothing to commit in pass2 */ +}; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 331765afc53e..7f6e20899473 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -53,6 +53,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_FORCE_SCRUB_REPAIR, XFS_RANDOM_FORCE_SUMMARY_RECALC, XFS_RANDOM_IUNLINK_FALLBACK, + XFS_RANDOM_BUF_IOERROR, }; struct xfs_errortag_attr { @@ -162,6 +163,7 @@ XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); +XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -199,6 +201,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(force_repair), XFS_ERRORTAG_ATTR_LIST(bad_summary), XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), + XFS_ERRORTAG_ATTR_LIST(buf_ioerror), NULL, }; @@ -345,16 +348,19 @@ xfs_corruption_error( * Complain about the kinds of metadata corruption that we can't detect from a * verifier, such as incorrect inter-block relationship data. Does not set * bp->b_error. + * + * Call xfs_buf_mark_corrupt, not this function. */ void xfs_buf_corruption_error( - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_failaddr_t fa) { struct xfs_mount *mp = bp->b_mount; xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, "Metadata corruption detected at %pS, %s block 0x%llx", - __return_address, bp->b_ops->name, bp->b_bn); + fa, bp->b_ops->name, bp->b_bn); xfs_alert(mp, "Unmount and run xfs_repair"); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 31a5d321ba9a..1717b7508356 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -15,7 +15,7 @@ extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, const void *buf, size_t bufsize, const char *filename, int linenum, xfs_failaddr_t failaddr); -void xfs_buf_corruption_error(struct xfs_buf *bp); +void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa); extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name, const void *buf, size_t bufsz, xfs_failaddr_t failaddr); diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index f1372f9046e3..465fd9e048d4 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -15,7 +15,6 @@ #include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_icache.h" -#include "xfs_log.h" #include "xfs_pnfs.h" /* @@ -57,7 +56,7 @@ xfs_fs_encode_fh( fileid_type = FILEID_INO32_GEN_PARENT; /* - * If the the filesystem may contain 64bit inode numbers, we need + * If the filesystem may contain 64bit inode numbers, we need * to use larger file handles that can represent them. 
* * While we only allocate inodes that do not fit into 32 bits any @@ -221,18 +220,7 @@ STATIC int xfs_fs_nfs_commit_metadata( struct inode *inode) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(XFS_I(inode)); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6ea847f6e298..6cb8cd11072a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -22,16 +22,20 @@ #include "xfs_bmap.h" #include "xfs_trace.h" #include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" kmem_zone_t *xfs_efi_zone; kmem_zone_t *xfs_efd_zone; +static const struct xfs_item_ops xfs_efi_item_ops; + static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_efi_log_item, efi_item); } -void +STATIC void xfs_efi_item_free( struct xfs_efi_log_item *efip) { @@ -49,13 +53,13 @@ xfs_efi_item_free( * committed vs unpin operations in bulk insert operations. Hence the reference * count to ensure only the last caller frees the EFI. */ -void +STATIC void xfs_efi_release( struct xfs_efi_log_item *efip) { ASSERT(atomic_read(&efip->efi_refcount) > 0); if (atomic_dec_and_test(&efip->efi_refcount)) { - xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); xfs_efi_item_free(efip); } } @@ -139,18 +143,10 @@ xfs_efi_item_release( xfs_efi_release(EFI_ITEM(lip)); } -static const struct xfs_item_ops xfs_efi_item_ops = { - .iop_size = xfs_efi_item_size, - .iop_format = xfs_efi_item_format, - .iop_unpin = xfs_efi_item_unpin, - .iop_release = xfs_efi_item_release, -}; - - /* * Allocate and initialize an efi item with the given number of extents. */ -struct xfs_efi_log_item * +STATIC struct xfs_efi_log_item * xfs_efi_init( struct xfs_mount *mp, uint nextents) @@ -161,11 +157,12 @@ xfs_efi_init( ASSERT(nextents > 0); if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(xfs_efi_log_item_t) + + size = (uint)(sizeof(struct xfs_efi_log_item) + ((nextents - 1) * sizeof(xfs_extent_t))); efip = kmem_zalloc(size, 0); } else { - efip = kmem_zone_zalloc(xfs_efi_zone, 0); + efip = kmem_cache_zalloc(xfs_efi_zone, + GFP_KERNEL | __GFP_NOFAIL); } xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); @@ -184,7 +181,7 @@ xfs_efi_init( * one of which will be the native format for this kernel. * It will handle the conversion of formats if necessary. */ -int +STATIC int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) { xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; @@ -336,7 +333,8 @@ xfs_trans_get_efd( (nextents - 1) * sizeof(struct xfs_extent), 0); } else { - efdp = kmem_zone_zalloc(xfs_efd_zone, 0); + efdp = kmem_cache_zalloc(xfs_efd_zone, + GFP_KERNEL | __GFP_NOFAIL); } xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, @@ -412,41 +410,16 @@ xfs_extent_free_diff_items( XFS_FSB_TO_AGNO(mp, rb->xefi_startblock); } -/* Get an EFI. 
*/ -STATIC void * -xfs_extent_free_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_efi_log_item *efip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - efip = xfs_efi_init(tp->t_mountp, count); - ASSERT(efip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &efip->efi_item); - return efip; -} - /* Log a free extent to the intent item. */ STATIC void xfs_extent_free_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_efi_log_item *efip, + struct xfs_extent_free_item *free) { - struct xfs_efi_log_item *efip = intent; - struct xfs_extent_free_item *free; uint next_extent; struct xfs_extent *extp; - free = container_of(item, struct xfs_extent_free_item, xefi_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); @@ -462,29 +435,50 @@ xfs_extent_free_log_item( extp->ext_len = free->xefi_blockcount; } +static struct xfs_log_item * +xfs_extent_free_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); + struct xfs_extent_free_item *free; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &efip->efi_item); + if (sort) + list_sort(mp, items, xfs_extent_free_diff_items); + list_for_each_entry(free, items, xefi_list) + xfs_extent_free_log_item(tp, efip, free); + return &efip->efi_item; +} + /* Get an EFD so we can process all the free extents. */ -STATIC void * +static struct xfs_log_item * xfs_extent_free_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_efd(tp, intent, count); + return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item; } /* Process a free extent. */ STATIC int xfs_extent_free_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_extent_free_item *free; int error; free = container_of(item, struct xfs_extent_free_item, xefi_list); - error = xfs_trans_free_extent(tp, done_item, + error = xfs_trans_free_extent(tp, EFD_ITEM(done), free->xefi_startblock, free->xefi_blockcount, &free->xefi_oinfo, free->xefi_skip_discard); @@ -495,9 +489,9 @@ xfs_extent_free_finish_item( /* Abort all pending EFIs. */ STATIC void xfs_extent_free_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_efi_release(intent); + xfs_efi_release(EFI_ITEM(intent)); } /* Cancel a free extent. 
*/ @@ -513,10 +507,8 @@ xfs_extent_free_cancel_item( const struct xfs_defer_op_type xfs_extent_free_defer_type = { .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .diff_items = xfs_extent_free_diff_items, .create_intent = xfs_extent_free_create_intent, .abort_intent = xfs_extent_free_abort_intent, - .log_item = xfs_extent_free_log_item, .create_done = xfs_extent_free_create_done, .finish_item = xfs_extent_free_finish_item, .cancel_item = xfs_extent_free_cancel_item, @@ -529,12 +521,12 @@ const struct xfs_defer_op_type xfs_extent_free_defer_type = { STATIC int xfs_agfl_free_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_efd_log_item *efdp = done_item; + struct xfs_efd_log_item *efdp = EFD_ITEM(done); struct xfs_extent_free_item *free; struct xfs_extent *extp; struct xfs_buf *agbp; @@ -579,10 +571,8 @@ xfs_agfl_free_finish_item( /* sub-type with special handling for AGFL deferred frees */ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .diff_items = xfs_extent_free_diff_items, .create_intent = xfs_extent_free_create_intent, .abort_intent = xfs_extent_free_abort_intent, - .log_item = xfs_extent_free_log_item, .create_done = xfs_extent_free_create_done, .finish_item = xfs_agfl_free_finish_item, .cancel_item = xfs_extent_free_cancel_item, @@ -592,19 +582,19 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. */ -int -xfs_efi_recover( - struct xfs_mount *mp, - struct xfs_efi_log_item *efip) +STATIC int +xfs_efi_item_recover( + struct xfs_log_item *lip, + struct xfs_trans *parent_tp) { - struct xfs_efd_log_item *efdp; - struct xfs_trans *tp; - int i; - int error = 0; - xfs_extent_t *extp; - xfs_fsblock_t startblock_fsb; - - ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_efd_log_item *efdp; + struct xfs_trans *tp; + struct xfs_extent *extp; + xfs_fsblock_t startblock_fsb; + int i; + int error = 0; /* * First check the validity of the extents described by the @@ -623,7 +613,6 @@ xfs_efi_recover( * This will pull the EFI from the AIL and * free the memory associated with it. */ - set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip); return -EFSCORRUPTED; } @@ -644,7 +633,6 @@ xfs_efi_recover( } - set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); error = xfs_trans_commit(tp); return error; @@ -652,3 +640,93 @@ abort_error: xfs_trans_cancel(tp); return error; } + +STATIC bool +xfs_efi_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return EFI_ITEM(lip)->efi_format.efi_id == intent_id; +} + +static const struct xfs_item_ops xfs_efi_item_ops = { + .iop_size = xfs_efi_item_size, + .iop_format = xfs_efi_item_format, + .iop_unpin = xfs_efi_item_unpin, + .iop_release = xfs_efi_item_release, + .iop_recover = xfs_efi_item_recover, + .iop_match = xfs_efi_item_match, +}; + +/* + * This routine is called to create an in-core extent free intent + * item from the efi format structure which was logged on disk. + * It allocates an in-core efi, copies the extents from the format + * structure into it, and adds the efi to the AIL with the given + * LSN. 
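The recovery routine documented here relies on the intent's reference count: it is created holding two references, one owned by the AIL and one by recovery itself, and recovery drops its own immediately after the AIL insert. A minimal user-space sketch of that pattern, with invented names and a plain integer in place of the kernel's atomic refcount:

#include <stdio.h>
#include <stdlib.h>

/* Toy refcounted intent item; mirrors the pattern, not the kernel API. */
struct intent {
	int refcount;
};

static struct intent *intent_alloc(void)
{
	struct intent *i = calloc(1, sizeof(*i));

	if (!i)
		abort();
	i->refcount = 2;	/* one reference for the AIL, one for recovery */
	return i;
}

static void intent_release(struct intent *i)
{
	if (--i->refcount == 0) {
		printf("last reference dropped, intent freed\n");
		free(i);
	}
}

int main(void)
{
	struct intent *i = intent_alloc();

	intent_release(i);	/* recovery drops its reference after AIL insert */
	intent_release(i);	/* finishing or cancelling the work drops the AIL's */
	return 0;
}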
+ */ +STATIC int +xlog_recover_efi_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_efi_log_item *efip; + struct xfs_efi_log_format *efi_formatp; + int error; + + efi_formatp = item->ri_buf[0].i_addr; + + efip = xfs_efi_init(mp, efi_formatp->efi_nextents); + error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); + if (error) { + xfs_efi_item_free(efip); + return error; + } + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn); + xfs_efi_release(efip); + return 0; +} + +const struct xlog_recover_item_ops xlog_efi_item_ops = { + .item_type = XFS_LI_EFI, + .commit_pass2 = xlog_recover_efi_commit_pass2, +}; + +/* + * This routine is called when an EFD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding EFI if it + * was still in the log. To do this it searches the AIL for the EFI with an id + * equal to that in the EFD format structure. If we find it we drop the EFD + * reference, which removes the EFI from the AIL and frees it. + */ +STATIC int +xlog_recover_efd_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_efd_log_format *efd_formatp; + + efd_formatp = item->ri_buf[0].i_addr; + ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || + (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); + + xlog_recover_release_intent(log, XFS_LI_EFI, efd_formatp->efd_efi_id); + return 0; +} + +const struct xlog_recover_item_ops xlog_efd_item_ops = { + .item_type = XFS_LI_EFD, + .commit_pass2 = xlog_recover_efd_commit_pass2, +}; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 16aaab06d4ec..cd2860c875bf 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -17,11 +17,6 @@ struct kmem_zone; #define XFS_EFI_MAX_FAST_EXTENTS 16 /* - * Define EFI flag bits. Manipulated by set/clear/test_bit operators. - */ -#define XFS_EFI_RECOVERED 1 - -/* * This is the "extent free intention" log item. It is used to log the fact * that some extents need to be free. It is used in conjunction with the * "extent free done" log item described below. @@ -50,25 +45,24 @@ struct kmem_zone; * of commit failure or log I/O errors. Note that the EFD is not inserted in the * AIL, so at this point both the EFI and EFD are freed. */ -typedef struct xfs_efi_log_item { +struct xfs_efi_log_item { struct xfs_log_item efi_item; atomic_t efi_refcount; atomic_t efi_next_extent; - unsigned long efi_flags; /* misc flags */ xfs_efi_log_format_t efi_format; -} xfs_efi_log_item_t; +}; /* * This is the "extent free done" log item. It is used to log * the fact that some extents earlier mentioned in an efi item * have been freed. */ -typedef struct xfs_efd_log_item { +struct xfs_efd_log_item { struct xfs_log_item efd_item; - xfs_efi_log_item_t *efd_efip; + struct xfs_efi_log_item *efd_efip; uint efd_next_extent; xfs_efd_log_format_t efd_format; -} xfs_efd_log_item_t; +}; /* * Max number of extents in fast allocation path. 
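As the EFD handler above shows, a done item cancels its intent purely by the 64-bit intent id, through the new ->iop_match() hook and xlog_recover_release_intent(). A toy model of that search, with a flat array standing in for the AIL (names invented):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* The AIL modelled as a flat array of intents. */
struct intent { uint64_t id; bool cancelled; };

static struct intent ail[] = { { 100, false }, { 101, false } };

/* Find the intent whose id matches the done item's id and cancel it. */
static void release_intent(uint64_t intent_id)
{
	for (size_t i = 0; i < sizeof(ail) / sizeof(ail[0]); i++) {
		if (ail[i].id == intent_id) {		/* models ->iop_match() */
			ail[i].cancelled = true;	/* models dropping the EFI reference */
			return;
		}
	}
}

int main(void)
{
	release_intent(101);	/* as if an EFD with efd_efi_id == 101 were replayed */
	printf("intent 100 cancelled: %d, intent 101 cancelled: %d\n",
	       ail[0].cancelled, ail[1].cancelled);
	return 0;
}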
@@ -78,13 +72,4 @@ typedef struct xfs_efd_log_item { extern struct kmem_zone *xfs_efi_zone; extern struct kmem_zone *xfs_efd_zone; -xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint); -int xfs_efi_copy_format(xfs_log_iovec_t *buf, - xfs_efi_log_format_t *dst_efi_fmt); -void xfs_efi_item_free(xfs_efi_log_item_t *); -void xfs_efi_release(struct xfs_efi_log_item *); - -int xfs_efi_recover(struct xfs_mount *mp, - struct xfs_efi_log_item *efip); - #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b8a4a3f29b36..c31cd3be9fb2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -80,19 +80,9 @@ xfs_dir_fsync( int datasync) { struct xfs_inode *ip = XFS_I(file->f_mapping->host); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; trace_xfs_dir_fsync(ip); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(ip); } STATIC int @@ -104,6 +94,7 @@ xfs_file_fsync( { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_mount *mp = ip->i_mount; int error = 0; int log_flushed = 0; @@ -147,13 +138,15 @@ xfs_file_fsync( xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { if (!datasync || - (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - lsn = ip->i_itemp->ili_last_lsn; + (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + lsn = iip->ili_last_lsn; } if (lsn) { error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); - ip->i_itemp->ili_fsync_fields = 0; + spin_lock(&iip->ili_lock); + iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); } xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -515,7 +508,7 @@ xfs_file_dio_aio_write( */ if (xfs_is_cow_inode(ip)) { trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); - return -EREMCHG; + return -ENOTBLK; } iolock = XFS_IOLOCK_EXCL; } else { @@ -563,8 +556,8 @@ out: xfs_iunlock(ip, iolock); /* - * No fallback to buffered IO on errors for XFS, direct IO will either - * complete fully or fail. + * No fallback to buffered IO after short writes for XFS, direct I/O + * will either complete fully or return an error. */ ASSERT(ret < 0 || ret == count); return ret; @@ -724,7 +717,7 @@ xfs_file_write_iter( * allow an operation to fall back to buffered mode. */ ret = xfs_file_dio_aio_write(iocb, from); - if (ret != -EREMCHG) + if (ret != -ENOTBLK) return ret; } @@ -1045,7 +1038,7 @@ xfs_file_remap_range( /* Prepare and then clone file data. */ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, &len, remap_flags); - if (ret < 0 || len == 0) + if (ret || len == 0) return ret; trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); @@ -1069,9 +1062,13 @@ xfs_file_remap_range( ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, remap_flags); + if (ret) + goto out_unlock; + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_log_force_inode(dest); out_unlock: - xfs_reflink_remap_unlock(file_in, file_out); + xfs_iunlock2_io_mmap(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return remapped > 0 ? 
remapped : ret; @@ -1086,7 +1083,7 @@ xfs_file_open( return -EFBIG; if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT; + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; return 0; } @@ -1108,7 +1105,7 @@ xfs_dir_open( * certain to have the next operation be a read there. */ mode = xfs_ilock_data_map_shared(ip); - if (ip->i_d.di_nextents > 0) + if (ip->i_df.if_nextents > 0) error = xfs_dir3_data_readahead(ip, 0, 0); xfs_iunlock(ip, mode); return error; @@ -1179,7 +1176,7 @@ xfs_file_llseek( * Locking for serialisation of IO during page faults. This results in a lock * ordering of: * - * mmap_sem (MM) + * mmap_lock (MM) * sb_start_pagefault(vfs, freeze) * i_mmaplock (XFS - truncate serialisation) * page_lock (MM) @@ -1269,10 +1266,23 @@ xfs_filemap_pfn_mkwrite( return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } +static void +xfs_filemap_map_pages( + struct vm_fault *vmf, + pgoff_t start_pgoff, + pgoff_t end_pgoff) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + filemap_map_pages(vmf, start_pgoff, end_pgoff); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); +} + static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, .huge_fault = xfs_filemap_huge_fault, - .map_pages = filemap_map_pages, + .map_pages = xfs_filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, .pfn_mkwrite = xfs_filemap_pfn_mkwrite, }; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 918456ca29e1..4eebcec4aae6 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -344,7 +344,7 @@ xfs_getfsmap_datadev_helper( xfs_fsblock_t fsb; xfs_daddr_t rec_daddr; - fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, rec->rm_startblock); + fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.agno, rec->rm_startblock); rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); @@ -362,7 +362,7 @@ xfs_getfsmap_datadev_bnobt_helper( struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; - rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno, + rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.agno, rec->ar_startblock); irec.rm_startblock = rec->ar_startblock; @@ -896,6 +896,14 @@ xfs_getfsmap( info.format_arg = arg; info.head = head; + /* + * If fsmap runs concurrently with a scrub, the freeze can be delayed + * indefinitely as we walk the rmapbt and iterate over metadata + * buffers. Freeze quiesces the log (which waits for the buffer LRU to + * be emptied) and that won't happen while we're reading buffers. + */ + sb_start_write(mp->m_super); + /* For each device we support... */ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) { /* Is this device within the range the user asked for? */ @@ -935,6 +943,7 @@ xfs_getfsmap( if (tp) xfs_trans_cancel(tp); + sb_end_write(mp->m_super); head->fmh_oflags = FMH_OF_DEV_T; return error; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 3e61d0cc23f8..ef1d5bb88b93 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -504,10 +504,7 @@ xfs_do_force_shutdown( } else if (logerror) { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, "Log I/O Error Detected. Shutting down filesystem"); - } else if (flags & SHUTDOWN_DEVICE_REQ) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "All device paths lost. Shutting down filesystem"); - } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + } else { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, "I/O Error Detected. 
Shutting down filesystem"); } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 8dc2e5414276..101028ebb571 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -22,6 +22,7 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" +#include "xfs_ialloc.h" #include <linux/iversion.h> @@ -36,13 +37,11 @@ xfs_inode_alloc( struct xfs_inode *ip; /* - * if this didn't occur in transactions, we could use - * KM_MAYFAIL and return NULL here on ENOMEM. Set the - * code up to do this anyway. + * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL + * and return NULL here on ENOMEM. */ - ip = kmem_zone_alloc(xfs_inode_zone, 0); - if (!ip) - return NULL; + ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL); + if (inode_init_always(mp->m_super, VFS_I(ip))) { kmem_cache_free(xfs_inode_zone, ip); return NULL; @@ -62,8 +61,6 @@ xfs_inode_alloc( memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); ip->i_afp = NULL; ip->i_cowfp = NULL; - ip->i_cnextents = 0; - ip->i_cformat = XFS_DINODE_FMT_EXTENTS; memset(&ip->i_df, 0, sizeof(ip->i_df)); ip->i_flags = 0; ip->i_delayed_blks = 0; @@ -88,15 +85,18 @@ xfs_inode_free_callback( case S_IFREG: case S_IFDIR: case S_IFLNK: - xfs_idestroy_fork(ip, XFS_DATA_FORK); + xfs_idestroy_fork(&ip->i_df); break; } - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - if (ip->i_cowfp) - xfs_idestroy_fork(ip, XFS_COW_FORK); - + if (ip->i_afp) { + xfs_idestroy_fork(ip->i_afp); + kmem_cache_free(xfs_ifork_zone, ip->i_afp); + } + if (ip->i_cowfp) { + xfs_idestroy_fork(ip->i_cowfp); + kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); + } if (ip->i_itemp) { ASSERT(!test_bit(XFS_LI_IN_AIL, &ip->i_itemp->ili_item.li_flags)); @@ -113,6 +113,7 @@ __xfs_inode_free( { /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list)); XFS_STATS_DEC(ip->i_mount, vn_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); @@ -139,11 +140,8 @@ xfs_inode_free( } /* - * Queue a new inode reclaim pass if there are reclaimable inodes and there - * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs periodic sync default of 30s. Perhaps this should have it's own - * tunable, but that can be done if this method proves to be ineffective or too - * aggressive. + * Queue background inode reclaim work if there are reclaimable inodes and there + * isn't reclaim work already scheduled or in progress. */ static void xfs_reclaim_work_queue( @@ -158,24 +156,6 @@ xfs_reclaim_work_queue( rcu_read_unlock(); } -/* - * This is a fast pass over the inode cache to try to get reclaim moving on as - * many inodes as possible in a short period of time. It kicks itself every few - * seconds, as well as being kicked by the inode cache shrinker when memory - * goes low. It scans as quickly as possible avoiding locked inodes or those - * already being flushed, and once done schedules a future pass. 
- */ -void -xfs_reclaim_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_reclaim_work); - - xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_reclaim_work_queue(mp); -} - static void xfs_perag_set_reclaim_tag( struct xfs_perag *pag) @@ -289,6 +269,8 @@ xfs_reinit_inode( uint64_t version = inode_peek_iversion(inode); umode_t mode = inode->i_mode; dev_t dev = inode->i_rdev; + kuid_t uid = inode->i_uid; + kgid_t gid = inode->i_gid; error = inode_init_always(mp->m_super, inode); @@ -297,6 +279,8 @@ xfs_reinit_inode( inode_set_iversion_queried(inode, version); inode->i_mode = mode; inode->i_rdev = dev; + inode->i_uid = uid; + inode->i_gid = gid; return error; } @@ -419,6 +403,7 @@ xfs_iget_cache_hit( spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); error = xfs_reinit_inode(mp, inode); if (error) { bool wake; @@ -452,9 +437,6 @@ xfs_iget_cache_hit( ip->i_sick = 0; ip->i_checked = 0; - ASSERT(!rwsem_is_locked(&inode->i_rwsem)); - init_rwsem(&inode->i_rwsem); - spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); } else { @@ -475,7 +457,7 @@ xfs_iget_cache_hit( xfs_ilock(ip, lock_flags); if (!(flags & XFS_IGET_INCORE)) - xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + xfs_iflags_clear(ip, XFS_ISTALE); XFS_STATS_INC(mp, xs_ig_found); return 0; @@ -506,18 +488,42 @@ xfs_iget_cache_miss( if (!ip) return -ENOMEM; - error = xfs_iread(mp, tp, ip, flags); + error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags); if (error) goto out_destroy; - if (!xfs_inode_verify_forks(ip)) { - error = -EFSCORRUPTED; - goto out_destroy; + /* + * For version 5 superblocks, if we are initialising a new inode and we + * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can + * simply build the new inode core with a random generation number. + * + * For version 4 (and older) superblocks, log recovery is dependent on + * the di_flushiter field being initialised from the current on-disk + * value and hence we must also read the inode off disk even when + * initializing new inodes. + */ + if (xfs_sb_version_has_v3inode(&mp->m_sb) && + (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { + VFS_I(ip)->i_generation = prandom_u32(); + } else { + struct xfs_dinode *dip; + struct xfs_buf *bp; + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0); + if (error) + goto out_destroy; + + error = xfs_inode_from_disk(ip, dip); + if (!error) + xfs_buf_set_ref(bp, XFS_INO_REF); + xfs_trans_brelse(tp, bp); + + if (error) + goto out_destroy; } trace_xfs_iget_miss(ip); - /* * Check the inode free state is valid. This also detects lookup * racing with unlinks. @@ -557,7 +563,7 @@ xfs_iget_cache_miss( */ iflags = XFS_INEW; if (flags & XFS_IGET_DONTCACHE) - iflags |= XFS_IDONTCACHE; + d_mark_dontcache(VFS_I(ip)); ip->i_udquot = NULL; ip->i_gdquot = NULL; ip->i_pdquot = NULL; @@ -590,48 +596,31 @@ out_destroy: } /* - * Look up an inode by number in the given file system. - * The inode is looked up in the cache held in each AG. - * If the inode is found in the cache, initialise the vfs inode - * if necessary. + * Look up an inode by number in the given file system. The inode is looked up + * in the cache held in each AG. If the inode is found in the cache, initialise + * the vfs inode if necessary. * - * If it is not in core, read it in from the file system's device, - * add it to the cache and initialise the vfs inode. 
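The rewritten xfs_iget_cache_miss() above now reads the inode cluster buffer itself, with one shortcut spelled out in its comment: a v5 filesystem that is creating a brand-new inode and is not running with XFS_MOUNT_IKEEP may skip the disk read entirely and stamp a random generation number. That decision reduces to a three-input predicate; a hedged sketch with plain booleans:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Mirrors the v5/create/!IKEEP test in the cache-miss path. */
static bool can_skip_disk_read(bool has_v3_inodes, bool creating, bool ikeep)
{
	return has_v3_inodes && creating && !ikeep;
}

int main(void)
{
	if (can_skip_disk_read(true, true, false))
		printf("new inode, random generation %d\n", rand());
	else
		printf("read the inode cluster off disk\n");
	return 0;
}

For v4 superblocks the read cannot be skipped because log recovery depends on di_flushiter being seeded from the on-disk value, as the comment in the hunk explains.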
+ * If it is not in core, read it in from the file system's device, add it to the + * cache and initialise the vfs inode. * * The inode is locked according to the value of the lock_flags parameter. - * This flag parameter indicates how and if the inode's IO lock and inode lock - * should be taken. - * - * mp -- the mount point structure for the current file system. It points - * to the inode hash table. - * tp -- a pointer to the current transaction if there is one. This is - * simply passed through to the xfs_iread() call. - * ino -- the number of the inode desired. This is the unique identifier - * within the file system for the inode being requested. - * lock_flags -- flags indicating how to lock the inode. See the comment - * for xfs_ilock() for a list of valid values. + * Inode lookup is only done during metadata operations and not as part of the + * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup. */ int xfs_iget( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - uint flags, - uint lock_flags, - xfs_inode_t **ipp) + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + struct xfs_inode **ipp) { - xfs_inode_t *ip; - int error; - xfs_perag_t *pag; - xfs_agino_t agino; + struct xfs_inode *ip; + struct xfs_perag *pag; + xfs_agino_t agino; + int error; - /* - * xfs_reclaim_inode() uses the ILOCK to ensure an inode - * doesn't get freed while it's being referenced during a - * radix tree traversal here. It assumes this function - * aqcuires only the ILOCK (and therefore it has no need to - * involve the IOLOCK in this synchronization). - */ ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); /* reject inode numbers outside existing AGs */ @@ -733,25 +722,22 @@ xfs_icache_inode_is_allocated( */ #define XFS_LOOKUP_BATCH 32 -STATIC int -xfs_inode_ag_walk_grab( +/* + * Decide if the given @ip is eligible to be a part of the inode walk, and + * grab it if so. Returns true if it's ready to go or false if we should just + * ignore it. + */ +STATIC bool +xfs_inode_walk_ag_grab( struct xfs_inode *ip, int flags) { struct inode *inode = VFS_I(ip); - bool newinos = !!(flags & XFS_AGITER_INEW_WAIT); + bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT); ASSERT(rcu_read_lock_held()); - /* - * check for stale RCU freed inode - * - * If the inode has been reallocated, it doesn't matter if it's not in - * the AG we are walking - we are walking for writeback, so if it - * passes all the "valid inode" checks and is dirty, then we'll write - * it back anyway. If it has been reallocated and still being - * initialised, the XFS_INEW check below will catch it. - */ + /* Check for stale RCU freed inode */ spin_lock(&ip->i_flags_lock); if (!ip->i_ino) goto out_unlock_noent; @@ -764,39 +750,41 @@ xfs_inode_ag_walk_grab( /* nothing to sync during shutdown */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EFSCORRUPTED; + return false; /* If we can't grab the inode, it must be on its way to reclaim. */ if (!igrab(inode)) - return -ENOENT; + return false; /* inode is valid */ - return 0; + return true; out_unlock_noent: spin_unlock(&ip->i_flags_lock); - return -ENOENT; + return false; } +/* + * For a given per-AG structure @pag, grab, @execute, and rele all incore + * inodes with the given radix tree @tag.
+ */ STATIC int -xfs_inode_ag_walk( - struct xfs_mount *mp, +xfs_inode_walk_ag( struct xfs_perag *pag, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int flags, + int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), void *args, - int tag, - int iter_flags) + int tag) { + struct xfs_mount *mp = pag->pag_mount; uint32_t first_index; int last_error = 0; int skipped; - int done; + bool done; int nr_found; restart: - done = 0; + done = false; skipped = 0; first_index = 0; nr_found = 0; @@ -807,7 +795,7 @@ restart: rcu_read_lock(); - if (tag == -1) + if (tag == XFS_ICI_NO_TAG) nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH); @@ -829,7 +817,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || xfs_inode_ag_walk_grab(ip, iter_flags)) + if (done || !xfs_inode_walk_ag_grab(ip, iter_flags)) batch[i] = NULL; /* @@ -848,7 +836,7 @@ restart: continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = 1; + done = true; } /* unlock now we've grabbed the inodes. */ @@ -857,10 +845,10 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - if ((iter_flags & XFS_AGITER_INEW_WAIT) && + if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) && xfs_iflags_test(batch[i], XFS_INEW)) xfs_inew_wait(batch[i]); - error = execute(batch[i], flags, args); + error = execute(batch[i], args); xfs_irele(batch[i]); if (error == -EAGAIN) { skipped++; @@ -885,6 +873,49 @@ restart: return last_error; } +/* Fetch the next (possibly tagged) per-AG structure. */ +static inline struct xfs_perag * +xfs_inode_walk_get_perag( + struct xfs_mount *mp, + xfs_agnumber_t agno, + int tag) +{ + if (tag == XFS_ICI_NO_TAG) + return xfs_perag_get(mp, agno); + return xfs_perag_get_tag(mp, agno, tag); +} + +/* + * Call the @execute function on all incore inodes matching the radix tree + * @tag. + */ +int +xfs_inode_walk( + struct xfs_mount *mp, + int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, + int tag) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == -EFSCORRUPTED) + break; + } + } + return last_error; +} + /* * Background scanning to trim post-EOF preallocated space. This is queued * based on the 'speculative_prealloc_lifetime' tunable (5m by default). 
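All of the old iterator variants funnel into xfs_inode_walk() above, whose heart is a cursor-driven batched gang lookup over the per-AG radix tree. Stripped of RCU, tags and locking, the control flow looks like the following self-contained sketch, where a plain array stands in for the radix tree and the names are illustrative:

#include <stdio.h>

#define LOOKUP_BATCH 4

static int items[10] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };

/* Fill @batch with up to @max items starting at @first; return the count. */
static int gang_lookup(int first, int *batch, int max)
{
	int n = 0;

	while (first < 10 && n < max)
		batch[n++] = items[first++];
	return n;
}

int main(void)
{
	int first_index = 0;

	for (;;) {
		int batch[LOOKUP_BATCH];
		int nr = gang_lookup(first_index, batch, LOOKUP_BATCH);

		if (!nr)
			break;
		for (int i = 0; i < nr; i++)
			printf("execute(inode %d)\n", batch[i]);
		/* the real walk derives the next cursor from the last inode number */
		first_index += nr;
	}
	return 0;
}

Batching keeps the RCU read-side critical sections short: the real code looks up a batch under rcu_read_lock(), grabs references, drops the lock, and only then calls @execute on each inode.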
@@ -907,7 +938,12 @@ xfs_eofblocks_worker( { struct xfs_mount *mp = container_of(to_delayed_work(work), struct xfs_mount, m_eofblocks_work); + + if (!sb_start_write_trylock(mp->m_super)) + return; xfs_icache_free_eofblocks(mp, NULL); + sb_end_write(mp->m_super); + xfs_queue_eofblocks(mp); } @@ -934,235 +970,86 @@ xfs_cowblocks_worker( { struct xfs_mount *mp = container_of(to_delayed_work(work), struct xfs_mount, m_cowblocks_work); - xfs_icache_free_cowblocks(mp, NULL); - xfs_queue_cowblocks(mp); -} - -int -xfs_inode_ag_iterator_flags( - struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int flags, - void *args, - int iter_flags) -{ - struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; - - ag = 0; - while ((pag = xfs_perag_get(mp, ag))) { - ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1, - iter_flags); - xfs_perag_put(pag); - if (error) { - last_error = error; - if (error == -EFSCORRUPTED) - break; - } - } - return last_error; -} -int -xfs_inode_ag_iterator( - struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int flags, - void *args) -{ - return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0); -} - -int -xfs_inode_ag_iterator_tag( - struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int flags, - void *args, - int tag) -{ - struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; + if (!sb_start_write_trylock(mp->m_super)) + return; + xfs_icache_free_cowblocks(mp, NULL); + sb_end_write(mp->m_super); - ag = 0; - while ((pag = xfs_perag_get_tag(mp, ag, tag))) { - ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag, - 0); - xfs_perag_put(pag); - if (error) { - last_error = error; - if (error == -EFSCORRUPTED) - break; - } - } - return last_error; + xfs_queue_cowblocks(mp); } /* * Grab the inode for reclaim exclusively. - * Return 0 if we grabbed it, non-zero otherwise. + * + * We have found this inode via a lookup under RCU, so the inode may have + * already been freed, or it may be in the process of being recycled by + * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode + * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE + * will not be set. Hence we need to check for both these flag conditions to + * avoid inodes that are no longer reclaim candidates. + * + * Note: checking for other state flags here, under the i_flags_lock or not, is + * racy and should be avoided. Those races should be resolved only after we have + * ensured that we are able to reclaim this inode and the world can see that we + * are going to reclaim it. + * + * Return true if we grabbed it, false otherwise. */ -STATIC int +static bool xfs_reclaim_inode_grab( - struct xfs_inode *ip, - int flags) + struct xfs_inode *ip) { ASSERT(rcu_read_lock_held()); - /* quick check for stale RCU freed inode */ - if (!ip->i_ino) - return 1; - - /* - * If we are asked for non-blocking operation, do unlocked checks to - * see if the inode already is being flushed or in reclaim to avoid - * lock traffic. - */ - if ((flags & SYNC_TRYLOCK) && - __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) - return 1; - - /* - * The radix tree lock here protects a thread in xfs_iget from racing - * with us starting reclaim on the inode. Once we have the - * XFS_IRECLAIM flag set it will not touch us. 
- * - * Due to RCU lookup, we may find inodes that have been freed and only - * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that - * aren't candidates for reclaim at all, so we must check the - * XFS_IRECLAIMABLE is set first before proceeding to reclaim. - */ spin_lock(&ip->i_flags_lock); if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || __xfs_iflags_test(ip, XFS_IRECLAIM)) { /* not a reclaim candidate. */ spin_unlock(&ip->i_flags_lock); - return 1; + return false; } __xfs_iflags_set(ip, XFS_IRECLAIM); spin_unlock(&ip->i_flags_lock); - return 0; + return true; } /* - * Inodes in different states need to be treated differently. The following - * table lists the inode states and the reclaim actions necessary: - * - * inode state iflush ret required action - * --------------- ---------- --------------- - * bad - reclaim - * shutdown EIO unpin and reclaim - * clean, unpinned 0 reclaim - * stale, unpinned 0 reclaim - * clean, pinned(*) 0 requeue - * stale, pinned EAGAIN requeue - * dirty, async - requeue - * dirty, sync 0 reclaim - * - * (*) dgc: I don't think the clean, pinned state is possible but it gets - * handled anyway given the order of checks implemented. - * - * Also, because we get the flush lock first, we know that any inode that has - * been flushed delwri has had the flush completed by the time we check that - * the inode is clean. - * - * Note that because the inode is flushed delayed write by AIL pushing, the - * flush lock may already be held here and waiting on it can result in very - * long latencies. Hence for sync reclaims, where we wait on the flush lock, - * the caller should push the AIL first before trying to reclaim inodes to - * minimise the amount of time spent waiting. For background relaim, we only - * bother to reclaim clean inodes anyway. + * Inode reclaim is non-blocking, so the default action if progress cannot be + * made is to "requeue" the inode for reclaim by unlocking it and clearing the + * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about + * blocking anymore and hence we can wait for the inode to be able to reclaim + * it. * - * Hence the order of actions after gaining the locks should be: - * bad => reclaim - * shutdown => unpin and reclaim - * pinned, async => requeue - * pinned, sync => unpin - * stale => reclaim - * clean => reclaim - * dirty, async => requeue - * dirty, sync => flush, wait and reclaim + * We do no IO here - if callers require inodes to be cleaned they must push the + * AIL first to trigger writeback of dirty inodes. This enables writeback to be + * done in the background in a non-blocking manner, and enables memory reclaim + * to make progress without blocking. 
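After this rewrite, the grab decision depends only on the two reclaim flags, tested and flipped under i_flags_lock so that exactly one thread wins. A condensed model of xfs_reclaim_inode_grab()'s logic, with illustrative flag values:

#include <stdbool.h>
#include <stdio.h>

#define IRECLAIMABLE (1u << 0)
#define IRECLAIM     (1u << 1)

/*
 * Only inodes marked reclaimable and not already being reclaimed are
 * candidates; setting IRECLAIM claims the inode for this thread.
 * (In the kernel, i_flags_lock is held around this test-and-set.)
 */
static bool reclaim_grab(unsigned int *flags)
{
	if (!(*flags & IRECLAIMABLE) || (*flags & IRECLAIM))
		return false;
	*flags |= IRECLAIM;
	return true;
}

int main(void)
{
	unsigned int flags = IRECLAIMABLE;

	printf("%d\n", reclaim_grab(&flags));	/* 1: we own the inode now */
	printf("%d\n", reclaim_grab(&flags));	/* 0: someone already has it */
	return 0;
}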
 */ -STATIC int +static void xfs_reclaim_inode( struct xfs_inode *ip, - struct xfs_perag *pag, - int sync_mode) + struct xfs_perag *pag) { - struct xfs_buf *bp = NULL; xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ - int error; -restart: - error = 0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (!xfs_iflock_nowait(ip)) { - if (!(sync_mode & SYNC_WAIT)) - goto out; - xfs_iflock(ip); - } + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + goto out; + if (!xfs_iflock_nowait(ip)) + goto out_iunlock; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); /* xfs_iflush_abort() drops the flush lock */ - xfs_iflush_abort(ip, false); + xfs_iflush_abort(ip); goto reclaim; } - if (xfs_ipincount(ip)) { - if (!(sync_mode & SYNC_WAIT)) - goto out_ifunlock; - xfs_iunpin_wait(ip); - } - if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - goto reclaim; - } - - /* - * Never flush out dirty data during non-blocking reclaim, as it would - * just contend with AIL pushing trying to do the same job. - */ - if (!(sync_mode & SYNC_WAIT)) + if (xfs_ipincount(ip)) + goto out_ifunlock; + if (!xfs_inode_clean(ip)) goto out_ifunlock; - /* - * Now we have an inode that needs flushing. - * - * Note that xfs_iflush will never block on the inode buffer lock, as - * xfs_ifree_cluster() can lock the inode buffer before it locks the - * ip->i_lock, and we are doing the exact opposite here. As a result, - * doing a blocking xfs_imap_to_bp() to get the cluster buffer would - * result in an ABBA deadlock with xfs_ifree_cluster(). - * - * As xfs_ifree_cluser() must gather all inodes that are active in the - * cache to mark them stale, if we hit this case we don't actually want - * to do IO here - we want the inode marked stale so we can simply - * reclaim it. Hence if we get an EAGAIN error here, just unlock the - * inode, back off and try again. Hopefully the next pass through will - * see the stale flag set on the inode. - */ - error = xfs_iflush(ip, &bp); - if (error == -EAGAIN) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* backoff longer than in xfs_ifree_cluster */ - delay(2); - goto restart; - } - - if (!error) { - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - } - + xfs_ifunlock(ip); reclaim: ASSERT(!xfs_isiflocked(ip)); @@ -1209,23 +1096,17 @@ reclaim: xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); + ASSERT(xfs_inode_clean(ip)); __xfs_inode_free(ip); - return error; + return; out_ifunlock: xfs_ifunlock(ip); +out_iunlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); out: xfs_iflags_clear(ip, XFS_IRECLAIM); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * We could return -EAGAIN here to make reclaim rescan the inode tree in - * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and the reclaim work never goes back to - * the idle state. Instead, return 0 to let the next scheduled - * background reclaim attempt to reclaim the inode again. - */ - return 0; } /* @@ -1233,23 +1114,19 @@ out: * corrupted, we still want to try to reclaim all the inodes. If we don't, * then a shutdown during a filesystem unmount reclaim walk will leak all the * unreclaimed inodes. + * + * Returns non-zero if any AGs or inodes were skipped in the reclaim pass + * so that callers that want to block until all dirty inodes are written back + * and reclaimed can sanely loop.
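In the per-AG scan that follows, the reclaim mutex and cursor handling are replaced with plain annotated loads and stores. The sketch below models the kernel's READ_ONCE/WRITE_ONCE on pag->pag_ici_reclaim_cursor with relaxed C11 atomics; the point is that a racing pass reading a slightly stale cursor is harmless, costing only a little duplicated scanning:

#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong reclaim_cursor;	/* models pag->pag_ici_reclaim_cursor */

static void reclaim_pass(int done, unsigned long next_index)
{
	unsigned long first_index;

	first_index = atomic_load_explicit(&reclaim_cursor, memory_order_relaxed);
	/* ... scan the AG from first_index, computing next_index as we go ... */
	(void)first_index;
	if (done)
		next_index = 0;	/* finished the AG: restart from the top next time */
	atomic_store_explicit(&reclaim_cursor, next_index, memory_order_relaxed);
}

int main(void)
{
	reclaim_pass(0, 128);
	printf("cursor now %lu\n", atomic_load(&reclaim_cursor));
	return 0;
}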
*/ -STATIC int +static void xfs_reclaim_inodes_ag( struct xfs_mount *mp, - int flags, int *nr_to_scan) { struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; - int trylock = flags & SYNC_TRYLOCK; - int skipped; + xfs_agnumber_t ag = 0; -restart: - ag = 0; - skipped = 0; while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { unsigned long first_index = 0; int done = 0; @@ -1257,16 +1134,7 @@ restart: ag = pag->pag_agno + 1; - if (trylock) { - if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { - skipped++; - xfs_perag_put(pag); - continue; - } - first_index = pag->pag_ici_reclaim_cursor; - } else - mutex_lock(&pag->pag_ici_reclaim_lock); - + first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); do { struct xfs_inode *batch[XFS_LOOKUP_BATCH]; int i; @@ -1290,7 +1158,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || xfs_reclaim_inode_grab(ip, flags)) + if (done || !xfs_reclaim_inode_grab(ip)) batch[i] = NULL; /* @@ -1319,59 +1187,39 @@ restart: rcu_read_unlock(); for (i = 0; i < nr_found; i++) { - if (!batch[i]) - continue; - error = xfs_reclaim_inode(batch[i], pag, flags); - if (error && last_error != -EFSCORRUPTED) - last_error = error; + if (batch[i]) + xfs_reclaim_inode(batch[i], pag); } *nr_to_scan -= XFS_LOOKUP_BATCH; - cond_resched(); - } while (nr_found && !done && *nr_to_scan > 0); - if (trylock && !done) - pag->pag_ici_reclaim_cursor = first_index; - else - pag->pag_ici_reclaim_cursor = 0; - mutex_unlock(&pag->pag_ici_reclaim_lock); + if (done) + first_index = 0; + WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); xfs_perag_put(pag); } - - /* - * if we skipped any AG, and we still have scan count remaining, do - * another pass this time using blocking reclaim semantics (i.e - * waiting on the reclaim locks and ignoring the reclaim cursors). This - * ensure that when we get more reclaimers than AGs we block rather - * than spin trying to execute reclaim. - */ - if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { - trylock = 0; - goto restart; - } - return last_error; } -int +void xfs_reclaim_inodes( - xfs_mount_t *mp, - int mode) + struct xfs_mount *mp) { int nr_to_scan = INT_MAX; - return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); + while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + xfs_ail_push_all_sync(mp->m_ail); + xfs_reclaim_inodes_ag(mp, &nr_to_scan); + }; } /* - * Scan a certain number of inodes for reclaim. - * - * When called we make sure that there is a background (fast) inode reclaim in - * progress, while we will throttle the speed of reclaim via doing synchronous - * reclaim of inodes. That means if we come across dirty inodes, we wait for - * them to be cleaned, which we hope will not be very long due to the - * background walker having already kicked the IO off on those dirty inodes. + * The shrinker infrastructure determines how many inodes we should scan for + * reclaim. We want as many clean inodes ready to reclaim as possible, so we + * push the AIL here. We also want to proactively free up memory if we can to + * minimise the amount of work memory reclaim has to do so we kick the + * background reclaim if it isn't already scheduled. 
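The new synchronous xfs_reclaim_inodes(), just above, is a fixed-point loop: push the AIL so dirty inodes get written back, run the non-blocking scan, and repeat until no AG still carries the reclaim tag. Its shape, as a runnable toy with invented helpers:

#include <stdbool.h>
#include <stdio.h>

static int tagged_ags = 3;	/* pretend three passes are needed */

static bool any_ag_tagged(void) { return tagged_ags > 0; }
static void push_ail_and_wait(void) { printf("push AIL, wait for writeback\n"); }
static void reclaim_scan(void) { tagged_ags--; }

int main(void)
{
	/* Loop until the reclaim radix-tree tag clears everywhere. */
	while (any_ag_tagged()) {
		push_ail_and_wait();
		reclaim_scan();
	}
	printf("all reclaimable inodes freed\n");
	return 0;
}

Because the scan itself never blocks on I/O, all waiting is concentrated in the AIL push, which is exactly the mechanism that cleans the dirty inodes the next pass will reclaim.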
*/ long xfs_reclaim_inodes_nr( @@ -1382,7 +1230,8 @@ xfs_reclaim_inodes_nr( xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); - return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); + xfs_reclaim_inodes_ag(mp, &nr_to_scan); + return 0; } /* @@ -1405,59 +1254,108 @@ xfs_reclaim_inodes_count( return reclaimable; } -STATIC int +STATIC bool xfs_inode_match_id( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) - return 0; + return false; if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) - return 0; + return false; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && ip->i_d.di_projid != eofb->eof_prid) - return 0; + return false; - return 1; + return true; } /* * A union-based inode filtering algorithm. Process the inode if any of the * criteria match. This is for global/internal scans only. */ -STATIC int +STATIC bool xfs_inode_match_id_union( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) - return 1; + return true; if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) - return 1; + return true; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && ip->i_d.di_projid == eofb->eof_prid) - return 1; + return true; - return 0; + return false; +} + +/* + * Is this inode @ip eligible for eof/cow block reclamation, given some + * filtering parameters @eofb? The inode is eligible if @eofb is null or + * if the predicate functions match. + */ +static bool +xfs_inode_matches_eofb( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + bool match; + + if (!eofb) + return true; + + if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) + match = xfs_inode_match_id_union(ip, eofb); + else + match = xfs_inode_match_id(ip, eofb); + if (!match) + return false; + + /* skip the inode if the file size is too small */ + if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return false; + + return true; +} + +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. + */ +void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + int nr_to_scan = INT_MAX; + + xfs_reclaim_inodes_ag(mp, &nr_to_scan); + xfs_reclaim_work_queue(mp); } STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - int flags, void *args) { - int ret = 0; - struct xfs_eofblocks *eofb = args; - int match; + struct xfs_eofblocks *eofb = args; + bool wait; + int ret; + + wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ @@ -1470,62 +1368,34 @@ xfs_inode_free_eofblocks( * If the mapping is dirty the operation can block and wait for some * time. Unless we are waiting, skip it. 
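The filtering that used to be duplicated in the eofblocks and cowblocks walkers is folded into xfs_inode_matches_eofb() above. Its decision table is easy to state in isolation; the sketch below uses invented flag names and keeps the same split: union scans need any specified id criterion to match, plain scans need all of them, and the minimum-size check applies either way:

#include <stdbool.h>
#include <stdio.h>

#define F_UID   (1u << 0)
#define F_GID   (1u << 1)
#define F_UNION (1u << 2)
#define F_MINSZ (1u << 3)

struct filter { unsigned int flags; int uid, gid; long min_size; };

/* A NULL filter matches everything, as in the kernel helper. */
static bool matches(const struct filter *f, int uid, int gid, long size)
{
	bool match;

	if (!f)
		return true;
	if (f->flags & F_UNION)
		match = ((f->flags & F_UID) && uid == f->uid) ||
			((f->flags & F_GID) && gid == f->gid);
	else
		match = (!(f->flags & F_UID) || uid == f->uid) &&
			(!(f->flags & F_GID) || gid == f->gid);
	if (!match)
		return false;
	if ((f->flags & F_MINSZ) && size < f->min_size)
		return false;
	return true;
}

int main(void)
{
	struct filter f = { F_UID | F_MINSZ, 1000, 0, 4096 };

	printf("%d\n", matches(&f, 1000, 0, 8192));	/* 1: uid and size pass */
	printf("%d\n", matches(&f, 1000, 0, 512));	/* 0: file too small */
	return 0;
}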
*/ - if (!(flags & SYNC_WAIT) && - mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) + if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; - if (eofb) { - if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) - match = xfs_inode_match_id_union(ip, eofb); - else - match = xfs_inode_match_id(ip, eofb); - if (!match) - return 0; - - /* skip the inode if the file size is too small */ - if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && - XFS_ISIZE(ip) < eofb->eof_min_file_size) - return 0; - } + if (!xfs_inode_matches_eofb(ip, eofb)) + return 0; /* * If the caller is waiting, return -EAGAIN to keep the background * scanner moving and revisit the inode in a subsequent pass. */ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - if (flags & SYNC_WAIT) - ret = -EAGAIN; - return ret; + if (wait) + return -EAGAIN; + return 0; } + ret = xfs_free_eofblocks(ip); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } -static int -__xfs_icache_free_eofblocks( - struct xfs_mount *mp, - struct xfs_eofblocks *eofb, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int tag) -{ - int flags = SYNC_TRYLOCK; - - if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) - flags = SYNC_WAIT; - - return xfs_inode_ag_iterator_tag(mp, execute, flags, - eofb, tag); -} - int xfs_icache_free_eofblocks( struct xfs_mount *mp, struct xfs_eofblocks *eofb) { - return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks, + return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb, XFS_ICI_EOFBLOCKS_TAG); } @@ -1552,7 +1422,7 @@ __xfs_inode_free_quota_eofblocks( eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQ_USER); + dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); if (dq && xfs_dquot_lowsp(dq)) { eofb.eof_uid = VFS_I(ip)->i_uid; eofb.eof_flags |= XFS_EOF_FLAGS_UID; @@ -1561,7 +1431,7 @@ __xfs_inode_free_quota_eofblocks( } if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQ_GROUP); + dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP); if (dq && xfs_dquot_lowsp(dq)) { eofb.eof_gid = VFS_I(ip)->i_gid; eofb.eof_flags |= XFS_EOF_FLAGS_GID; @@ -1742,29 +1612,16 @@ xfs_prep_free_cowblocks( STATIC int xfs_inode_free_cowblocks( struct xfs_inode *ip, - int flags, void *args) { struct xfs_eofblocks *eofb = args; - int match; int ret = 0; if (!xfs_prep_free_cowblocks(ip)) return 0; - if (eofb) { - if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) - match = xfs_inode_match_id_union(ip, eofb); - else - match = xfs_inode_match_id(ip, eofb); - if (!match) - return 0; - - /* skip the inode if the file size is too small */ - if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && - XFS_ISIZE(ip) < eofb->eof_min_file_size) - return 0; - } + if (!xfs_inode_matches_eofb(ip, eofb)) + return 0; /* Free the CoW blocks */ xfs_ilock(ip, XFS_IOLOCK_EXCL); @@ -1788,7 +1645,7 @@ xfs_icache_free_cowblocks( struct xfs_mount *mp, struct xfs_eofblocks *eofb) { - return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks, + return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb, XFS_ICI_COWBLOCKS_TAG); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 48f1fd2bb6ad..3a4c8b382cd0 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -17,14 +17,11 @@ struct xfs_eofblocks { __u64 eof_min_file_size; }; -#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ -#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ - /* * tags for inode radix tree */ #define XFS_ICI_NO_TAG (-1) /* special flag for an 
untagged lookup - in xfs_inode_ag_iterator */ + in xfs_inode_walk */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ #define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ #define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */ @@ -40,7 +37,7 @@ struct xfs_eofblocks { /* * flags for AG inode iterator */ -#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */ +#define XFS_INODE_WALK_INEW_WAIT 0x1 /* wait on new inodes */ int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); @@ -51,7 +48,7 @@ void xfs_inode_free(struct xfs_inode *ip); void xfs_reclaim_worker(struct work_struct *work); -int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); +void xfs_reclaim_inodes(struct xfs_mount *mp); int xfs_reclaim_inodes_count(struct xfs_mount *mp); long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); @@ -71,50 +68,9 @@ int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); void xfs_cowblocks_worker(struct work_struct *); void xfs_queue_cowblocks(struct xfs_mount *); -int xfs_inode_ag_iterator(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, void *args), - int flags, void *args); -int xfs_inode_ag_iterator_flags(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, void *args), - int flags, void *args, int iter_flags); -int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, void *args), - int flags, void *args, int tag); - -static inline int -xfs_fs_eofblocks_from_user( - struct xfs_fs_eofblocks *src, - struct xfs_eofblocks *dst) -{ - if (src->eof_version != XFS_EOFBLOCKS_VERSION) - return -EINVAL; - - if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) - return -EINVAL; - - if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || - memchr_inv(src->pad64, 0, sizeof(src->pad64))) - return -EINVAL; - - dst->eof_flags = src->eof_flags; - dst->eof_prid = src->eof_prid; - dst->eof_min_file_size = src->eof_min_file_size; - - dst->eof_uid = INVALID_UID; - if (src->eof_flags & XFS_EOF_FLAGS_UID) { - dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); - if (!uid_valid(dst->eof_uid)) - return -EINVAL; - } - - dst->eof_gid = INVALID_GID; - if (src->eof_flags & XFS_EOF_FLAGS_GID) { - dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); - if (!gid_valid(dst->eof_gid)) - return -EINVAL; - } - return 0; -} +int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, int tag); int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 490fee22b878..9b3994b9c716 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -6,11 +6,19 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_icreate_item.h" #include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_ialloc.h" +#include "xfs_trace.h" kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ @@ -89,7 +97,7 @@ xfs_icreate_log( { struct xfs_icreate_item *icp; - icp = kmem_zone_zalloc(xfs_icreate_zone, 0); + icp = kmem_cache_zalloc(xfs_icreate_zone, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &icp->ic_item, 
XFS_LI_ICREATE, &xfs_icreate_item_ops); @@ -107,3 +115,147 @@ xfs_icreate_log( tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags); } + +static enum xlog_recover_reorder +xlog_recover_icreate_reorder( + struct xlog_recover_item *item) +{ + /* + * Inode allocation buffers must be replayed before subsequent inode + * items try to modify those buffers. ICREATE items are the logical + * equivalent of logging a newly initialized inode buffer, so recover + * these at the same time that we recover logged buffers. + */ + return XLOG_REORDER_BUFFER_LIST; +} + +/* + * This routine is called when an inode create format structure is found in a + * committed transaction in the log. Its purpose is to initialise the inodes + * being allocated on disk. This requires us to get inode cluster buffers that + * match the range to be initialised, stamped with inode templates and written + * by delayed write so that subsequent modifications will hit the cached buffer + * and only need writing out at the end of recovery. + */ +STATIC int +xlog_recover_icreate_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_icreate_log *icl; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agnumber_t agno; + xfs_agblock_t agbno; + unsigned int count; + unsigned int isize; + xfs_agblock_t length; + int bb_per_cluster; + int cancel_count; + int nbufs; + int i; + + icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; + if (icl->icl_type != XFS_LI_ICREATE) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); + return -EINVAL; + } + + if (icl->icl_size != 1) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); + return -EINVAL; + } + + agno = be32_to_cpu(icl->icl_ag); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); + return -EINVAL; + } + agbno = be32_to_cpu(icl->icl_agbno); + if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); + return -EINVAL; + } + isize = be32_to_cpu(icl->icl_isize); + if (isize != mp->m_sb.sb_inodesize) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); + return -EINVAL; + } + count = be32_to_cpu(icl->icl_count); + if (!count) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); + return -EINVAL; + } + length = be32_to_cpu(icl->icl_length); + if (!length || length >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); + return -EINVAL; + } + + /* + * The inode chunk is either full or sparse and we only support + * m_ino_geo.ialloc_min_blks sized sparse allocations at this time. + */ + if (length != igeo->ialloc_blks && + length != igeo->ialloc_min_blks) { + xfs_warn(log->l_mp, + "%s: unsupported chunk length", __FUNCTION__); + return -EINVAL; + } + + /* verify inode count is consistent with extent length */ + if ((count >> mp->m_sb.sb_inopblog) != length) { + xfs_warn(log->l_mp, + "%s: inconsistent inode count and chunk length", + __FUNCTION__); + return -EINVAL; + } + + /* + * The icreate transaction can cover multiple cluster buffers and these + * buffers could have been freed and reused. Check the individual + * buffers for cancellation so we don't overwrite anything written after + * a cancellation.
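(Illustrative aside.) The run of checks above is a recovery-time validation ladder: every field of the logged item is range-checked against the superblock before anything is replayed, because after a crash the log must be treated as untrusted input. Reduced to a self-contained sketch with an invented record layout:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>		/* ntohl() standing in for be32_to_cpu() */

struct icreate_rec {		/* invented on-disk layout */
	uint32_t agno;		/* all fields big-endian on disk */
	uint32_t agbno;
	uint32_t count;
};

static int validate_rec(const struct icreate_rec *r,
			uint32_t agcount, uint32_t agblocks)
{
	if (ntohl(r->agno) >= agcount) {
		fprintf(stderr, "recovery: bad agno\n");
		return -EINVAL;
	}
	if (ntohl(r->agbno) == 0 || ntohl(r->agbno) >= agblocks) {
		fprintf(stderr, "recovery: bad agbno\n");
		return -EINVAL;
	}
	if (ntohl(r->count) == 0) {
		fprintf(stderr, "recovery: bad count\n");
		return -EINVAL;
	}
	return 0;		/* only now is it safe to replay */
}

Rejecting with -EINVAL rather than asserting matters here: a corrupt log record must fail the recovery attempt cleanly, not crash the machine that is trying to recover.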
+ */ + bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); + nbufs = length / igeo->blocks_per_cluster; + for (i = 0, cancel_count = 0; i < nbufs; i++) { + xfs_daddr_t daddr; + + daddr = XFS_AGB_TO_DADDR(mp, agno, + agbno + i * igeo->blocks_per_cluster); + if (xlog_is_buffer_cancelled(log, daddr, bb_per_cluster)) + cancel_count++; + } + + /* + * We currently only use icreate for a single allocation at a time. This + * means we should expect either all or none of the buffers to be + * cancelled. Be conservative and skip replay if at least one buffer is + * cancelled, but warn the user that something is awry if the buffers + * are not consistent. + * + * XXX: This must be refined to only skip cancelled clusters once we use + * icreate for multiple chunk allocations. + */ + ASSERT(!cancel_count || cancel_count == nbufs); + if (cancel_count) { + if (cancel_count != nbufs) + xfs_warn(mp, + "WARNING: partial inode chunk cancellation, skipped icreate."); + trace_xfs_log_recover_icreate_cancel(log, icl); + return 0; + } + + trace_xfs_log_recover_icreate_recover(log, icl); + return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, + length, be32_to_cpu(icl->icl_gen)); +} + +const struct xlog_recover_item_ops xlog_icreate_item_ops = { + .item_type = XFS_LI_ICREATE, + .reorder = xlog_recover_icreate_reorder, + .commit_pass2 = xlog_recover_icreate_commit_pass2, +}; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c5077e6326c7..c06129cffba9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -44,7 +44,6 @@ kmem_zone_t *xfs_inode_zone; */ #define XFS_ITRUNC_MAX_EXTENTS 2 -STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *); STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *); @@ -112,7 +111,7 @@ xfs_ilock_data_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE && (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); @@ -125,7 +124,8 @@ xfs_ilock_attr_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + if (ip->i_afp && + ip->i_afp->if_format == XFS_DINODE_FMT_BTREE && (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); @@ -144,17 +144,17 @@ xfs_ilock_attr_map_shared( * * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock * - * mmap_sem locking order: + * mmap_lock locking order: * - * i_rwsem -> page lock -> mmap_sem - * mmap_sem -> i_mmap_lock -> page_lock + * i_rwsem -> page lock -> mmap_lock + * mmap_lock -> i_mmap_lock -> page_lock * - * The difference in mmap_sem locking order mean that we cannot hold the + * The difference in mmap_lock locking order means that we cannot hold the * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can - * fault in pages during copy in/out (for buffered IO) or require the mmap_sem + * fault in pages during copy in/out (for buffered IO) or require the mmap_lock * in get_user_pages() to map the user pages into the kernel address space for * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because - * page faults already hold the mmap_sem. + * page faults already hold the mmap_lock. * * Hence to serialise fully against both syscall and mmap based IO, we need to * take both the i_rwsem and the i_mmap_lock.
These locks should *only* be both @@ -451,7 +451,7 @@ xfs_lock_inodes( /* * Currently supports between 2 and 5 inodes with exclusive locking. We * support an arbitrary depth of locking here, but absolute limits on - * inodes depend on the the type of locking and the limits placed by + * inodes depend on the type of locking and the limits placed by * lockdep annotations in xfs_lock_inumorder. These are all checked by * the asserts. */ @@ -801,26 +801,18 @@ xfs_ialloc( return error; ASSERT(ip != NULL); inode = VFS_I(ip); - - /* - * We always convert v1 inodes to v2 now - we only support filesystems - * with >= v2 inode capability, so there is no reason for ever leaving - * an inode in v1 format. - */ - if (ip->i_d.di_version == 1) - ip->i_d.di_version = 2; - inode->i_mode = mode; set_nlink(inode, nlink); - ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); - ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); + inode->i_uid = current_fsuid(); inode->i_rdev = rdev; ip->i_d.di_projid = prid; if (pip && XFS_INHERIT_GID(pip)) { - ip->i_d.di_gid = pip->i_d.di_gid; + inode->i_gid = VFS_I(pip)->i_gid; if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode)) inode->i_mode |= S_ISGID; + } else { + inode->i_gid = current_fsgid(); } /* @@ -828,13 +820,12 @@ xfs_ialloc( * ID or one of the supplementary group IDs, the S_ISGID bit is cleared * (and only if the irix_sgid_inherit compatibility variable is set). */ - if ((irix_sgid_inherit) && - (inode->i_mode & S_ISGID) && - (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) + if (irix_sgid_inherit && + (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid)) inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; - ip->i_d.di_nextents = 0; + ip->i_df.if_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); tv = current_time(inode); @@ -847,21 +838,20 @@ xfs_ialloc( ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; - if (ip->i_d.di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { inode_set_iversion(inode, 1); ip->i_d.di_flags2 = 0; ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime = tv; } - flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { case S_IFIFO: case S_IFCHR: case S_IFBLK: case S_IFSOCK: - ip->i_d.di_format = XFS_DINODE_FMT_DEV; + ip->i_df.if_format = XFS_DINODE_FMT_DEV; ip->i_df.if_flags = 0; flags |= XFS_ILOG_DEV; break; @@ -907,24 +897,17 @@ xfs_ialloc( ip->i_d.di_flags |= di_flags; } - if (pip && - (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && - pip->i_d.di_version == 3 && - ip->i_d.di_version == 3) { - uint64_t di_flags2 = 0; - + if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) { if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; } if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - di_flags2 |= XFS_DIFLAG2_DAX; - - ip->i_d.di_flags2 |= di_flags2; + ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; } /* FALLTHROUGH */ case S_IFLNK: - ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_flags = XFS_IFEXTENTS; ip->i_df.if_bytes = 0; ip->i_df.if_u1.if_root = NULL; @@ -932,11 +915,6 @@ xfs_ialloc( default: ASSERT(0); } - /* - * Attribute fork settings for new inode. - */ - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - ip->i_d.di_anextents = 0; /* * Log the new values stuffed into the inode. 
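(Illustrative aside.) With di_version gone from the incore inode, code that used to test ip->i_d.di_version == 3 now asks the superblock instead, because on a v5 filesystem every inode is a v3 inode: the answer is a property of the filesystem, not of the inode. The pattern, with invented names:

#include <stdbool.h>
#include <stdint.h>

struct sb_features {
	uint32_t bits;
#define SB_FEAT_V3_INODES	(1u << 0)	/* invented feature bit */
};

/*
 * One filesystem-wide predicate replaces a per-inode version field:
 * the answer is the same for every inode, so it belongs to the sb.
 */
static inline bool sb_has_v3_inodes(const struct sb_features *sb)
{
	return (sb->bits & SB_FEAT_V3_INODES) != 0;
}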
@@ -1122,7 +1100,6 @@ xfs_bumplink( { xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - ASSERT(ip->i_d.di_version > 1); inc_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -1158,8 +1135,7 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1219,8 +1195,7 @@ xfs_create( unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, - resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + resblks - XFS_IALLOC_SPACE_RES(mp)); if (error) { ASSERT(error != -ENOSPC); goto out_trans_cancel; @@ -1309,8 +1284,7 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1655,7 +1629,7 @@ xfs_release( return 0; /* * If we can't get the iolock just skip truncating the blocks - * past EOF because we could deadlock with the mmap_sem + * past EOF because we could deadlock with the mmap_lock * otherwise. We'll get another chance to drop them once the * last reference to the inode is dropped, so we'll never leak * blocks permanently. @@ -1707,7 +1681,7 @@ xfs_inactive_truncate( if (error) goto error_trans_cancel; - ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_df.if_nextents == 0); error = xfs_trans_commit(tp); if (error) @@ -1765,10 +1739,31 @@ xfs_inactive_ifree( return error; } + /* + * We do not hold the inode locked across the entire rolling transaction + * here. We only need to hold it for the first transaction that + * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the + * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode + * here breaks the relationship between cluster buffer invalidation and + * stale inode invalidation on cluster buffer item journal commit + * completion, and can result in leaving dirty stale inodes hanging + * around in memory. + * + * We have no need for serialising this inode operation against other + * operations - we freed the inode and hence reallocation is required + * and that will serialise on reallocating the space the deferops need + * to free. Hence we can unlock the inode on the first commit of + * the transaction rather than roll it right through the deferops. This + * avoids relogging the XFS_ISTALE inode. + * + * We check that xfs_ifree() hasn't grown an internal transaction roll + * by asserting that the inode is still locked when it returns. + */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); error = xfs_ifree(tp, ip); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (error) { /* * If we fail to free the inode, shut down. 
The cancel @@ -1781,7 +1776,6 @@ xfs_inactive_ifree( xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); } xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1799,7 +1793,6 @@ xfs_inactive_ifree( xfs_notice(mp, "%s: xfs_trans_commit returned error %d", __func__, error); - xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; } @@ -1857,7 +1850,7 @@ xfs_inactive( if (S_ISREG(VFS_I(ip)->i_mode) && (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || - ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; error = xfs_qm_dqattach(ip); @@ -1883,7 +1876,6 @@ xfs_inactive( } ASSERT(!ip->i_afp); - ASSERT(ip->i_d.di_anextents == 0); ASSERT(ip->i_d.di_forkoff == 0); /* @@ -2119,7 +2111,7 @@ xfs_iunlink_update_bucket( unsigned int bucket_index, xfs_agino_t new_agino) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + struct xfs_agi *agi = agibp->b_addr; xfs_agino_t old_value; int offset; @@ -2135,7 +2127,7 @@ xfs_iunlink_update_bucket( * head of the list. */ if (old_value == new_agino) { - xfs_buf_corruption_error(agibp); + xfs_buf_mark_corrupt(agibp); return -EFSCORRUPTED; } @@ -2173,7 +2165,6 @@ xfs_iunlink_update_dinode( xfs_dinode_calc_crc(mp, dip); xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); - xfs_inobp_check(mp, ibp); } /* Set an in-core inode's unlinked pointer and return the old value. */ @@ -2193,7 +2184,7 @@ xfs_iunlink_update_inode( ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0); if (error) return error; @@ -2259,7 +2250,7 @@ xfs_iunlink( error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; /* * Get the index into the agi hash table for the list this inode will @@ -2269,12 +2260,11 @@ xfs_iunlink( next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); if (next_agino == agino || !xfs_verify_agino_or_null(mp, agno, next_agino)) { - xfs_buf_corruption_error(agibp); + xfs_buf_mark_corrupt(agibp); return -EFSCORRUPTED; } if (next_agino != NULLAGINO) { - struct xfs_perag *pag; xfs_agino_t old_agino; /* @@ -2291,9 +2281,7 @@ xfs_iunlink( * agino has been unlinked, add a backref from the next inode * back to agino. */ - pag = xfs_perag_get(mp, agno); - error = xfs_iunlink_add_backref(pag, agino, next_agino); - xfs_perag_put(pag); + error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino); if (error) return error; } @@ -2323,7 +2311,7 @@ xfs_iunlink_map_ino( return error; } - error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0); + error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0); if (error) { xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", __func__, error); @@ -2429,7 +2417,6 @@ xfs_iunlink_remove( struct xfs_buf *agibp; struct xfs_buf *last_ibp; struct xfs_dinode *last_dip = NULL; - struct xfs_perag *pag = NULL; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino; @@ -2443,7 +2430,7 @@ xfs_iunlink_remove( error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; /* * Get the index into the agi hash table for the list this inode will @@ -2473,32 +2460,22 @@ xfs_iunlink_remove( * this inode's backref to point from the next inode. 
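(Illustrative aside.) The iunlink code in this hunk manipulates on-disk singly linked lists headed in the AGI buckets, while an in-memory backref table remembers each entry's predecessor so removal does not have to walk the chain. The list surgery itself, modelled in self-contained C (NULLINO and the flat table are invented):

#include <stdint.h>

#define NULLINO		((uint32_t)~0u)		/* stands in for NULLAGINO */

struct unlinked {
	uint32_t next;				/* on-disk forward pointer */
};

/* Splice victim out, given its predecessor (NULLINO == it heads the bucket). */
static void unlink_node(uint32_t *bucket_head, struct unlinked *tab,
			uint32_t victim, uint32_t prev)
{
	uint32_t next = tab[victim].next;

	if (prev == NULLINO)
		*bucket_head = next;		/* new head of the bucket */
	else
		tab[prev].next = next;		/* point predecessor past it */
	tab[victim].next = NULLINO;		/* off every list now */
}

The refactor above also turns the tail of xfs_iunlink_remove() into direct returns once no per-AG reference needs releasing, which is why the out: label disappears.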
*/ if (next_agino != NULLAGINO) { - pag = xfs_perag_get(mp, agno); - error = xfs_iunlink_change_backref(pag, next_agino, + error = xfs_iunlink_change_backref(agibp->b_pag, next_agino, NULLAGINO); if (error) - goto out; + return error; } - if (head_agino == agino) { - /* Point the head of the list to the next unlinked inode. */ - error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, - next_agino); - if (error) - goto out; - } else { + if (head_agino != agino) { struct xfs_imap imap; xfs_agino_t prev_agino; - if (!pag) - pag = xfs_perag_get(mp, agno); - /* We need to search the list for the inode being freed. */ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, &prev_agino, &imap, &last_dip, &last_ibp, - pag); + agibp->b_pag); if (error) - goto out; + return error; /* Point the previous inode on the list to the next inode. */ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, @@ -2512,15 +2489,106 @@ xfs_iunlink_remove( * change_backref takes care of deleting the backref if * next_agino is NULLAGINO. */ - error = xfs_iunlink_change_backref(pag, agino, next_agino); - if (error) - goto out; + return xfs_iunlink_change_backref(agibp->b_pag, agino, + next_agino); } -out: - if (pag) - xfs_perag_put(pag); - return error; + /* Point the head of the list to the next unlinked inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + next_agino); +} + +/* + * Look up the inode number specified and if it is not already marked XFS_ISTALE + * mark it stale. We should only find clean inodes in this lookup that aren't + * already stale. + */ +static void +xfs_ifree_mark_inode_stale( + struct xfs_buf *bp, + struct xfs_inode *free_ip, + xfs_ino_t inum) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_perag *pag = bp->b_pag; + struct xfs_inode_log_item *iip; + struct xfs_inode *ip; + +retry: + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); + + /* Inode not in memory, nothing to do */ + if (!ip) { + rcu_read_unlock(); + return; + } + + /* + * because this is an RCU protected lookup, we could find a recently + * freed or even reallocated inode during the lookup. We need to check + * under the i_flags_lock for a valid inode here. Skip it if it is not + * valid, the wrong inode or stale. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return; + } + + /* + * Don't try to lock/unlock the current inode, but we _cannot_ skip the + * other inodes that we did not find in the list attached to the buffer + * and are not already marked stale. If we can't lock it, back off and + * retry. + */ + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + delay(1); + goto retry; + } + } + ip->i_flags |= XFS_ISTALE; + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + /* + * If we can't get the flush lock, the inode is already attached. All + * we needed to do here is mark the inode stale so buffer IO completion + * will remove it from the AIL. + */ + iip = ip->i_itemp; + if (!xfs_iflock_nowait(ip)) { + ASSERT(!list_empty(&iip->ili_item.li_bio_list)); + ASSERT(iip->ili_last_fields); + goto out_iunlock; + } + + /* + * Inodes not attached to the buffer can be released immediately. + * Everything else has to go through xfs_iflush_abort() on journal + * commit as the flock synchronises removal of the inode from the + * cluster buffer against inode reclaim. 
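(Illustrative aside.) xfs_ifree_mark_inode_stale() above is built on the canonical RCU lookup-and-revalidate idiom: a radix tree hit found under rcu_read_lock() may be a freed or even reallocated inode, so its identity must be rechecked under i_flags_lock before it is trusted. Schematically, in kernel idiom rather than as the verbatim function:

rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, agino);
if (!ip) {
	rcu_read_unlock();
	return;					/* not in memory */
}
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
	spin_unlock(&ip->i_flags_lock);		/* freed, reused, or done */
	rcu_read_unlock();
	return;
}
/* identity confirmed; act while i_flags_lock pins the inode's state */

The trylock-or-retry on the ILOCK that follows in the real function exists because a sleeping lock acquisition inside the RCU read-side critical section is not allowed; the function drops RCU, delays, and restarts the lookup instead.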
+ */ + if (!iip || list_empty(&iip->ili_item.li_bio_list)) { + xfs_ifunlock(ip); + goto out_iunlock; + } + + /* we have a dirty inode in memory that has not yet been flushed. */ + spin_lock(&iip->ili_lock); + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); + ASSERT(iip->ili_last_fields); + +out_iunlock: + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); } /* @@ -2530,26 +2598,20 @@ out: */ STATIC int xfs_ifree_cluster( - xfs_inode_t *free_ip, - xfs_trans_t *tp, + struct xfs_inode *free_ip, + struct xfs_trans *tp, struct xfs_icluster *xic) { - xfs_mount_t *mp = free_ip->i_mount; + struct xfs_mount *mp = free_ip->i_mount; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + struct xfs_buf *bp; + xfs_daddr_t blkno; + xfs_ino_t inum = xic->first_ino; int nbufs; int i, j; int ioffset; - xfs_daddr_t blkno; - xfs_buf_t *bp; - xfs_inode_t *ip; - xfs_inode_log_item_t *iip; - struct xfs_log_item *lip; - struct xfs_perag *pag; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - xfs_ino_t inum; int error; - inum = xic->first_ino; - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { @@ -2593,148 +2655,20 @@ xfs_ifree_cluster( bp->b_ops = &xfs_inode_buf_ops; /* - * Walk the inodes already attached to the buffer and mark them - * stale. These will all have the flush locks held, so an - * in-memory inode walk can't lock them. By marking them all - * stale first, we will not attempt to lock them in the loop - * below as the XFS_ISTALE flag will be set. + * Now we need to set all the cached clean inodes as XFS_ISTALE, + * too. This requires lookups, and will skip inodes that we've + * already marked XFS_ISTALE. */ - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (lip->li_type == XFS_LI_INODE) { - iip = (xfs_inode_log_item_t *)lip; - ASSERT(iip->ili_logged == 1); - lip->li_cb = xfs_istale_done; - xfs_trans_ail_copy_lsn(mp->m_ail, - &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - xfs_iflags_set(iip->ili_inode, XFS_ISTALE); - } - } - - - /* - * For each inode in memory attempt to add it to the inode - * buffer and set it up for being staled on buffer IO - * completion. This is safe as we've locked out tail pushing - * and flushing by locking the buffer. - * - * We have already marked every inode that was part of a - * transaction stale above, which means there is no point in - * even trying to lock them. - */ - for (i = 0; i < igeo->inodes_per_cluster; i++) { -retry: - rcu_read_lock(); - ip = radix_tree_lookup(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, (inum + i))); - - /* Inode not in memory, nothing to do */ - if (!ip) { - rcu_read_unlock(); - continue; - } - - /* - * because this is an RCU protected lookup, we could - * find a recently freed or even reallocated inode - * during the lookup. We need to check under the - * i_flags_lock for a valid inode here. Skip it if it - * is not valid, the wrong inode or stale. - */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum + i || - __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - continue; - } - spin_unlock(&ip->i_flags_lock); - - /* - * Don't try to lock/unlock the current inode, but we - * _cannot_ skip the other inodes that we did not find - * in the list attached to the buffer and are not - * already marked stale. If we can't lock it, back off - * and retry. 
- */ - if (ip != free_ip) { - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; - } - - /* - * Check the inode number again in case we're - * racing with freeing in xfs_reclaim_inode(). - * See the comments in that function for more - * information as to why the initial check is - * not sufficient. - */ - if (ip->i_ino != inum + i) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - rcu_read_unlock(); - continue; - } - } - rcu_read_unlock(); - - xfs_iflock(ip); - xfs_iflags_set(ip, XFS_ISTALE); - - /* - * we don't need to attach clean inodes or those only - * with unlogged changes (which we throw away, anyway). - */ - iip = ip->i_itemp; - if (!iip || xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - - iip->ili_last_fields = iip->ili_fields; - iip->ili_fields = 0; - iip->ili_fsync_fields = 0; - iip->ili_logged = 1; - xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - - xfs_buf_attach_iodone(bp, xfs_istale_done, - &iip->ili_item); - - if (ip != free_ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + for (i = 0; i < igeo->inodes_per_cluster; i++) + xfs_ifree_mark_inode_stale(bp, free_ip, inum + i); xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); } - - xfs_perag_put(pag); return 0; } /* - * Free any local-format buffers sitting around before we reset to - * extents format. - */ -static inline void -xfs_ifree_local_data( - struct xfs_inode *ip, - int whichfork) -{ - struct xfs_ifork *ifp; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return; - - ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); -} - -/* * This is called to return an inode to the inode free list. * The inode should already be truncated to 0 length and have * no pages associated with it. This routine also assumes that @@ -2751,11 +2685,11 @@ xfs_ifree( { int error; struct xfs_icluster xic = { 0 }; + struct xfs_inode_log_item *iip = ip->i_itemp; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(VFS_I(ip)->i_nlink == 0); - ASSERT(ip->i_d.di_nextents == 0); - ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_df.if_nextents == 0); ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); ASSERT(ip->i_d.di_nblocks == 0); @@ -2770,19 +2704,28 @@ xfs_ifree( if (error) return error; - xfs_ifree_local_data(ip, XFS_DATA_FORK); - xfs_ifree_local_data(ip, XFS_ATTR_FORK); + /* + * Free any local-format data sitting around before we reset the + * data fork to extents format. Note that the attr fork data has + * already been freed by xfs_attr_inactive. 
+ */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + kmem_free(ip->i_df.if_u1.if_data); + ip->i_df.if_u1.if_data = NULL; + ip->i_df.if_bytes = 0; + } VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_flags2 = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ - ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; /* Don't attempt to replay owner changes for a deleted inode */ - ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER); + spin_lock(&iip->ili_lock); + iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); + spin_unlock(&iip->ili_lock); /* * Bump the generation count so no one will be confused @@ -3162,7 +3105,7 @@ out_trans_abort: /* * xfs_rename_alloc_whiteout() * - * Return a referenced, unlinked, unlocked inode that that can be used as a + * Return a referenced, unlinked, unlocked inode that can be used as a * whiteout in a rename transaction. We use a tmpfile inode here so that if we * crash between allocating the inode and linking it into the rename transaction * recovery will free the inode and we won't leak it. @@ -3489,374 +3432,76 @@ out_release_wip: return error; } -STATIC int -xfs_iflush_cluster( - struct xfs_inode *ip, - struct xfs_buf *bp) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - unsigned long first_index, mask; - int cilist_size; - struct xfs_inode **cilist; - struct xfs_inode *cip; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - int nr_found; - int clcount = 0; - int i; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - - cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *); - cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); - if (!cilist) - goto out_put; - - mask = ~(igeo->inodes_per_cluster - 1); - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; - rcu_read_lock(); - /* really need a gang lookup range call here */ - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, - first_index, igeo->inodes_per_cluster); - if (nr_found == 0) - goto out_free; - - for (i = 0; i < nr_found; i++) { - cip = cilist[i]; - if (cip == ip) - continue; - - /* - * because this is an RCU protected lookup, we could find a - * recently freed or even reallocated inode during the lookup. - * We need to check under the i_flags_lock for a valid inode - * here. Skip it if it is not valid or the wrong inode. - */ - spin_lock(&cip->i_flags_lock); - if (!cip->i_ino || - __xfs_iflags_test(cip, XFS_ISTALE)) { - spin_unlock(&cip->i_flags_lock); - continue; - } - - /* - * Once we fall off the end of the cluster, no point checking - * any more inodes in the list because they will also all be - * outside the cluster. - */ - if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { - spin_unlock(&cip->i_flags_lock); - break; - } - spin_unlock(&cip->i_flags_lock); - - /* - * Do an un-protected check to see if the inode is dirty and - * is a candidate for flushing. These checks will be repeated - * later after the appropriate locks are acquired. - */ - if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) - continue; - - /* - * Try to get locks. If any are unavailable or it is pinned, - * then this inode cannot be flushed and is skipped. 
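(Illustrative aside.) The removed clustering loop that follows was built on a trylock cascade: every lock is only try-acquired, and each failure unwinds exactly what has been taken so the scan can skip the inode rather than block. Its skeleton, using the real lock helpers of this era:

static bool try_grab_for_flush(struct xfs_inode *ip)
{
	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
		return false;			/* nothing taken yet */
	if (!xfs_iflock_nowait(ip)) {
		xfs_iunlock(ip, XFS_ILOCK_SHARED);	/* undo step one */
		return false;
	}
	if (xfs_ipincount(ip)) {
		xfs_ifunlock(ip);		/* undo in reverse order */
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return false;
	}
	return true;	/* caller flushes, then releases in reverse order */
}

The replacement xfs_iflush_cluster() further down keeps exactly this discipline but walks bp->b_li_list instead of doing a radix tree gang lookup.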
- */ - - if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) - continue; - if (!xfs_iflock_nowait(cip)) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - if (xfs_ipincount(cip)) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - - /* - * Check the inode number again, just to be certain we are not - * racing with freeing in xfs_reclaim_inode(). See the comments - * in that function for more information as to why the initial - * check is not sufficient. - */ - if (!cip->i_ino) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - /* - * arriving here means that this inode can be flushed. First - * re-check that it's dirty before flushing. - */ - if (!xfs_inode_clean(cip)) { - int error; - error = xfs_iflush_int(cip, bp); - if (error) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - goto cluster_corrupt_out; - } - clcount++; - } else { - xfs_ifunlock(cip); - } - xfs_iunlock(cip, XFS_ILOCK_SHARED); - } - - if (clcount) { - XFS_STATS_INC(mp, xs_icluster_flushcnt); - XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); - } - -out_free: - rcu_read_unlock(); - kmem_free(cilist); -out_put: - xfs_perag_put(pag); - return 0; - - -cluster_corrupt_out: - /* - * Corruption detected in the clustering loop. Invalidate the - * inode buffer and shut down the filesystem. - */ - rcu_read_unlock(); - - /* - * We'll always have an inode attached to the buffer for completion - * process by the time we are called from xfs_iflush(). Hence we have - * always need to do IO completion processing to abort the inodes - * attached to the buffer. handle them just like the shutdown case in - * xfs_buf_submit(). - */ - ASSERT(bp->b_iodone); - bp->b_flags |= XBF_ASYNC; - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend(bp); - - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - - /* abort the corrupt inode, as it was not attached to the buffer */ - xfs_iflush_abort(cip, false); - kmem_free(cilist); - xfs_perag_put(pag); - return -EFSCORRUPTED; -} - -/* - * Flush dirty inode metadata into the backing buffer. - * - * The caller must have the inode lock and the inode flush lock held. The - * inode lock will still be held upon return to the caller, and the inode - * flush lock will be released after the inode has reached the disk. - * - * The caller must write out the buffer returned in *bpp and release it. - */ -int +static int xfs_iflush( struct xfs_inode *ip, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_buf *bp = NULL; - struct xfs_dinode *dip; - int error; - - XFS_STATS_INC(mp, xs_iflush_count); - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - - *bpp = NULL; - - xfs_iunpin_wait(ip); - - /* - * For stale inodes we cannot rely on the backing buffer remaining - * stale in cache for the remaining life of the stale inode and so - * xfs_imap_to_bp() below may give us a buffer that no longer contains - * inodes below. We have to check this after ensuring the inode is - * unpinned so that it is safe to reclaim the stale inode after the - * flush call. - */ - if (xfs_iflags_test(ip, XFS_ISTALE)) { - xfs_ifunlock(ip); - return 0; - } - - /* - * This may have been unpinned because the filesystem is shutting - * down forcibly. If that's the case we must not write this inode - * to disk, because the log record didn't make it to disk. 
- * - * We also have to remove the log item from the AIL in this case, - * as we wait for an empty AIL as part of the unmount process. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - error = -EIO; - goto abort_out; - } - - /* - * Get the buffer containing the on-disk inode. We are doing a try-lock - * operation here, so we may get an EAGAIN error. In that case, we - * simply want to return with the inode still dirty. - * - * If we get any other error, we effectively have a corruption situation - * and we cannot flush the inode, so we treat it the same as failing - * xfs_iflush_int(). - */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, - 0); - if (error == -EAGAIN) { - xfs_ifunlock(ip); - return error; - } - if (error) - goto corrupt_out; - - /* - * First flush out the inode that xfs_iflush was called with. - */ - error = xfs_iflush_int(ip, bp); - if (error) - goto corrupt_out; - - /* - * If the buffer is pinned then push on the log now so we won't - * get stuck waiting in the write for too long. - */ - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - - /* - * inode clustering: try to gather other inodes into this write - * - * Note: Any error during clustering will result in the filesystem - * being shut down and completion callbacks run on the cluster buffer. - * As we have already flushed and attached this inode to the buffer, - * it has already been aborted and released by xfs_iflush_cluster() and - * so we have no further error handling to do here. - */ - error = xfs_iflush_cluster(ip, bp); - if (error) - return error; - - *bpp = bp; - return 0; - -corrupt_out: - if (bp) - xfs_buf_relse(bp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -abort_out: - /* abort the corrupt inode, as it was not attached to the buffer */ - xfs_iflush_abort(ip, false); - return error; -} - -/* - * If there are inline format data / attr forks attached to this inode, - * make sure they're not corrupt. - */ -bool -xfs_inode_verify_forks( - struct xfs_inode *ip) -{ - struct xfs_ifork *ifp; - xfs_failaddr_t fa; - - fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops); - if (fa) { - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", - ifp->if_u1.if_data, ifp->if_bytes, fa); - return false; - } - - fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops); - if (fa) { - ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", - ifp ? ifp->if_u1.if_data : NULL, - ifp ? ifp->if_bytes : 0, fa); - return false; - } - return true; -} - -STATIC int -xfs_iflush_int( - struct xfs_inode *ip, struct xfs_buf *bp) { struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_dinode *dip; struct xfs_mount *mp = ip->i_mount; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - ASSERT(iip != NULL && iip->ili_fields != 0); - ASSERT(ip->i_d.di_version > 1); + ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || + ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ASSERT(iip->ili_item.li_buf == bp); - /* set *dip = inode's place in the buffer */ dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); + /* + * We don't flush the inode if any of the following checks fail, but we + * do still update the log item and attach to the backing buffer as if + * the flush happened. 
This is a formality to facilitate predictable + * error handling as the caller will shutdown and fail the buffer. + */ + error = -EFSCORRUPTED; if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); - goto corrupt_out; + goto flush_out; } if (S_ISREG(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( - (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && - (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), + ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE, mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); - goto corrupt_out; + goto flush_out; } } else if (S_ISDIR(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( - (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && - (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && - (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), + ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE && + ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); - goto corrupt_out; + goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > + if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %Lu, " "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, __func__, ip->i_ino, - ip->i_d.di_nextents + ip->i_d.di_anextents, + ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), ip->i_d.di_nblocks, ip); - goto corrupt_out; + goto flush_out; } if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_d.di_forkoff, ip); - goto corrupt_out; + goto flush_out; } /* @@ -3868,12 +3513,19 @@ xfs_iflush_int( * backwards compatibility with old kernels that predate logging all * inode changes. */ - if (ip->i_d.di_version < 3) + if (!xfs_sb_version_has_v3inode(&mp->m_sb)) ip->i_d.di_flushiter++; - /* Check the inline fork data before we write out. */ - if (!xfs_inode_verify_forks(ip)) - goto corrupt_out; + /* + * If there are inline format data / attr forks attached to this inode, + * make sure they are not corrupt. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && + xfs_ifork_verify_local_data(ip)) + goto flush_out; + if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL && + xfs_ifork_verify_local_attr(ip)) + goto flush_out; /* * Copy the dirty parts of the inode into the on-disk inode. We always @@ -3889,7 +3541,6 @@ xfs_iflush_int( xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); if (XFS_IFORK_Q(ip)) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); - xfs_inobp_check(mp, bp); /* * We've recorded everything logged in the inode, so we'd like to clear @@ -3906,41 +3557,148 @@ xfs_iflush_int( * know that the information those bits represent is permanently on * disk. As long as the flush completes before the inode is logged * again, then both ili_fields and ili_last_fields will be cleared. 
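(Illustrative aside.) The ili_fields/ili_last_fields handling described above is a generic "dirty bits in flight" handoff, now performed under the new iip->ili_lock. In portable form, with a pthread mutex standing in for the spinlock and all names invented:

#include <pthread.h>
#include <stdint.h>

struct log_item {
	pthread_mutex_t	lock;		/* stands in for iip->ili_lock */
	uint32_t	dirty;		/* ~ ili_fields: still accumulating */
	uint32_t	in_flight;	/* ~ ili_last_fields: being written */
};

static void writeback_start(struct log_item *li)
{
	pthread_mutex_lock(&li->lock);
	li->in_flight = li->dirty;	/* snapshot what this flush covers */
	li->dirty = 0;			/* new dirtying starts from scratch */
	pthread_mutex_unlock(&li->lock);
}

static void writeback_done(struct log_item *li)
{
	pthread_mutex_lock(&li->lock);
	li->in_flight = 0;		/* the snapshot is now on disk */
	pthread_mutex_unlock(&li->lock);
}

Splitting the bits this way is what lets new changes arrive while a flush is in progress without either losing them or re-flushing what is already in flight.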
- * - * We can play with the ili_fields bits here, because the inode lock - * must be held exclusively in order to set bits there and the flush - * lock protects the ili_last_fields bits. Set ili_logged so the flush - * done routine can tell whether or not to look in the AIL. Also, store - * the current LSN of the inode so that we can tell whether the item has - * moved in the AIL from xfs_iflush_done(). In order to read the lsn we - * need the AIL lock, because it is a 64 bit value that cannot be read - * atomically. */ + error = 0; +flush_out: + spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; - iip->ili_logged = 1; + spin_unlock(&iip->ili_lock); + /* + * Store the current LSN of the inode so that we can tell whether the + * item has moved in the AIL from xfs_iflush_done(). + */ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); + /* generate the checksum. */ + xfs_dinode_calc_crc(mp, dip); + return error; +} + +/* + * Non-blocking flush of dirty inode metadata into the backing buffer. + * + * The caller must have a reference to the inode and hold the cluster buffer + * locked. The function will walk across all the inodes on the cluster buffer it + * can find and lock without blocking, and flush them to the cluster buffer. + * + * On successful flushing of at least one inode, the caller must write out the + * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and + * the caller needs to release the buffer. On failure, the filesystem will be + * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED + * will be returned. + */ +int +xfs_iflush_cluster( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_log_item *lip, *n; + struct xfs_inode *ip; + struct xfs_inode_log_item *iip; + int clcount = 0; + int error = 0; + /* - * Attach the function xfs_iflush_done to the inode's - * buffer. This will remove the inode from the AIL - * and unlock the inode's flush lock when the inode is - * completely written to disk. + * We must use the safe variant here as on shutdown xfs_iflush_abort() + * can remove itself from the list. */ - xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { + iip = (struct xfs_inode_log_item *)lip; + ip = iip->ili_inode; - /* generate the checksum. */ - xfs_dinode_calc_crc(mp, dip); + /* + * Quick and dirty check to avoid locks if possible. + */ + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) + continue; + if (xfs_ipincount(ip)) + continue; + + /* + * The inode is still attached to the buffer, which means it is + * dirty but reclaim might try to grab it. Check carefully for + * that, and grab the ilock while still holding the i_flags_lock + * to guarantee reclaim will not be able to reclaim this inode + * once we drop the i_flags_lock. + */ + spin_lock(&ip->i_flags_lock); + ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) { + spin_unlock(&ip->i_flags_lock); + continue; + } + + /* + * ILOCK will pin the inode against reclaim and prevent + * concurrent transactions modifying the inode while we are + * flushing the inode. 
+ */ + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + spin_unlock(&ip->i_flags_lock); + continue; + } + spin_unlock(&ip->i_flags_lock); - /* - * Attach the function xfs_iflush_done to the inode's - * buffer. This will remove the inode from the AIL - * and unlock the inode's flush lock when the inode is - * completely written to disk. + /* + * Skip inodes that are already flush locked as they have + * already been written to the buffer. + */ + if (!xfs_iflock_nowait(ip)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + continue; + } + + /* + * Abort flushing this inode if we are shut down because the + * inode may not currently be in the AIL. This can occur when + * log I/O failure unpins the inode without inserting into the + * AIL, leaving a dirty/unpinned inode attached to the buffer + * that otherwise looks like it should be flushed. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + xfs_iunpin_wait(ip); + /* xfs_iflush_abort() drops the flush lock */ + xfs_iflush_abort(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + error = -EIO; + continue; + } + + /* don't block waiting on a log force to unpin dirty inodes */ + if (xfs_ipincount(ip)) { + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + continue; + } + + if (!xfs_inode_clean(ip)) + error = xfs_iflush(ip, bp); + else + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (error) + break; + clcount++; + } + + if (error) { + bp->b_flags |= XBF_ASYNC; + xfs_buf_ioend_fail(bp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + + if (!clcount) + return -EAGAIN; + + XFS_STATS_INC(mp, xs_icluster_flushcnt); + XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); return 0; -corrupt_out: - return -EFSCORRUPTED; } /* Release an inode. */ @@ -3951,3 +3709,115 @@ xfs_irele( trace_xfs_irele(ip, _RET_IP_); iput(VFS_I(ip)); } + +/* + * Ensure all committed transactions touching the inode are written to the log. + */ +int +xfs_log_force_inode( + struct xfs_inode *ip) +{ + xfs_lsn_t lsn = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); +} + +/* + * Grab the exclusive iolock for a data copy from src to dest, making sure to + * abide by vfs locking order (lowest pointer value goes first) and breaking the + * layout leases before proceeding. The loop is needed because we cannot call + * the blocking break_layout() with the iolocks held, and therefore have to + * back out both locks. + */ +static int +xfs_iolock_two_inodes_and_break_layout( + struct inode *src, + struct inode *dest) +{ + int error; + + if (src > dest) + swap(src, dest); + +retry: + /* Wait to break both inodes' layouts before we start locking. */ + error = break_layout(src, true); + if (error) + return error; + if (src != dest) { + error = break_layout(dest, true); + if (error) + return error; + } + + /* Lock one inode and make sure nobody got in and leased it. */ + inode_lock(src); + error = break_layout(src, false); + if (error) { + inode_unlock(src); + if (error == -EWOULDBLOCK) + goto retry; + return error; + } + + if (src == dest) + return 0; + + /* Lock the other inode and make sure nobody got in and leased it. */ + inode_lock_nested(dest, I_MUTEX_NONDIR2); + error = break_layout(dest, false); + if (error) { + inode_unlock(src); + inode_unlock(dest); + if (error == -EWOULDBLOCK) + goto retry; + return error; + } + + return 0; +} + +/* + * Lock two inodes so that userspace cannot initiate I/O via file syscalls or + * mmap activity.
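(Illustrative aside.) xfs_iolock_two_inodes_and_break_layout() above orders the two inode locks by pointer value, so that any two tasks locking the same pair agree on the order and cannot deadlock against each other. The core convention in self-contained C:

#include <pthread.h>

static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {			/* same object: lock it once */
		pthread_mutex_lock(a);
		return;
	}
	if (a > b) {			/* normalise to ascending address */
		pthread_mutex_t *tmp = a;

		a = b;
		b = tmp;
	}
	pthread_mutex_lock(a);		/* always lowest address first */
	pthread_mutex_lock(b);
}

The retry loop in the real function exists because the blocking variant of break_layout() cannot be called with the locks held: if a lease shows up after locking, everything is backed out and the whole sequence starts again.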
+ */ +int +xfs_ilock2_io_mmap( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + int ret; + + ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); + if (ret) + return ret; + if (ip1 == ip2) + xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); + else + xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, + ip2, XFS_MMAPLOCK_EXCL); + return 0; +} + +/* Unlock both inodes to allow IO and mmap activity. */ +void +xfs_iunlock2_io_mmap( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + bool same_inode = (ip1 == ip2); + + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + if (!same_inode) + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + inode_unlock(VFS_I(ip2)); + if (!same_inode) + inode_unlock(VFS_I(ip1)); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 492e53992fa9..e9a8bb184d1f 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -57,9 +57,6 @@ typedef struct xfs_inode { struct xfs_icdinode i_d; /* most of ondisk inode */ - xfs_extnum_t i_cnextents; /* # of extents in cow fork */ - unsigned int i_cformat; /* format of cow fork */ - /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ @@ -218,8 +215,7 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) #define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) -#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ -#define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */ +#define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ /* * If this unlinked inode is in the middle of recovery, don't let drop_inode * truncate and free the inode. This can happen if we iget the inode during @@ -426,10 +422,11 @@ int xfs_itruncate_extents_flags(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t, int); void xfs_iext_realloc(xfs_inode_t *, int, int); +int xfs_log_force_inode(struct xfs_inode *ip); void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) -int xfs_iflush(struct xfs_inode *, struct xfs_buf **); +int xfs_iflush_cluster(struct xfs_buf *); void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); @@ -466,6 +463,7 @@ int xfs_break_layouts(struct inode *inode, uint *iolock, /* from xfs_iops.c */ extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); +extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); /* * When setting up a newly allocated inode, we need to call @@ -496,11 +494,12 @@ extern struct kmem_zone *xfs_inode_zone; /* The default CoW extent size hint. 
*/ #define XFS_DEFAULT_COWEXTSZ_HINT 32 -bool xfs_inode_verify_forks(struct xfs_inode *ip); - int xfs_iunlink_init(struct xfs_perag *pag); void xfs_iunlink_destroy(struct xfs_perag *pag); void xfs_end_io(struct work_struct *work); +int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); +void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 8bd5d0de6321..6c65938cee1c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -36,10 +36,10 @@ xfs_inode_item_data_fork_size( { struct xfs_inode *ip = iip->ili_inode; - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_DEXT) && - ip->i_d.di_nextents > 0 && + ip->i_df.if_nextents > 0 && ip->i_df.if_bytes > 0) { /* worst case, doesn't subtract delalloc extents */ *nbytes += XFS_IFORK_DSIZE(ip); @@ -77,10 +77,10 @@ xfs_inode_item_attr_fork_size( { struct xfs_inode *ip = iip->ili_inode; - switch (ip->i_d.di_aformat) { + switch (ip->i_afp->if_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_AEXT) && - ip->i_d.di_anextents > 0 && + ip->i_afp->if_nextents > 0 && ip->i_afp->if_bytes > 0) { /* worst case, doesn't subtract unused space */ *nbytes += XFS_IFORK_ASIZE(ip); @@ -125,7 +125,7 @@ xfs_inode_item_size( *nvecs += 2; *nbytes += sizeof(struct xfs_inode_log_format) + - xfs_log_dinode_size(ip->i_d.di_version); + xfs_log_dinode_size(ip->i_mount); xfs_inode_item_data_fork_size(iip, nvecs, nbytes); if (XFS_IFORK_Q(ip)) @@ -142,13 +142,13 @@ xfs_inode_item_format_data_fork( struct xfs_inode *ip = iip->ili_inode; size_t data_bytes; - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DEXT) && - ip->i_d.di_nextents > 0 && + ip->i_df.if_nextents > 0 && ip->i_df.if_bytes > 0) { struct xfs_bmbt_rec *p; @@ -191,7 +191,7 @@ xfs_inode_item_format_data_fork( ip->i_df.if_bytes > 0) { /* * Round i_bytes up to a word boundary. - * The underlying memory is guaranteed to + * The underlying memory is guaranteed * to be there by xfs_idata_realloc(). */ data_bytes = roundup(ip->i_df.if_bytes, 4); @@ -227,18 +227,18 @@ xfs_inode_item_format_attr_fork( struct xfs_inode *ip = iip->ili_inode; size_t data_bytes; - switch (ip->i_d.di_aformat) { + switch (ip->i_afp->if_format) { case XFS_DINODE_FMT_EXTENTS: iip->ili_fields &= ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); if ((iip->ili_fields & XFS_ILOG_AEXT) && - ip->i_d.di_anextents > 0 && + ip->i_afp->if_nextents > 0 && ip->i_afp->if_bytes > 0) { struct xfs_bmbt_rec *p; ASSERT(xfs_iext_count(ip->i_afp) == - ip->i_d.di_anextents); + ip->i_afp->if_nextents); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); @@ -275,7 +275,7 @@ xfs_inode_item_format_attr_fork( ip->i_afp->if_bytes > 0) { /* * Round i_bytes up to a word boundary. - * The underlying memory is guaranteed to + * The underlying memory is guaranteed * to be there by xfs_idata_realloc(). 
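(Illustrative aside.) The logging changes in this hunk lean on NULL-tolerant fork accessors, since ip->i_afp simply does not exist for inodes without an attribute fork. Their likely shape, written out with local names for illustration (the real definitions live in libxfs in this series):

static inline int8_t ifork_format(const struct xfs_ifork *ifp)
{
	if (!ifp)
		return XFS_DINODE_FMT_EXTENTS;	/* absent fork: empty extents */
	return ifp->if_format;
}

static inline xfs_extnum_t ifork_nextents(const struct xfs_ifork *ifp)
{
	if (!ifp)
		return 0;			/* absent fork: no extents */
	return ifp->if_nextents;
}

This is why the hunks above can call xfs_ifork_nextents(ip->i_afp) unconditionally where the old code read ip->i_d.di_anextents.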
*/ data_bytes = roundup(ip->i_afp->if_bytes, 4); @@ -305,11 +305,9 @@ xfs_inode_to_log_dinode( struct inode *inode = VFS_I(ip); to->di_magic = XFS_DINODE_MAGIC; - - to->di_version = from->di_version; - to->di_format = from->di_format; - to->di_uid = from->di_uid; - to->di_gid = from->di_gid; + to->di_format = xfs_ifork_format(&ip->i_df); + to->di_uid = i_uid_read(inode); + to->di_gid = i_gid_read(inode); to->di_projid_lo = from->di_projid & 0xffff; to->di_projid_hi = from->di_projid >> 16; @@ -328,10 +326,10 @@ xfs_inode_to_log_dinode( to->di_size = from->di_size; to->di_nblocks = from->di_nblocks; to->di_extsize = from->di_extsize; - to->di_nextents = from->di_nextents; - to->di_anextents = from->di_anextents; + to->di_nextents = xfs_ifork_nextents(&ip->i_df); + to->di_anextents = xfs_ifork_nextents(ip->i_afp); to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; + to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_dmevmask = from->di_dmevmask; to->di_dmstate = from->di_dmstate; to->di_flags = from->di_flags; @@ -339,7 +337,8 @@ xfs_inode_to_log_dinode( /* log a dummy value to ensure log structure is fully initialised */ to->di_next_unlinked = NULLAGINO; - if (from->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + to->di_version = 3; to->di_changecount = inode_peek_iversion(inode); to->di_crtime.t_sec = from->di_crtime.tv_sec; to->di_crtime.t_nsec = from->di_crtime.tv_nsec; @@ -351,6 +350,7 @@ xfs_inode_to_log_dinode( uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_flushiter = 0; } else { + to->di_version = 2; to->di_flushiter = from->di_flushiter; } } @@ -370,7 +370,7 @@ xfs_inode_item_format_core( dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE); xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn); - xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version)); + xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_mount)); } /* @@ -395,8 +395,6 @@ xfs_inode_item_format( struct xfs_log_iovec *vecp = NULL; struct xfs_inode_log_format *ilf; - ASSERT(ip->i_d.di_version > 1); - ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); ilf->ilf_type = XFS_LI_INODE; ilf->ilf_ino = ip->i_ino; @@ -441,6 +439,7 @@ xfs_inode_item_pin( struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(lip->li_buf); trace_xfs_inode_pin(ip, _RET_IP_); atomic_inc(&ip->i_pincount); @@ -452,6 +451,12 @@ xfs_inode_item_pin( * item which was previously pinned with a call to xfs_inode_item_pin(). * * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. + * + * Note that unpin can race with inode cluster buffer freeing marking the buffer + * stale. In that case, flush completions are run from the buffer unpin call, + * which may happen before the inode is unpinned. If we lose the race, there + * will be no buffer attached to the log item, but the inode will be marked + * XFS_ISTALE. 
*/ STATIC void xfs_inode_item_unpin( @@ -461,28 +466,12 @@ xfs_inode_item_unpin( struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; trace_xfs_inode_unpin(ip, _RET_IP_); + ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE)); ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } -/* - * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer - * have been failed during writeback - * - * This informs the AIL that the inode is already flush locked on the next push, - * and acquires a hold on the buffer to ensure that it isn't reclaimed before - * dirty data makes it to disk. - */ -STATIC void -xfs_inode_item_error( - struct xfs_log_item *lip, - struct xfs_buf *bp) -{ - ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); - xfs_set_li_failed(lip, bp); -} - STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, @@ -496,69 +485,44 @@ xfs_inode_item_push( uint rval = XFS_ITEM_SUCCESS; int error; - if (xfs_ipincount(ip) > 0) - return XFS_ITEM_PINNED; - - /* - * The buffer containing this item failed to be written back - * previously. Resubmit the buffer for IO. - */ - if (test_bit(XFS_LI_FAILED, &lip->li_flags)) { - if (!xfs_buf_trylock(bp)) - return XFS_ITEM_LOCKED; + ASSERT(iip->ili_item.li_buf); - if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list)) - rval = XFS_ITEM_FLUSHING; + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || + (ip->i_flags & XFS_ISTALE)) + return XFS_ITEM_PINNED; - xfs_buf_unlock(bp); - return rval; - } + /* If the inode is already flush locked, we're already flushing. */ + if (xfs_isiflocked(ip)) + return XFS_ITEM_FLUSHING; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) + if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; - /* - * Re-check the pincount now that we stabilized the value by - * taking the ilock. - */ - if (xfs_ipincount(ip) > 0) { - rval = XFS_ITEM_PINNED; - goto out_unlock; - } - - /* - * Stale inode items should force out the iclog. - */ - if (ip->i_flags & XFS_ISTALE) { - rval = XFS_ITEM_PINNED; - goto out_unlock; - } + spin_unlock(&lip->li_ailp->ail_lock); /* - * Someone else is already flushing the inode. Nothing we can do - * here but wait for the flush to finish and remove the item from - * the AIL. + * We need to hold a reference for flushing the cluster buffer as it may + * fail the buffer without IO submission. In which case, we better get a + * reference for that completion because otherwise we don't get a + * reference for IO until we queue the buffer for delwri submission. */ - if (!xfs_iflock_nowait(ip)) { - rval = XFS_ITEM_FLUSHING; - goto out_unlock; - } - - ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); - ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); - - spin_unlock(&lip->li_ailp->ail_lock); - - error = xfs_iflush(ip, &bp); + xfs_buf_hold(bp); + error = xfs_iflush_cluster(bp); if (!error) { if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); + } else { + /* + * Release the buffer if we were unable to flush anything. On + * any other error, the buffer has already been released. 
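The rewritten xfs_inode_item_push() above is easiest to read as an ordered decision list. The sketch below mirrors the order of its checks with the locking and flush work stubbed out; the helper is illustrative, the return-code meanings are the real ones:

#include <stdbool.h>

enum push_result {
    ITEM_SUCCESS,   /* flushed; buffer queued for delwri writeback */
    ITEM_PINNED,    /* caller should force the log and retry later */
    ITEM_LOCKED,    /* contention; retry on a later AIL push */
    ITEM_FLUSHING,  /* a flush is already in progress elsewhere */
};

static enum push_result
push_decision(bool inode_pinned, bool buffer_pinned, bool inode_stale,
              bool inode_flush_locked, bool got_buffer_trylock)
{
    if (inode_pinned || buffer_pinned || inode_stale)
        return ITEM_PINNED;
    if (inode_flush_locked)
        return ITEM_FLUSHING;
    if (!got_buffer_trylock)
        return ITEM_LOCKED;
    /* drop the AIL lock, hold the buffer, xfs_iflush_cluster() */
    return ITEM_SUCCESS;
}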
+ */ + if (error == -EAGAIN) + xfs_buf_relse(bp); + rval = XFS_ITEM_LOCKED; } spin_lock(&lip->li_ailp->ail_lock); -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_SHARED); return rval; } @@ -637,7 +601,6 @@ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, .iop_committing = xfs_inode_item_committing, - .iop_error = xfs_inode_item_error }; @@ -652,9 +615,11 @@ xfs_inode_item_init( struct xfs_inode_log_item *iip; ASSERT(ip->i_itemp == NULL); - iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0); + iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_zone, + GFP_KERNEL | __GFP_NOFAIL); iip->ili_inode = ip; + spin_lock_init(&iip->ili_lock); xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, &xfs_inode_item_ops); } @@ -664,112 +629,129 @@ xfs_inode_item_init( */ void xfs_inode_item_destroy( - xfs_inode_t *ip) + struct xfs_inode *ip) { - kmem_free(ip->i_itemp->ili_item.li_lv_shadow); - kmem_cache_free(xfs_ili_zone, ip->i_itemp); + struct xfs_inode_log_item *iip = ip->i_itemp; + + ASSERT(iip->ili_item.li_buf == NULL); + + ip->i_itemp = NULL; + kmem_free(iip->ili_item.li_lv_shadow); + kmem_cache_free(xfs_ili_zone, iip); } /* - * This is the inode flushing I/O completion routine. It is called - * from interrupt level when the buffer containing the inode is - * flushed to disk. It is responsible for removing the inode item - * from the AIL if it has not been re-logged, and unlocking the inode's - * flush lock. - * - * To reduce AIL lock traffic as much as possible, we scan the buffer log item - * list for other inodes that will run this function. We remove them from the - * buffer list so we can process all the inode IO completions in one AIL lock - * traversal. + * We only want to pull the item from the AIL if it is actually there + * and its location in the log has not changed since we started the + * flush. Thus, we only bother if the inode's lsn has not changed. */ -void -xfs_iflush_done( - struct xfs_buf *bp, - struct xfs_log_item *lip) +static void +xfs_iflush_ail_updates( + struct xfs_ail *ailp, + struct list_head *list) { - struct xfs_inode_log_item *iip; - struct xfs_log_item *blip, *n; - struct xfs_ail *ailp = lip->li_ailp; - int need_ail = 0; - LIST_HEAD(tmp); + struct xfs_log_item *lip; + xfs_lsn_t tail_lsn = 0; - /* - * Scan the buffer IO completions for other inodes being completed and - * attach them to the current inode log item. - */ - - list_add_tail(&lip->li_bio_list, &tmp); + /* this is an opencoded batch version of xfs_trans_ail_delete */ + spin_lock(&ailp->ail_lock); + list_for_each_entry(lip, list, li_bio_list) { + xfs_lsn_t lsn; - list_for_each_entry_safe(blip, n, &bp->b_li_list, li_bio_list) { - if (lip->li_cb != xfs_iflush_done) + clear_bit(XFS_LI_FAILED, &lip->li_flags); + if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn) continue; - list_move_tail(&blip->li_bio_list, &tmp); + lsn = xfs_ail_delete_one(ailp, lip); + if (!tail_lsn && lsn) + tail_lsn = lsn; + } + xfs_ail_update_finish(ailp, tail_lsn); +} + +/* + * Walk the list of inodes that have completed their IOs. If they are clean + * remove them from the list and dissociate them from the buffer. Buffers that + * are still dirty remain linked to the buffer and on the list. Caller must + * handle them appropriately. 
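xfs_iflush_ail_updates() above is, as its own comment says, an open-coded batch version of xfs_trans_ail_delete(): take the AIL lock once, delete every qualifying item, remember only the first LSN that actually moved the log tail, and let a single xfs_ail_update_finish() call do the wakeups. The general shape, with placeholder types and the two AIL primitives declared rather than implemented:

#include <pthread.h>
#include <stdint.h>

typedef uint64_t lsn_t;

struct ail {
    pthread_mutex_t lock;
    /* ... tail tracking, waiters ... */
};

/* Returns a non-zero LSN only when removing the item moved the tail. */
lsn_t ail_delete_one(struct ail *ailp, void *item);
/* Performs the tail/wakeup processing once and drops ailp->lock. */
void ail_update_finish(struct ail *ailp, lsn_t tail_lsn);

static void
ail_delete_batch(struct ail *ailp, void **items, int nr)
{
    lsn_t tail_lsn = 0;
    int i;

    pthread_mutex_lock(&ailp->lock);
    for (i = 0; i < nr; i++) {
        lsn_t lsn = ail_delete_one(ailp, items[i]);

        /* only the first tail movement matters for the wakeup */
        if (!tail_lsn && lsn)
            tail_lsn = lsn;
    }
    ail_update_finish(ailp, tail_lsn);
}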
+ */ +static void +xfs_iflush_finish( + struct xfs_buf *bp, + struct list_head *list) +{ + struct xfs_log_item *lip, *n; + + list_for_each_entry_safe(lip, n, list, li_bio_list) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + bool drop_buffer = false; + + spin_lock(&iip->ili_lock); + /* - * while we have the item, do the unlocked check for needing - * the AIL lock. + * Remove the reference to the cluster buffer if the inode is + * clean in memory and drop the buffer reference once we've + * dropped the locks we hold. */ - iip = INODE_ITEM(blip); - if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || - test_bit(XFS_LI_FAILED, &blip->li_flags)) - need_ail++; + ASSERT(iip->ili_item.li_buf == bp); + if (!iip->ili_fields) { + iip->ili_item.li_buf = NULL; + list_del_init(&lip->li_bio_list); + drop_buffer = true; + } + iip->ili_last_fields = 0; + iip->ili_flush_lsn = 0; + spin_unlock(&iip->ili_lock); + xfs_ifunlock(iip->ili_inode); + if (drop_buffer) + xfs_buf_rele(bp); } +} - /* make sure we capture the state of the initial inode. */ - iip = INODE_ITEM(lip); - if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || - test_bit(XFS_LI_FAILED, &lip->li_flags)) - need_ail++; +/* + * Inode buffer IO completion routine. It is responsible for removing inodes + * attached to the buffer from the AIL if they have not been re-logged, as well + * as completing the flush and unlocking the inode. + */ +void +xfs_iflush_done( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip, *n; + LIST_HEAD(flushed_inodes); + LIST_HEAD(ail_updates); /* - * We only want to pull the item from the AIL if it is - * actually there and its location in the log has not - * changed since we started the flush. Thus, we only bother - * if the ili_logged flag is set and the inode's lsn has not - * changed. First we check the lsn outside - * the lock since it's cheaper, and then we recheck while - * holding the lock before removing the inode from the AIL. + * Pull the attached inodes from the buffer one at a time and take the + * appropriate action on them. */ - if (need_ail) { - bool mlip_changed = false; - - /* this is an opencoded batch version of xfs_trans_ail_delete */ - spin_lock(&ailp->ail_lock); - list_for_each_entry(blip, &tmp, li_bio_list) { - if (INODE_ITEM(blip)->ili_logged && - blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) - mlip_changed |= xfs_ail_delete_one(ailp, blip); - else { - xfs_clear_li_failed(blip); - } - } + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) - xlog_assign_tail_lsn_locked(ailp->ail_mount); - if (list_empty(&ailp->ail_head)) - wake_up_all(&ailp->ail_empty); + if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) { + xfs_iflush_abort(iip->ili_inode); + continue; } - spin_unlock(&ailp->ail_lock); + if (!iip->ili_last_fields) + continue; - if (mlip_changed) - xfs_log_space_wake(ailp->ail_mount); + /* Do an unlocked check for needing the AIL lock. */ + if (iip->ili_flush_lsn == lip->li_lsn || + test_bit(XFS_LI_FAILED, &lip->li_flags)) + list_move_tail(&lip->li_bio_list, &ail_updates); + else + list_move_tail(&lip->li_bio_list, &flushed_inodes); } - /* - * clean up and unlock the flush lock now we are done. We can clear the - * ili_last_fields bits now that we know that the data corresponding to - * them is safely on disk. 
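xfs_iflush_done() above uses a second recurring idiom: walk a shared list with the _safe iterator, move each entry onto one of two private lists, then pay for the expensive lock (the AIL lock) only once while the remaining completions run without it. Stripped of the XFS specifics, the pattern looks like this; it assumes <linux/list.h> semantics, and the classification and processing helpers are placeholders:

struct work_item {
    struct list_head list;
    /* ... payload ... */
};

static void
triage_completions(struct list_head *incoming)
{
    struct work_item *it, *n;
    LIST_HEAD(needs_lock);
    LIST_HEAD(lock_free);

    /* _safe variant because entries are unlinked while walking */
    list_for_each_entry_safe(it, n, incoming, list) {
        if (item_needs_global_lock(it))
            list_move_tail(&it->list, &needs_lock);
        else
            list_move_tail(&it->list, &lock_free);
    }

    process_under_lock(&needs_lock);    /* one lock round trip */
    process_lock_free(&lock_free);
}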
- */ - list_for_each_entry_safe(blip, n, &tmp, li_bio_list) { - list_del_init(&blip->li_bio_list); - iip = INODE_ITEM(blip); - iip->ili_logged = 0; - iip->ili_last_fields = 0; - xfs_ifunlock(iip->ili_inode); + if (!list_empty(&ail_updates)) { + xfs_iflush_ail_updates(bp->b_mount->m_ail, &ail_updates); + list_splice_tail(&ail_updates, &flushed_inodes); } - list_del(&tmp); + + xfs_iflush_finish(bp, &flushed_inodes); + if (!list_empty(&flushed_inodes)) + list_splice_tail(&flushed_inodes, &bp->b_li_list); } /* @@ -780,42 +762,37 @@ xfs_iflush_done( */ void xfs_iflush_abort( - xfs_inode_t *ip, - bool stale) + struct xfs_inode *ip) { - xfs_inode_log_item_t *iip = ip->i_itemp; + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_buf *bp = NULL; if (iip) { - if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) { - xfs_trans_ail_remove(&iip->ili_item, - stale ? SHUTDOWN_LOG_IO_ERROR : - SHUTDOWN_CORRUPT_INCORE); - } - iip->ili_logged = 0; /* - * Clear the ili_last_fields bits now that we know that the - * data corresponding to them is safely on disk. + * Clear the failed bit before removing the item from the AIL so + * xfs_trans_ail_delete() doesn't try to clear and release the + * buffer attached to the log item before we are done with it. */ - iip->ili_last_fields = 0; + clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); + xfs_trans_ail_delete(&iip->ili_item, 0); + /* * Clear the inode logging fields so no more flushes are * attempted. */ + spin_lock(&iip->ili_lock); + iip->ili_last_fields = 0; iip->ili_fields = 0; iip->ili_fsync_fields = 0; + iip->ili_flush_lsn = 0; + bp = iip->ili_item.li_buf; + iip->ili_item.li_buf = NULL; + list_del_init(&iip->ili_item.li_bio_list); + spin_unlock(&iip->ili_lock); } - /* - * Release the inode's flush lock since we're done with it. - */ xfs_ifunlock(ip); -} - -void -xfs_istale_done( - struct xfs_buf *bp, - struct xfs_log_item *lip) -{ - xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true); + if (bp) + xfs_buf_rele(bp); } /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 07a60e74c39c..048b5e7dee90 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -13,28 +13,38 @@ struct xfs_bmbt_rec; struct xfs_inode; struct xfs_mount; -typedef struct xfs_inode_log_item { +struct xfs_inode_log_item { struct xfs_log_item ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode ptr */ - xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ - xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ - unsigned short ili_lock_flags; /* lock flags */ - unsigned short ili_logged; /* flushed logged data */ + unsigned short ili_lock_flags; /* inode lock flags */ + /* + * The ili_lock protects the interactions between the dirty state and + * the flush state of the inode log item. This allows us to do atomic + * modifications of multiple state fields without having to hold a + * specific inode lock to serialise them. + * + * We need atomic changes between inode dirtying, inode flushing and + * inode completion, but these all hold different combinations of + * ILOCK and iflock and hence we need some other method of serialising + * updates to the flush state. 
+ */ + spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ unsigned int ili_fsync_fields; /* logged since last fsync */ -} xfs_inode_log_item_t; + xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ +}; -static inline int xfs_inode_clean(xfs_inode_t *ip) +static inline int xfs_inode_clean(struct xfs_inode *ip) { return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL); } extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); -extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); -extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); -extern void xfs_iflush_abort(struct xfs_inode *, bool); +extern void xfs_iflush_done(struct xfs_buf *); +extern void xfs_iflush_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c new file mode 100644 index 000000000000..5e0d291835b3 --- /dev/null +++ b/fs/xfs/xfs_inode_item_recover.c @@ -0,0 +1,394 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_trace.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_log.h" +#include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_icache.h" +#include "xfs_bmap_btree.h" + +STATIC void +xlog_recover_inode_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr; + + xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, + &xfs_inode_buf_ra_ops); + } else { + struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr; + + xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, + &xfs_inode_buf_ra_ops); + } +} + +/* + * Inode fork owner changes + * + * If we have been told that we have to reparent the inode fork, it's because an + * extent swap operation on a CRC enabled filesystem has been done and we are + * replaying it. We need to walk the BMBT of the appropriate fork and change the + * owners of it. + * + * The complexity here is that we don't have an inode context to work with, so + * after we've replayed the inode we need to instantiate one. This is where the + * fun begins. + * + * We are in the middle of log recovery, so we can't run transactions. That + * means we cannot use cache coherent inode instantiation via xfs_iget(), as + * that will result in the corresponding iput() running the inode through + * xfs_inactive(). If we've just replayed an inode core that changes the link + * count to zero (i.e. it's been unlinked), then xfs_inactive() will run + * transactions (bad!). + * + * So, to avoid this, we instantiate an inode directly from the inode core we've + * just recovered. We have the buffer still locked, and all we really need to + * instantiate is the inode core and the forks being modified. 
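Stepping back to the ili_lock added to struct xfs_inode_log_item above: its point is that the flush side can hand off dirty state atomically without holding the ILOCK or the flush lock. Under that design the flush path is expected to do, roughly, the following (a sketch of the idiom, not a verbatim quote of the flush code):

    spin_lock(&iip->ili_lock);
    iip->ili_last_fields = iip->ili_fields; /* what this flush covers */
    iip->ili_fields = 0;                    /* inode is clean again */
    iip->ili_fsync_fields = 0;
    iip->ili_flush_lsn = iip->ili_item.li_lsn;
    spin_unlock(&iip->ili_lock);

Completion (xfs_iflush_finish() above) then clears ili_last_fields under the same lock, which is what makes its clean/dirty decision safe.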
We can do this + * manually, then run the inode btree owner change, and then tear down the + * xfs_inode without having to run any transactions at all. + * + * Also, because we don't have a transaction context available here and we + * need to gather all the buffers we modify for writeback, we pass the + * buffer_list to the operation for it to use instead. + */ + +STATIC int +xfs_recover_inode_owner_change( + struct xfs_mount *mp, + struct xfs_dinode *dip, + struct xfs_inode_log_format *in_f, + struct list_head *buffer_list) +{ + struct xfs_inode *ip; + int error; + + ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); + + ip = xfs_inode_alloc(mp, in_f->ilf_ino); + if (!ip) + return -ENOMEM; + + /* instantiate the inode */ + ASSERT(dip->di_version >= 3); + + error = xfs_inode_from_disk(ip, dip); + if (error) + goto out_free_ip; + + if (in_f->ilf_fields & XFS_ILOG_DOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + + if (in_f->ilf_fields & XFS_ILOG_AOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + +out_free_ip: + xfs_inode_free(ip); + return error; +} + +STATIC int +xlog_recover_inode_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + struct xfs_inode_log_format *in_f; + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp; + struct xfs_dinode *dip; + int len; + char *src; + char *dest; + int error; + int attr_index; + uint fields; + struct xfs_log_dinode *ldip; + uint isize; + int need_free = 0; + + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + in_f = item->ri_buf[0].i_addr; + } else { + in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); + need_free = 1; + error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); + if (error) + goto error; + } + + /* + * Inode buffers can be freed; look out for that case + * and do not replay the inode. + */ + if (xlog_is_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len)) { + error = 0; + trace_xfs_log_recover_inode_cancel(log, in_f); + goto error; + } + trace_xfs_log_recover_inode_recover(log, in_f); + + error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, + 0, &bp, &xfs_inode_buf_ops); + if (error) + goto error; + ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); + dip = xfs_buf_offset(bp, in_f->ilf_boffset); + + /* + * Make sure the place we're flushing out to really looks + * like an inode! + */ + if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { + xfs_alert(mp, + "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", + __func__, dip, bp, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + ldip = item->ri_buf[1].i_addr; + if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { + xfs_alert(mp, + "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", + __func__, item, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + + /* + * If the inode has an LSN in it, recover the inode only if it's less + * than the lsn of the transaction we are replaying. Note: we still + * need to replay an owner change even though the inode is more recent + * than the transaction as there is no guarantee that all the btree + * blocks are more recent than this transaction, too. 
+ */ + if (dip->di_version >= 3) { + xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_owner_change; + } + } + + /* + * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes + * are transactional and if ordering is necessary we can determine that + * more accurately from the LSN field in the V3 inode core. Don't trust + * the inode versions, as we might be changing them here; use the + * superblock flag to determine whether we need to look at di_flushiter + * to skip replay when the on disk inode is newer than the log one. + */ + if (!xfs_sb_version_has_v3inode(&mp->m_sb) && + ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * Deal with the wrap case: an on-disk value of DI_MAX_FLUSH + * counts as older than the small values that follow a wrap. + */ + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && + ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { + /* do nothing */ + } else { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_release; + } + } + + /* Take the opportunity to reset the flush iteration count */ + ldip->di_flushiter = 0; + + if (unlikely(S_ISREG(ldip->di_mode))) { + if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && + (ldip->di_format != XFS_DINODE_FMT_BTREE)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad regular inode log record, rec ptr "PTR_FMT", " + "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + } else if (unlikely(S_ISDIR(ldip->di_mode))) { + if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && + (ldip->di_format != XFS_DINODE_FMT_BTREE) && + (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad dir inode log record, rec ptr "PTR_FMT", " + "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + } + if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " + "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", + __func__, item, dip, bp, in_f->ilf_ino, + ldip->di_nextents + ldip->di_anextents, + ldip->di_nblocks); + error = -EFSCORRUPTED; + goto out_release; + } + if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " + "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, + item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); + error = -EFSCORRUPTED; + goto out_release; + } + isize = xfs_log_dinode_size(mp); + if (unlikely(item->ri_buf[1].i_len > isize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad inode log record length %d, rec ptr "PTR_FMT, + __func__, item->ri_buf[1].i_len, item); + error = -EFSCORRUPTED; + goto out_release; + } + + /* recover the log dinode into the on disk inode */ + xfs_log_dinode_to_disk(ldip, dip); + + fields = in_f->ilf_fields; + if (fields & 
XFS_ILOG_DEV) + xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); + + if (in_f->ilf_size == 2) + goto out_owner_change; + len = item->ri_buf[2].i_len; + src = item->ri_buf[2].i_addr; + ASSERT(in_f->ilf_size <= 4); + ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); + ASSERT(!(fields & XFS_ILOG_DFORK) || + (len == in_f->ilf_dsize)); + + switch (fields & XFS_ILOG_DFORK) { + case XFS_ILOG_DDATA: + case XFS_ILOG_DEXT: + memcpy(XFS_DFORK_DPTR(dip), src, len); + break; + + case XFS_ILOG_DBROOT: + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, + (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip), + XFS_DFORK_DSIZE(dip, mp)); + break; + + default: + /* + * There are no data fork flags set. + */ + ASSERT((fields & XFS_ILOG_DFORK) == 0); + break; + } + + /* + * If we logged any attribute data, recover it. There may or + * may not have been any other non-core data logged in this + * transaction. + */ + if (in_f->ilf_fields & XFS_ILOG_AFORK) { + if (in_f->ilf_fields & XFS_ILOG_DFORK) { + attr_index = 3; + } else { + attr_index = 2; + } + len = item->ri_buf[attr_index].i_len; + src = item->ri_buf[attr_index].i_addr; + ASSERT(len == in_f->ilf_asize); + + switch (in_f->ilf_fields & XFS_ILOG_AFORK) { + case XFS_ILOG_ADATA: + case XFS_ILOG_AEXT: + dest = XFS_DFORK_APTR(dip); + ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); + memcpy(dest, src, len); + break; + + case XFS_ILOG_ABROOT: + dest = XFS_DFORK_APTR(dip); + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, + len, (struct xfs_bmdr_block *)dest, + XFS_DFORK_ASIZE(dip, mp)); + break; + + default: + xfs_warn(log->l_mp, "%s: Invalid flag", __func__); + ASSERT(0); + error = -EFSCORRUPTED; + goto out_release; + } + } + +out_owner_change: + /* Recover the swapext owner change unless inode has been deleted */ + if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) && + (dip->di_mode != 0)) + error = xfs_recover_inode_owner_change(mp, dip, in_f, + buffer_list); + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + + ASSERT(bp->b_mount == mp); + bp->b_flags |= _XBF_LOGRECOVERY; + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: + xfs_buf_relse(bp); +error: + if (need_free) + kmem_free(in_f); + return error; +} + +const struct xlog_recover_item_ops xlog_inode_item_ops = { + .item_type = XFS_LI_INODE, + .ra_pass2 = xlog_recover_inode_ra_pass2, + .commit_pass2 = xlog_recover_inode_commit_pass2, +}; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d42de92cb283..6f22a66777cd 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -35,6 +35,8 @@ #include "xfs_health.h" #include "xfs_reflink.h" #include "xfs_ioctl.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include <linux/mount.h> #include <linux/namei.h> @@ -292,62 +294,173 @@ xfs_readlink_by_handle( return error; } -STATIC int -xfs_attrlist_by_handle( - struct file *parfilp, - void __user *arg) +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. 
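The xlog_inode_item_ops table registered at the end of the new file is half of a dispatch mechanism: recovery matches each log item's type code against a table of per-type operations and invokes the pass-specific hook. The lookup below is a guess at the shape, for illustration only; the real registry and walk live in the log recovery core:

extern const struct xlog_recover_item_ops xlog_inode_item_ops;

/* hypothetical NULL-terminated registry of per-item-type ops */
static const struct xlog_recover_item_ops *item_ops_registry[] = {
    &xlog_inode_item_ops,
    /* ... buffer, dquot, intent item ops ... */
    NULL,
};

static const struct xlog_recover_item_ops *
find_item_ops(uint16_t item_type)
{
    int i;

    for (i = 0; item_ops_registry[i]; i++)
        if (item_ops_registry[i]->item_type == item_type)
            return item_ops_registry[i];
    return NULL;    /* unknown type: recovery must fail */
}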
+ */ +static void +xfs_ioc_attr_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen) { - int error = -ENOMEM; - attrlist_cursor_kern_t *cursor; - struct xfs_fsop_attrlist_handlereq __user *p = arg; - xfs_fsop_attrlist_handlereq_t al_hreq; - struct dentry *dentry; - char *kbuf; + struct xfs_attrlist *alist = context->buffer; + struct xfs_attrlist_ent *aep; + int arraytop; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) - return -EFAULT; - if (al_hreq.buflen < sizeof(struct attrlist) || - al_hreq.buflen > XFS_XATTR_LIST_MAX) + ASSERT(!context->seen_enough); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*alist)); + ASSERT(context->firstu <= context->bufsize); + + /* + * Only list entries in the right namespace. + */ + if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK)) + return; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); + + /* decrement by the actual bytes used by the attr */ + context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) + + namelen + 1, sizeof(uint32_t)); + if (context->firstu < arraytop) { + trace_xfs_attr_list_full(context); + alist->al_more = 1; + context->seen_enough = 1; + return; + } + + aep = context->buffer + context->firstu; + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; + trace_xfs_attr_list_add(context); +} + +static unsigned int +xfs_attr_filter( + u32 ioc_flags) +{ + if (ioc_flags & XFS_IOC_ATTR_ROOT) + return XFS_ATTR_ROOT; + if (ioc_flags & XFS_IOC_ATTR_SECURE) + return XFS_ATTR_SECURE; + return 0; +} + +static unsigned int +xfs_attr_flags( + u32 ioc_flags) +{ + if (ioc_flags & XFS_IOC_ATTR_CREATE) + return XATTR_CREATE; + if (ioc_flags & XFS_IOC_ATTR_REPLACE) + return XATTR_REPLACE; + return 0; +} + +int +xfs_ioc_attr_list( + struct xfs_inode *dp, + void __user *ubuf, + int bufsize, + int flags, + struct xfs_attrlist_cursor __user *ucursor) +{ + struct xfs_attr_list_context context = { }; + struct xfs_attrlist *alist; + void *buffer; + int error; + + if (bufsize < sizeof(struct xfs_attrlist) || + bufsize > XFS_XATTR_LIST_MAX) return -EINVAL; /* * Reject flags, only allow namespaces. */ - if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) + return -EINVAL; + if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) return -EINVAL; - dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); + /* + * Validate the cursor. + */ + if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor))) + return -EFAULT; + if (context.cursor.pad1 || context.cursor.pad2) + return -EINVAL; + if (!context.cursor.initted && + (context.cursor.hashval || context.cursor.blkno || + context.cursor.offset)) + return -EINVAL; - kbuf = kmem_zalloc_large(al_hreq.buflen, 0); - if (!kbuf) - goto out_dput; + buffer = kmem_zalloc_large(bufsize, 0); + if (!buffer) + return -ENOMEM; - cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, - al_hreq.flags, cursor); + /* + * Initialize the output buffer. 
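xfs_ioc_attr_put_listent() above fills one buffer from both ends: the header and the al_offset[] array grow up from the start, the entries are packed downward from the end, and firstu tracks the low-water mark; when the two regions would meet, al_more is set and listing stops. A self-contained userspace model of that layout (struct names simplified, 4-byte alignment as in the real code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct alist_hdr {
    uint32_t count;     /* entries written so far */
    uint32_t more;      /* set when the buffer filled up */
    uint32_t offset[];  /* per-entry offsets, growing upward */
};

#define BUFSZ 4096
static char buffer[BUFSZ];      /* header zeroed by static init */
static uint32_t firstu = BUFSZ; /* entries grow down from here */

/* Returns 0 on success, -1 once the two regions would collide. */
static int
put_entry(const char *name, uint32_t valuelen)
{
    struct alist_hdr *hdr = (struct alist_hdr *)buffer;
    uint32_t namelen = (uint32_t)strlen(name);
    /* valuelen field + name + NUL, rounded up to 4 bytes */
    uint32_t esize = (sizeof(valuelen) + namelen + 1 + 3) & ~3u;
    uint32_t arraytop = sizeof(*hdr) +
                        (hdr->count + 1) * sizeof(uint32_t);

    if (firstu < arraytop + esize) {
        hdr->more = 1;
        return -1;
    }
    firstu -= esize;
    memcpy(buffer + firstu, &valuelen, sizeof(valuelen));
    memcpy(buffer + firstu + sizeof(valuelen), name, namelen + 1);
    hdr->offset[hdr->count++] = firstu;
    return 0;
}

int main(void)
{
    put_entry("user.comment", 11);
    put_entry("user.origin", 4);
    printf("count=%u firstu=%u\n",
           ((struct alist_hdr *)buffer)->count, firstu);
    return 0;
}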
+ */ + context.dp = dp; + context.resynch = 1; + context.attr_filter = xfs_attr_filter(flags); + context.buffer = buffer; + context.bufsize = round_down(bufsize, sizeof(uint32_t)); + context.firstu = context.bufsize; + context.put_listent = xfs_ioc_attr_put_listent; + + alist = context.buffer; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; + + error = xfs_attr_list(&context); if (error) - goto out_kfree; + goto out_free; - if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) { + if (copy_to_user(ubuf, buffer, bufsize) || + copy_to_user(ucursor, &context.cursor, sizeof(context.cursor))) error = -EFAULT; - goto out_kfree; - } +out_free: + kmem_free(buffer); + return error; +} - if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) - error = -EFAULT; +STATIC int +xfs_attrlist_by_handle( + struct file *parfilp, + struct xfs_fsop_attrlist_handlereq __user *p) +{ + struct xfs_fsop_attrlist_handlereq al_hreq; + struct dentry *dentry; + int error = -ENOMEM; -out_kfree: - kmem_free(kbuf); -out_dput: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&al_hreq, p, sizeof(al_hreq))) + return -EFAULT; + + dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer, + al_hreq.buflen, al_hreq.flags, &p->pos); dput(dentry); return error; } -int +static int xfs_attrmulti_attr_get( struct inode *inode, unsigned char *name, @@ -355,31 +468,33 @@ xfs_attrmulti_attr_get( uint32_t *len, uint32_t flags) { - unsigned char *kbuf; - int error = -EFAULT; - size_t namelen; + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = xfs_attr_filter(flags), + .attr_flags = xfs_attr_flags(flags), + .name = name, + .namelen = strlen(name), + .valuelen = *len, + }; + int error; if (*len > XFS_XATTR_SIZE_MAX) return -EINVAL; - kbuf = kmem_zalloc_large(*len, 0); - if (!kbuf) - return -ENOMEM; - namelen = strlen(name); - error = xfs_attr_get(XFS_I(inode), name, namelen, &kbuf, (int *)len, - flags); + error = xfs_attr_get(&args); if (error) goto out_kfree; - if (copy_to_user(ubuf, kbuf, *len)) + *len = args.valuelen; + if (copy_to_user(ubuf, args.value, args.valuelen)) error = -EFAULT; out_kfree: - kmem_free(kbuf); + kmem_free(args.value); return error; } -int +static int xfs_attrmulti_attr_set( struct inode *inode, unsigned char *name, @@ -387,42 +502,75 @@ xfs_attrmulti_attr_set( uint32_t len, uint32_t flags) { - unsigned char *kbuf; + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = xfs_attr_filter(flags), + .attr_flags = xfs_attr_flags(flags), + .name = name, + .namelen = strlen(name), + }; int error; - size_t namelen; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; - if (len > XFS_XATTR_SIZE_MAX) - return -EINVAL; - kbuf = memdup_user(ubuf, len); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + if (ubuf) { + if (len > XFS_XATTR_SIZE_MAX) + return -EINVAL; + args.value = memdup_user(ubuf, len); + if (IS_ERR(args.value)) + return PTR_ERR(args.value); + args.valuelen = len; + } - namelen = strlen(name); - error = xfs_attr_set(XFS_I(inode), name, namelen, kbuf, len, flags); - if (!error) - xfs_forget_acl(inode, name, flags); - kfree(kbuf); + error = xfs_attr_set(&args); + if (!error && (flags & XFS_IOC_ATTR_ROOT)) + xfs_forget_acl(inode, name); + kfree(args.value); return error; } int -xfs_attrmulti_attr_remove( +xfs_ioc_attrmulti_one( + struct file *parfilp, struct inode *inode, - unsigned char *name, + 
uint32_t opcode, + void __user *uname, + void __user *value, + uint32_t *len, uint32_t flags) { + unsigned char *name; int error; - size_t namelen; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - namelen = strlen(name); - error = xfs_attr_remove(XFS_I(inode), name, namelen, flags); - if (!error) - xfs_forget_acl(inode, name, flags); + if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE)) + return -EINVAL; + + name = strndup_user(uname, MAXNAMELEN); + if (IS_ERR(name)) + return PTR_ERR(name); + + switch (opcode) { + case ATTR_OP_GET: + error = xfs_attrmulti_attr_get(inode, name, value, len, flags); + break; + case ATTR_OP_REMOVE: + value = NULL; + *len = 0; + /* fall through */ + case ATTR_OP_SET: + error = mnt_want_write_file(parfilp); + if (error) + break; + error = xfs_attrmulti_attr_set(inode, name, value, *len, flags); + mnt_drop_write_file(parfilp); + break; + default: + error = -EINVAL; + break; + } + + kfree(name); return error; } @@ -436,7 +584,6 @@ xfs_attrmulti_by_handle( xfs_fsop_attrmulti_handlereq_t am_hreq; struct dentry *dentry; unsigned int i, size; - unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -462,63 +609,17 @@ xfs_attrmulti_by_handle( goto out_dput; } - error = -ENOMEM; - attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); - if (!attr_name) - goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { - if ((ops[i].am_flags & ATTR_ROOT) && - (ops[i].am_flags & ATTR_SECURE)) { - ops[i].am_error = -EINVAL; - continue; - } - ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; - - ops[i].am_error = strncpy_from_user((char *)attr_name, - ops[i].am_attrname, MAXNAMELEN); - if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; - if (ops[i].am_error < 0) - break; - - switch (ops[i].am_opcode) { - case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get( - d_inode(dentry), attr_name, - ops[i].am_attrvalue, &ops[i].am_length, - ops[i].am_flags); - break; - case ATTR_OP_SET: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_set( - d_inode(dentry), attr_name, - ops[i].am_attrvalue, ops[i].am_length, - ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_remove( - d_inode(dentry), attr_name, - ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - default: - ops[i].am_error = -EINVAL; - } + ops[i].am_error = xfs_ioc_attrmulti_one(parfilp, + d_inode(dentry), ops[i].am_opcode, + ops[i].am_attrname, ops[i].am_attrvalue, + &ops[i].am_length, ops[i].am_flags); } if (copy_to_user(am_hreq.ops, ops, size)) error = -EFAULT; - kfree(attr_name); - out_kfree_ops: kfree(ops); out_dput: dput(dentry); @@ -974,13 +1075,18 @@ xfs_merge_ioc_xflags( xflags |= FS_XFLAG_NODUMP; else xflags &= ~FS_XFLAG_NODUMP; + if (flags & FS_DAX_FL) + xflags |= FS_XFLAG_DAX; + else + xflags &= ~FS_XFLAG_DAX; return xflags; } STATIC unsigned int xfs_di2lxflags( - uint16_t di_flags) + uint16_t di_flags, + uint64_t di_flags2) { unsigned int flags = 0; @@ -994,6 +1100,9 @@ xfs_di2lxflags( flags |= FS_NOATIME_FL; if (di_flags & XFS_DIFLAG_NODUMP) flags |= FS_NODUMP_FL; + if (di_flags2 & XFS_DIFLAG2_DAX) { + flags |= FS_DAX_FL; + } return flags; } @@ -1003,26 +1112,17 @@ xfs_fill_fsxattr( bool attr, struct fsxattr *fa) { + struct xfs_ifork *ifp = attr ? 
ip->i_afp : &ip->i_df; + simple_fill_fsxattr(fa, xfs_ip2xflags(ip)); fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; fa->fsx_cowextsize = ip->i_d.di_cowextsize << ip->i_mount->m_sb.sb_blocklog; fa->fsx_projid = ip->i_d.di_projid; - - if (attr) { - if (ip->i_afp) { - if (ip->i_afp->if_flags & XFS_IFEXTENTS) - fa->fsx_nextents = xfs_iext_count(ip->i_afp); - else - fa->fsx_nextents = ip->i_d.di_anextents; - } else - fa->fsx_nextents = 0; - } else { - if (ip->i_df.if_flags & XFS_IFEXTENTS) - fa->fsx_nextents = xfs_iext_count(&ip->i_df); - else - fa->fsx_nextents = ip->i_d.di_nextents; - } + if (ifp && (ifp->if_flags & XFS_IFEXTENTS)) + fa->fsx_nextents = xfs_iext_count(ifp); + else + fa->fsx_nextents = xfs_ifork_nextents(ifp); } STATIC int @@ -1100,37 +1200,6 @@ xfs_flags2diflags2( return di_flags2; } -STATIC void -xfs_diflags_to_linux( - struct xfs_inode *ip) -{ - struct inode *inode = VFS_I(ip); - unsigned int xflags = xfs_ip2xflags(ip); - - if (xflags & FS_XFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (xflags & FS_XFLAG_APPEND) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (xflags & FS_XFLAG_SYNC) - inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (xflags & FS_XFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; -#if 0 /* disabled until the flag switching races are sorted out */ - if (xflags & FS_XFLAG_DAX) - inode->i_flags |= S_DAX; - else - inode->i_flags &= ~S_DAX; -#endif -} - static int xfs_ioctl_setattr_xflags( struct xfs_trans *tp, @@ -1141,7 +1210,7 @@ xfs_ioctl_setattr_xflags( uint64_t di_flags2; /* Can't change realtime flag if any extents are allocated. */ - if ((ip->i_d.di_nextents || ip->i_delayed_blks) && + if ((ip->i_df.if_nextents || ip->i_delayed_blks) && XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) return -EINVAL; @@ -1162,77 +1231,39 @@ xfs_ioctl_setattr_xflags( /* diflags2 only valid for v3 inodes. */ di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); - if (di_flags2 && ip->i_d.di_version < 3) + if (di_flags2 && !xfs_sb_version_has_v3inode(&mp->m_sb)) return -EINVAL; ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); ip->i_d.di_flags2 = di_flags2; - xfs_diflags_to_linux(ip); + xfs_diflags_to_iflags(ip, false); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); return 0; } -/* - * If we are changing DAX flags, we have to ensure the file is clean and any - * cached objects in the address space are invalidated and removed. This - * requires us to lock out other IO and page faults similar to a truncate - * operation. The locks need to be held until the transaction has been committed - * so that the cache invalidation is atomic with respect to the DAX flag - * manipulation. - */ -static int -xfs_ioctl_setattr_dax_invalidate( +static void +xfs_ioctl_setattr_prepare_dax( struct xfs_inode *ip, - struct fsxattr *fa, - int *join_flags) + struct fsxattr *fa) { - struct inode *inode = VFS_I(ip); - struct super_block *sb = inode->i_sb; - int error; - - *join_flags = 0; - - /* - * It is only valid to set the DAX flag on regular files and - * directories on filesystems where the block size is equal to the page - * size. On directories it serves as an inherited hint so we don't - * have to check the device for dax support or flush pagecache. 
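The simplified xfs_fill_fsxattr() above relies on the fork helpers tolerating a missing attribute fork (ip->i_afp can be NULL when the inode has none). Our reading of the helper this series introduces is the following sketch; treat it as an assumption rather than a quote:

/* assumed semantics: a fork that does not exist has zero extents */
static inline xfs_extnum_t
xfs_ifork_nextents(struct xfs_ifork *ifp)
{
    if (!ifp)
        return 0;
    return ifp->if_nextents;
}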
- */ - if (fa->fsx_xflags & FS_XFLAG_DAX) { - struct xfs_buftarg *target = xfs_inode_buftarg(ip); - - if (!bdev_dax_supported(target->bt_bdev, sb->s_blocksize)) - return -EINVAL; - } - - /* If the DAX state is not changing, we have nothing to do here. */ - if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode)) - return 0; - if (!(fa->fsx_xflags & FS_XFLAG_DAX) && !IS_DAX(inode)) - return 0; + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); if (S_ISDIR(inode->i_mode)) - return 0; + return; - /* lock, flush and invalidate mapping in preparation for flag change */ - xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL); - error = filemap_write_and_wait(inode->i_mapping); - if (error) - goto out_unlock; - error = invalidate_inode_pages2(inode->i_mapping); - if (error) - goto out_unlock; - - *join_flags = XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL; - return 0; - -out_unlock: - xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL); - return error; + if ((mp->m_flags & XFS_MOUNT_DAX_ALWAYS) || + (mp->m_flags & XFS_MOUNT_DAX_NEVER)) + return; + if (((fa->fsx_xflags & FS_XFLAG_DAX) && + !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) || + (!(fa->fsx_xflags & FS_XFLAG_DAX) && + (ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))) + d_mark_dontcache(inode); } /* @@ -1240,17 +1271,10 @@ out_unlock: * have permission to do so. On success, return a clean transaction and the * inode locked exclusively ready for further operation specific checks. On * failure, return an error without modifying or locking the inode. - * - * The inode might already be IO locked on call. If this is the case, it is - * indicated in @join_flags and we take full responsibility for ensuring they - * are unlocked from now on. Hence if we have an error here, we still have to - * unlock them. Otherwise, once they are joined to the transaction, they will - * be unlocked on commit/cancel. */ static struct xfs_trans * xfs_ioctl_setattr_get_trans( - struct xfs_inode *ip, - int join_flags) + struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -1267,8 +1291,7 @@ xfs_ioctl_setattr_get_trans( goto out_unlock; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags); - join_flags = 0; + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * CAP_FOWNER overrides the following restrictions: @@ -1289,8 +1312,6 @@ xfs_ioctl_setattr_get_trans( out_cancel: xfs_trans_cancel(tp); out_unlock: - if (join_flags) - xfs_iunlock(ip, join_flags); return ERR_PTR(error); } @@ -1319,7 +1340,7 @@ xfs_ioctl_setattr_check_extsize( xfs_extlen_t size; xfs_fsblock_t extsize_fsb; - if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents && + if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) return -EINVAL; @@ -1372,8 +1393,7 @@ xfs_ioctl_setattr_check_cowextsize( if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)) return 0; - if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) || - ip->i_d.di_version != 3) + if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb)) return -EINVAL; if (fa->fsx_cowextsize == 0) @@ -1413,11 +1433,9 @@ xfs_ioctl_setattr( struct fsxattr old_fa; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; - struct xfs_dquot *udqp = NULL; struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; int code; - int join_flags = 0; trace_xfs_ioctl_setattr(ip); @@ -1434,25 +1452,16 @@ xfs_ioctl_setattr( * because the i_*dquot fields will get updated anyway. 
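The four-way condition at the end of xfs_ioctl_setattr_prepare_dax() above simply asks whether the requested DAX state differs from the on-disk state. An equivalent form, normalised to booleans, may be easier to verify (same fields, same effect):

    bool want_dax = (fa->fsx_xflags & FS_XFLAG_DAX) != 0;
    bool have_dax = (ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) != 0;

    if (want_dax != have_dax)
        d_mark_dontcache(inode);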
*/ if (XFS_IS_QUOTA_ON(mp)) { - code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, - ip->i_d.di_gid, fa->fsx_projid, - XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); + code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, + VFS_I(ip)->i_gid, fa->fsx_projid, + XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp); if (code) return code; } - /* - * Changing DAX config may require inode locking for mapping - * invalidation. These need to be held all the way to transaction commit - * or cancel time, so need to be passed through to - * xfs_ioctl_setattr_get_trans() so it can apply them to the join call - * appropriately. - */ - code = xfs_ioctl_setattr_dax_invalidate(ip, fa, &join_flags); - if (code) - goto error_free_dquots; + xfs_ioctl_setattr_prepare_dax(ip, fa); - tp = xfs_ioctl_setattr_get_trans(ip, join_flags); + tp = xfs_ioctl_setattr_get_trans(ip); if (IS_ERR(tp)) { code = PTR_ERR(tp); goto error_free_dquots; @@ -1460,7 +1469,7 @@ xfs_ioctl_setattr( if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && ip->i_d.di_projid != fa->fsx_projid) { - code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp, + code = xfs_qm_vop_chown_reserve(tp, ip, NULL, NULL, pdqp, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (code) /* out of quota */ goto error_trans_cancel; @@ -1501,7 +1510,6 @@ xfs_ioctl_setattr( olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } - ASSERT(ip->i_d.di_version > 1); ip->i_d.di_projid = fa->fsx_projid; } @@ -1514,7 +1522,7 @@ xfs_ioctl_setattr( ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; else ip->i_d.di_extsize = 0; - if (ip->i_d.di_version == 3 && + if (xfs_sb_version_has_v3inode(&mp->m_sb) && (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) ip->i_d.di_cowextsize = fa->fsx_cowextsize >> mp->m_sb.sb_blocklog; @@ -1527,7 +1535,6 @@ xfs_ioctl_setattr( * Release any dquot(s) the inode had kept before chown. */ xfs_qm_dqrele(olddquot); - xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); return code; @@ -1535,7 +1542,6 @@ xfs_ioctl_setattr( error_trans_cancel: xfs_trans_cancel(tp); error_free_dquots: - xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); return code; } @@ -1567,7 +1573,7 @@ xfs_ioc_getxflags( { unsigned int flags; - flags = xfs_di2lxflags(ip->i_d.di_flags); + flags = xfs_di2lxflags(ip->i_d.di_flags, ip->i_d.di_flags2); if (copy_to_user(arg, &flags, sizeof(flags))) return -EFAULT; return 0; @@ -1583,7 +1589,6 @@ xfs_ioc_setxflags( struct fsxattr fa; struct fsxattr old_fa; unsigned int flags; - int join_flags = 0; int error; if (copy_from_user(&flags, arg, sizeof(flags))) @@ -1591,7 +1596,7 @@ xfs_ioc_setxflags( if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NOATIME_FL | FS_NODUMP_FL | \ - FS_SYNC_FL)) + FS_SYNC_FL | FS_DAX_FL)) return -EOPNOTSUPP; fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); @@ -1600,18 +1605,9 @@ xfs_ioc_setxflags( if (error) return error; - /* - * Changing DAX config may require inode locking for mapping - * invalidation. These need to be held all the way to transaction commit - * or cancel time, so need to be passed through to - * xfs_ioctl_setattr_get_trans() so it can apply them to the join call - * appropriately. 
- */ - error = xfs_ioctl_setattr_dax_invalidate(ip, &fa, &join_flags); - if (error) - goto out_drop_write; + xfs_ioctl_setattr_prepare_dax(ip, &fa); - tp = xfs_ioctl_setattr_get_trans(ip, join_flags); + tp = xfs_ioctl_setattr_get_trans(ip); if (IS_ERR(tp)) { error = PTR_ERR(tp); goto out_drop_write; @@ -1983,6 +1979,41 @@ out: return error; } +static inline int +xfs_fs_eofblocks_from_user( + struct xfs_fs_eofblocks *src, + struct xfs_eofblocks *dst) +{ + if (src->eof_version != XFS_EOFBLOCKS_VERSION) + return -EINVAL; + + if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) + return -EINVAL; + + if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || + memchr_inv(src->pad64, 0, sizeof(src->pad64))) + return -EINVAL; + + dst->eof_flags = src->eof_flags; + dst->eof_prid = src->eof_prid; + dst->eof_min_file_size = src->eof_min_file_size; + + dst->eof_uid = INVALID_UID; + if (src->eof_flags & XFS_EOF_FLAGS_UID) { + dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); + if (!uid_valid(dst->eof_uid)) + return -EINVAL; + } + + dst->eof_gid = INVALID_GID; + if (src->eof_flags & XFS_EOF_FLAGS_GID) { + dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); + if (!gid_valid(dst->eof_gid)) + return -EINVAL; + } + return 0; +} + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. @@ -2264,7 +2295,10 @@ xfs_file_ioctl( if (error) return error; - return xfs_icache_free_eofblocks(mp, &keofb); + sb_start_write(mp->m_super); + error = xfs_icache_free_eofblocks(mp, &keofb); + sb_end_write(mp->m_super); + return error; } default: diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 420bd95dc326..bab6a5a92407 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -6,6 +6,11 @@ #ifndef __XFS_IOCTL_H__ #define __XFS_IOCTL_H__ +struct xfs_bstat; +struct xfs_ibulk; +struct xfs_inogrp; + + extern int xfs_ioc_space( struct file *filp, @@ -30,27 +35,11 @@ xfs_readlink_by_handle( struct file *parfilp, xfs_fsop_handlereq_t *hreq); -extern int -xfs_attrmulti_attr_get( - struct inode *inode, - unsigned char *name, - unsigned char __user *ubuf, - uint32_t *len, - uint32_t flags); - -extern int -xfs_attrmulti_attr_set( - struct inode *inode, - unsigned char *name, - const unsigned char __user *ubuf, - uint32_t len, - uint32_t flags); - -extern int -xfs_attrmulti_attr_remove( - struct inode *inode, - unsigned char *name, - uint32_t flags); +int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode, + uint32_t opcode, void __user *uname, void __user *value, + uint32_t *len, uint32_t flags); +int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, int bufsize, + int flags, struct xfs_attrlist_cursor __user *ucursor); extern struct dentry * xfs_handle_to_dentry( @@ -70,10 +59,6 @@ xfs_file_compat_ioctl( unsigned int cmd, unsigned long arg); -struct xfs_ibulk; -struct xfs_bstat; -struct xfs_inogrp; - int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq, const struct xfs_bulkstat *bstat); int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 769581a79c58..c1771e728117 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -352,56 +352,24 @@ xfs_compat_handlereq_to_dentry( STATIC int xfs_compat_attrlist_by_handle( struct file *parfilp, - void __user *arg) + compat_xfs_fsop_attrlist_handlereq_t __user *p) { - int error; - attrlist_cursor_kern_t *cursor; - compat_xfs_fsop_attrlist_handlereq_t __user *p = arg; 
compat_xfs_fsop_attrlist_handlereq_t al_hreq; struct dentry *dentry; - char *kbuf; + int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&al_hreq, arg, - sizeof(compat_xfs_fsop_attrlist_handlereq_t))) + if (copy_from_user(&al_hreq, p, sizeof(al_hreq))) return -EFAULT; - if (al_hreq.buflen < sizeof(struct attrlist) || - al_hreq.buflen > XFS_XATTR_LIST_MAX) - return -EINVAL; - - /* - * Reject flags, only allow namespaces. - */ - if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) - return -EINVAL; dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); - error = -ENOMEM; - kbuf = kmem_zalloc_large(al_hreq.buflen, 0); - if (!kbuf) - goto out_dput; - - cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, - al_hreq.flags, cursor); - if (error) - goto out_kfree; - - if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) { - error = -EFAULT; - goto out_kfree; - } - - if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) - error = -EFAULT; - -out_kfree: - kmem_free(kbuf); -out_dput: + error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), + compat_ptr(al_hreq.buffer), al_hreq.buflen, + al_hreq.flags, &p->pos); dput(dentry); return error; } @@ -416,7 +384,6 @@ xfs_compat_attrmulti_by_handle( compat_xfs_fsop_attrmulti_handlereq_t am_hreq; struct dentry *dentry; unsigned int i, size; - unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -443,64 +410,18 @@ xfs_compat_attrmulti_by_handle( goto out_dput; } - error = -ENOMEM; - attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); - if (!attr_name) - goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { - if ((ops[i].am_flags & ATTR_ROOT) && - (ops[i].am_flags & ATTR_SECURE)) { - ops[i].am_error = -EINVAL; - continue; - } - ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; - - ops[i].am_error = strncpy_from_user((char *)attr_name, + ops[i].am_error = xfs_ioc_attrmulti_one(parfilp, + d_inode(dentry), ops[i].am_opcode, compat_ptr(ops[i].am_attrname), - MAXNAMELEN); - if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; - if (ops[i].am_error < 0) - break; - - switch (ops[i].am_opcode) { - case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get( - d_inode(dentry), attr_name, - compat_ptr(ops[i].am_attrvalue), - &ops[i].am_length, ops[i].am_flags); - break; - case ATTR_OP_SET: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_set( - d_inode(dentry), attr_name, - compat_ptr(ops[i].am_attrvalue), - ops[i].am_length, ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_remove( - d_inode(dentry), attr_name, - ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - default: - ops[i].am_error = -EINVAL; - } + compat_ptr(ops[i].am_attrvalue), + &ops[i].am_length, ops[i].am_flags); } if (copy_to_user(compat_ptr(am_hreq.ops), ops, size)) error = -EFAULT; - kfree(attr_name); - out_kfree_ops: kfree(ops); out_dput: dput(dentry); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index bb590a267a7f..3abb8b9d6f4c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -293,11 +293,11 @@ out_trans_cancel: STATIC bool xfs_quota_need_throttle( - struct xfs_inode *ip, - int type, - xfs_fsblock_t alloc_blocks) + struct xfs_inode *ip, + 
xfs_dqtype_t type, + xfs_fsblock_t alloc_blocks) { - struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); if (!dq || !xfs_this_quota_on(ip->i_mount, type)) return false; @@ -307,7 +307,7 @@ xfs_quota_need_throttle( return false; /* under the lo watermark, no throttle */ - if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark) + if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark) return false; return true; @@ -315,24 +315,24 @@ xfs_quota_need_throttle( STATIC void xfs_quota_calc_throttle( - struct xfs_inode *ip, - int type, - xfs_fsblock_t *qblocks, - int *qshift, - int64_t *qfreesp) + struct xfs_inode *ip, + xfs_dqtype_t type, + xfs_fsblock_t *qblocks, + int *qshift, + int64_t *qfreesp) { - int64_t freesp; - int shift = 0; - struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + int64_t freesp; + int shift = 0; /* no dq, or over hi wmark, squash the prealloc completely */ - if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { + if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) { *qblocks = 0; *qfreesp = 0; return; } - freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount; + freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved; if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) { shift = 2; if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT]) @@ -352,22 +352,10 @@ xfs_quota_calc_throttle( } /* - * If we are doing a write at the end of the file and there are no allocations - * past this one, then extend the allocation out to the file system's write - * iosize. - * * If we don't have a user specified preallocation size, dynamically increase * the preallocation size as the size of the file grows. Cap the maximum size * at a single extent or less if the filesystem is near full. The closer the - * filesystem is to full, the smaller the maximum prealocation. - * - * As an exception we don't do any preallocation at all if the file is smaller - * than the minimum preallocation and we are using the default dynamic - * preallocation scheme, as it is likely this is the only write to the file that - * is going to be done. - * - * We clean up any extra space left over when the file is closed in - * xfs_inactive(). + * filesystem is to being full, the smaller the maximum preallocation. */ STATIC xfs_fsblock_t xfs_iomap_prealloc_size( @@ -377,63 +365,70 @@ xfs_iomap_prealloc_size( loff_t count, struct xfs_iext_cursor *icur) { + struct xfs_iext_cursor ncur = *icur; + struct xfs_bmbt_irec prev, got; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); - struct xfs_bmbt_irec prev; - int shift = 0; int64_t freesp; xfs_fsblock_t qblocks; - int qshift = 0; xfs_fsblock_t alloc_blocks = 0; + xfs_extlen_t plen; + int shift = 0; + int qshift = 0; - if (offset + count <= XFS_ISIZE(ip)) - return 0; - - if (!(mp->m_flags & XFS_MOUNT_ALLOCSIZE) && - (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks))) + /* + * As an exception we don't do any preallocation at all if the file is + * smaller than the minimum preallocation and we are using the default + * dynamic preallocation scheme, as it is likely this is the only write + * to the file that is going to be done. 
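An aside on the throttling code above: xfs_quota_calc_throttle() turns the remaining quota headroom (q_prealloc_hi_wmark minus q_blk.reserved) into a right-shift that is later applied to the preallocation size, growing by 2 at each of the 5%, 3% and 1% low-space thresholds. A standalone sketch of that arithmetic, with invented threshold values standing in for dq->q_low_space[]:

#include <stdint.h>
#include <stdio.h>

/* invented low-space thresholds standing in for dq->q_low_space[] */
static const int64_t low_5pct = 500, low_3pct = 300, low_1pct = 100;

static int throttle_shift(int64_t freesp)
{
	int shift = 0;

	if (freesp < low_5pct) {
		shift = 2;
		if (freesp < low_3pct)
			shift += 2;
		if (freesp < low_1pct)
			shift += 2;
	}
	return shift;	/* prealloc size is later reduced by >> shift */
}

int main(void)
{
	printf("%d %d %d %d\n", throttle_shift(600),	/* 0 */
	       throttle_shift(400),			/* 2 */
	       throttle_shift(200),			/* 4 */
	       throttle_shift(50));			/* 6 */
	return 0;
}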
+ */ + if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks)) return 0; /* - * If an explicit allocsize is set, the file is small, or we - * are writing behind a hole, then use the minimum prealloc: + * Use the minimum preallocation size for small files or if we are + * writing right after a hole. */ - if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) || - XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || - !xfs_iext_peek_prev_extent(ifp, icur, &prev) || + if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || + !xfs_iext_prev_extent(ifp, &ncur, &prev) || prev.br_startoff + prev.br_blockcount < offset_fsb) return mp->m_allocsize_blocks; /* - * Determine the initial size of the preallocation. We are beyond the - * current EOF here, but we need to take into account whether this is - * a sparse write or an extending write when determining the - * preallocation size. Hence we need to look up the extent that ends - * at the current write offset and use the result to determine the - * preallocation size. - * - * If the extent is a hole, then preallocation is essentially disabled. - * Otherwise we take the size of the preceding data extent as the basis - * for the preallocation size. If the size of the extent is greater than - * half the maximum extent length, then use the current offset as the - * basis. This ensures that for large files the preallocation size - * always extends to MAXEXTLEN rather than falling short due to things - * like stripe unit/width alignment of real extents. + * Take the size of the preceding data extents as the basis for the + * preallocation size. Note that we don't care if the previous extents + * are written or not. */ - if (prev.br_blockcount <= (MAXEXTLEN >> 1)) - alloc_blocks = prev.br_blockcount << 1; - else + plen = prev.br_blockcount; + while (xfs_iext_prev_extent(ifp, &ncur, &got)) { + if (plen > MAXEXTLEN / 2 || + isnullstartblock(got.br_startblock) || + got.br_startoff + got.br_blockcount != prev.br_startoff || + got.br_startblock + got.br_blockcount != prev.br_startblock) + break; + plen += got.br_blockcount; + prev = got; + } + + /* + * If the size of the extents is greater than half the maximum extent + * length, then use the current offset as the basis. This ensures that + * for large files the preallocation size always extends to MAXEXTLEN + * rather than falling short due to things like stripe unit/width + * alignment of real extents. + */ + alloc_blocks = plen * 2; + if (alloc_blocks > MAXEXTLEN) alloc_blocks = XFS_B_TO_FSB(mp, offset); - if (!alloc_blocks) - goto check_writeio; qblocks = alloc_blocks; /* * MAXEXTLEN is not a power of two value but we round the prealloc down * to the nearest power of two value after throttling. To prevent the - * round down from unconditionally reducing the maximum supported prealloc - * size, we round up first, apply appropriate throttling, round down and - * cap the value to MAXEXTLEN. + * round down from unconditionally reducing the maximum supported + * prealloc size, we round up first, apply appropriate throttling, + * round down and cap the value to MAXEXTLEN. */ alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), alloc_blocks); @@ -455,14 +450,14 @@ xfs_iomap_prealloc_size( * Check each quota to cap the prealloc size, provide a shift value to * throttle with and adjust amount of available space. 
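The rounding order preserved by this hunk matters because MAXEXTLEN is not a power of two: the size is rounded up to a power of two before throttling and only rounded down afterwards, so the final round-down cannot unconditionally shrink the maximum prealloc. A user-space model of the sequence, with DEMO_MAXEXTLEN as a stand-in value:

#include <stdint.h>
#include <stdio.h>

#define DEMO_MAXEXTLEN	2097151u /* stand-in; deliberately not a power of two */

static uint64_t roundup_pow2(uint64_t x)
{
	uint64_t p = 1;

	while (p < x)
		p <<= 1;
	return p;
}

static uint64_t rounddown_pow2(uint64_t x)
{
	uint64_t p = 1;

	while (p <= x)
		p <<= 1;
	return p >> 1;
}

/* same order of operations as the hunk: round up, throttle, round down */
static uint64_t demo_prealloc(uint64_t alloc_blocks, int shift)
{
	uint64_t cap = roundup_pow2(DEMO_MAXEXTLEN);

	if (alloc_blocks > cap)
		alloc_blocks = cap;
	alloc_blocks >>= shift;
	if (alloc_blocks)
		alloc_blocks = rounddown_pow2(alloc_blocks);
	return alloc_blocks;
}

int main(void)
{
	/* a huge request is clamped to the rounded-up MAXEXTLEN and survives
	 * the final round-down intact; throttling shifts it first */
	printf("%llu %llu\n",
	       (unsigned long long)demo_prealloc(1ull << 30, 0), /* 2097152 */
	       (unsigned long long)demo_prealloc(1ull << 30, 4)); /* 131072 */
	return 0;
}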
*/ - if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift, + if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift, &freesp); - if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift, + if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift, &freesp); - if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift, + if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift, &freesp); /* @@ -494,7 +489,6 @@ xfs_iomap_prealloc_size( */ while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; -check_writeio: if (alloc_blocks < mp->m_allocsize_blocks) alloc_blocks = mp->m_allocsize_blocks; trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, @@ -563,7 +557,7 @@ xfs_iomap_write_unwritten( xfs_trans_ijoin(tp, ip, 0); error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, - XFS_QMOPT_RES_REGBLKS); + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES); if (error) goto error_on_bmapi_transaction; @@ -856,7 +850,7 @@ xfs_buffered_write_iomap_begin( xfs_ilock(ip, XFS_ILOCK_EXCL); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, XFS_DATA_FORK)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { error = -EFSCORRUPTED; goto out_unlock; @@ -871,7 +865,7 @@ xfs_buffered_write_iomap_begin( } /* - * Search the data fork fork first to look up our source mapping. We + * Search the data fork first to look up our source mapping. We * always need the data fork map, as we have to return it to the * iomap code so that the higher level write code can read data in to * perform read-modify-write cycles for unaligned writes. @@ -961,9 +955,16 @@ xfs_buffered_write_iomap_begin( if (error) goto out_unlock; - if (eof) { - prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, offset, - count, &icur); + if (eof && offset + count > XFS_ISIZE(ip)) { + /* + * Determine the initial size of the preallocation. + * We clean up any extra preallocation when the file is closed. + */ + if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) + prealloc_blocks = mp->m_allocsize_blocks; + else + prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, + offset, count, &icur); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; @@ -1258,12 +1259,12 @@ xfs_xattr_iomap_begin( lockmode = xfs_ilock_attr_map_shared(ip); /* if there are no attribute fork or extents, return ENOENT */ - if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { + if (!XFS_IFORK_Q(ip) || !ip->i_afp->if_nextents) { error = -ENOENT; goto out_unlock; } - ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL); + ASSERT(ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 81f2f93caec0..80a13c8561d8 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -22,17 +22,17 @@ #include "xfs_iomap.h" #include "xfs_error.h" -#include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/security.h> #include <linux/iversion.h> +#include <linux/fiemap.h> /* - * Directories have different lock order w.r.t. 
mmap_sem compared to regular + * Directories have different lock order w.r.t. mmap_lock compared to regular * files. This is due to readdir potentially triggering page faults on a user * buffer inside filldir(), and this happens with the ilock on the directory * held. For regular files, the lock order is the other way around - the - * mmap_sem is taken during the page fault, and then we lock the ilock to do + * mmap_lock is taken during the page fault, and then we lock the ilock to do * block mapping. Hence we need a different class for the directory ilock so * that lockdep can tell them apart. */ @@ -50,10 +50,15 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, - strlen(xattr->name), - xattr->value, xattr->value_len, - ATTR_SECURE); + struct xfs_da_args args = { + .dp = ip, + .attr_filter = XFS_ATTR_SECURE, + .name = xattr->name, + .namelen = strlen(xattr->name), + .value = xattr->value, + .valuelen = xattr->value_len, + }; + error = xfs_attr_set(&args); if (error < 0) break; } @@ -553,7 +558,7 @@ xfs_vn_getattr( stat->blocks = XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); - if (ip->i_d.di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = ip->i_d.di_crtime; @@ -692,9 +697,7 @@ xfs_setattr_nonsize( */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); - error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid), - xfs_kgid_to_gid(gid), - ip->i_d.di_projid, + error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid, qflags, &udqp, &gdqp, NULL); if (error) return error; @@ -736,12 +739,7 @@ xfs_setattr_nonsize( if (error) /* out of quota */ goto out_cancel; } - } - /* - * Change file ownership. Must be the owner or privileged. - */ - if (mask & (ATTR_UID|ATTR_GID)) { /* * CAP_FSETID overrides the following restrictions: * @@ -763,7 +761,6 @@ xfs_setattr_nonsize( olddquot1 = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - ip->i_d.di_uid = xfs_kuid_to_uid(uid); inode->i_uid = uid; } if (!gid_eq(igid, gid)) { @@ -775,7 +772,6 @@ xfs_setattr_nonsize( olddquot2 = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - ip->i_d.di_gid = xfs_kgid_to_gid(gid); inode->i_gid = gid; } } @@ -877,7 +873,7 @@ xfs_setattr_size( /* * Short circuit the truncate case for zero length files. */ - if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { + if (newsize == 0 && oldsize == 0 && ip->i_df.if_nextents == 0) { if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) return 0; @@ -1243,13 +1239,12 @@ xfs_inode_supports_dax( { struct xfs_mount *mp = ip->i_mount; - /* Only supported on non-reflinked files. */ - if (!S_ISREG(VFS_I(ip)->i_mode) || xfs_is_reflink_inode(ip)) + /* Only supported on regular files. */ + if (!S_ISREG(VFS_I(ip)->i_mode)) return false; - /* DAX mount option or DAX iflag must be set. */ - if (!(mp->m_flags & XFS_MOUNT_DAX) && - !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) + /* Only supported on non-reflinked files. 
*/ + if (xfs_is_reflink_inode(ip)) return false; /* Block size must match page size */ @@ -1260,26 +1255,51 @@ xfs_inode_supports_dax( return xfs_inode_buftarg(ip)->bt_daxdev != NULL; } -STATIC void +static bool +xfs_inode_should_enable_dax( + struct xfs_inode *ip) +{ + if (!IS_ENABLED(CONFIG_FS_DAX)) + return false; + if (ip->i_mount->m_flags & XFS_MOUNT_DAX_NEVER) + return false; + if (!xfs_inode_supports_dax(ip)) + return false; + if (ip->i_mount->m_flags & XFS_MOUNT_DAX_ALWAYS) + return true; + if (ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + return true; + return false; +} + +void xfs_diflags_to_iflags( - struct inode *inode, - struct xfs_inode *ip) + struct xfs_inode *ip, + bool init) { - uint16_t flags = ip->i_d.di_flags; - - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | - S_NOATIME | S_DAX); - - if (flags & XFS_DIFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - if (flags & XFS_DIFLAG_APPEND) - inode->i_flags |= S_APPEND; - if (flags & XFS_DIFLAG_SYNC) - inode->i_flags |= S_SYNC; - if (flags & XFS_DIFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - if (xfs_inode_supports_dax(ip)) - inode->i_flags |= S_DAX; + struct inode *inode = VFS_I(ip); + unsigned int xflags = xfs_ip2xflags(ip); + unsigned int flags = 0; + + ASSERT(!(IS_DAX(inode) && init)); + + if (xflags & FS_XFLAG_IMMUTABLE) + flags |= S_IMMUTABLE; + if (xflags & FS_XFLAG_APPEND) + flags |= S_APPEND; + if (xflags & FS_XFLAG_SYNC) + flags |= S_SYNC; + if (xflags & FS_XFLAG_NOATIME) + flags |= S_NOATIME; + if (init && xfs_inode_should_enable_dax(ip)) + flags |= S_DAX; + + /* + * S_DAX can only be set during inode initialization and is never set by + * the VFS, so we cannot mask off S_DAX in i_flags. + */ + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME); + inode->i_flags |= flags; } /* @@ -1304,11 +1324,8 @@ xfs_setup_inode( /* make the inode look hashed for the writeback code */ inode_fake_hash(inode); - inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); - inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); - i_size_write(inode, ip->i_d.di_size); - xfs_diflags_to_iflags(inode, ip); + xfs_diflags_to_iflags(ip, true); if (S_ISDIR(inode->i_mode)) { /* diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 4b31c29b7e6b..16ca97a7ff00 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -86,8 +86,8 @@ xfs_bulkstat_one_int( */ buf->bs_projectid = ip->i_d.di_projid; buf->bs_ino = ino; - buf->bs_uid = dic->di_uid; - buf->bs_gid = dic->di_gid; + buf->bs_uid = i_uid_read(inode); + buf->bs_gid = i_gid_read(inode); buf->bs_size = dic->di_size; buf->bs_nlink = inode->i_nlink; @@ -104,18 +104,18 @@ xfs_bulkstat_one_int( buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize_blks = dic->di_extsize; - buf->bs_extents = dic->di_nextents; + buf->bs_extents = xfs_ifork_nextents(&ip->i_df); xfs_bulkstat_health(ip, buf); - buf->bs_aextents = dic->di_anextents; + buf->bs_aextents = xfs_ifork_nextents(ip->i_afp); buf->bs_forkoff = XFS_IFORK_BOFF(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; - if (dic->di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE) buf->bs_cowextsize_blks = dic->di_cowextsize; } - switch (dic->di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_DEV: buf->bs_rdev = sysv_encode_dev(inode->i_rdev); buf->bs_blksize = BLKDEV_IOSIZE; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 8738bb03f253..ab737fed7b12 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -60,6 +60,7 @@ typedef __u32 xfs_nlink_t; #include 
<linux/list_sort.h> #include <linux/ratelimit.h> #include <linux/rhashtable.h> +#include <linux/xattr.h> #include <asm/page.h> #include <asm/div64.h> @@ -101,12 +102,8 @@ typedef __u32 xfs_nlink_t; #define xfs_cowb_secs xfs_params.cowb_timer.val #define current_cpu() (raw_smp_processor_id()) -#define current_pid() (current->pid) -#define current_test_flags(f) (current->flags & (f)) #define current_set_flags_nested(sp, f) \ (*(sp) = current->flags, current->flags |= (f)) -#define current_clear_flags_nested(sp, f) \ - (*(sp) = current->flags, current->flags &= ~(f)) #define current_restore_flags_nested(sp, f) \ (current->flags = ((current->flags & ~(f)) | (*(sp) & (f)))) @@ -163,32 +160,6 @@ struct xstats { extern struct xstats xfsstats; -/* Kernel uid/gid conversion. These are used to convert to/from the on disk - * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally. - * The conversion here is type only, the value will remain the same since we - * are converting to the init_user_ns. The uid is later mapped to a particular - * user namespace value when crossing the kernel/user boundary. - */ -static inline uint32_t xfs_kuid_to_uid(kuid_t uid) -{ - return from_kuid(&init_user_ns, uid); -} - -static inline kuid_t xfs_uid_to_kuid(uint32_t uid) -{ - return make_kuid(&init_user_ns, uid); -} - -static inline uint32_t xfs_kgid_to_gid(kgid_t gid) -{ - return from_kgid(&init_user_ns, gid); -} - -static inline kgid_t xfs_gid_to_kgid(uint32_t gid) -{ - return make_kgid(&init_user_ns, gid); -} - static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev) { return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f6006d94a581..ad0c69ee8947 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -24,13 +24,6 @@ kmem_zone_t *xfs_log_ticket_zone; /* Local miscellaneous function prototypes */ -STATIC int -xlog_commit_record( - struct xlog *log, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp); - STATIC struct xlog * xlog_alloc_log( struct xfs_mount *mp, @@ -47,8 +40,7 @@ xlog_dealloc_log( /* local state machine functions */ STATIC void xlog_state_done_syncing( - struct xlog_in_core *iclog, - bool aborted); + struct xlog_in_core *iclog); STATIC int xlog_state_get_iclog_space( struct xlog *log, @@ -63,23 +55,10 @@ xlog_state_switch_iclogs( struct xlog_in_core *iclog, int eventual_size); STATIC void -xlog_state_want_sync( - struct xlog *log, - struct xlog_in_core *iclog); - -STATIC void xlog_grant_push_ail( struct xlog *log, int need_bytes); STATIC void -xlog_regrant_reserve_log_space( - struct xlog *log, - struct xlog_ticket *ticket); -STATIC void -xlog_ungrant_log_space( - struct xlog *log, - struct xlog_ticket *ticket); -STATIC void xlog_sync( struct xlog *log, struct xlog_in_core *iclog); @@ -454,7 +433,7 @@ xfs_log_reserve( XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent); *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt @@ -484,73 +463,6 @@ out_error: return error; } - -/* - * NOTES: - * - * 1. currblock field gets updated at startup and after in-core logs - * marked as with WANT_SYNC. - */ - -/* - * This routine is called when a user of a log manager ticket is done with - * the reservation. 
If the ticket was ever used, then a commit record for - * the associated transaction is written out as a log operation header with - * no data. The flag XLOG_TIC_INITED is set when the first write occurs with - * a given ticket. If the ticket was one with a permanent reservation, then - * a few operations are done differently. Permanent reservation tickets by - * default don't release the reservation. They just commit the current - * transaction with the belief that the reservation is still needed. A flag - * must be passed in before permanent reservations are actually released. - * When these type of tickets are not released, they need to be set into - * the inited state again. By doing this, a start record will be written - * out when the next write occurs. - */ -xfs_lsn_t -xfs_log_done( - struct xfs_mount *mp, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - bool regrant) -{ - struct xlog *log = mp->m_log; - xfs_lsn_t lsn = 0; - - if (XLOG_FORCED_SHUTDOWN(log) || - /* - * If nothing was ever written, don't write out commit record. - * If we get an error, just continue and give back the log ticket. - */ - (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(log, ticket, iclog, &lsn)))) { - lsn = (xfs_lsn_t) -1; - regrant = false; - } - - - if (!regrant) { - trace_xfs_log_done_nonperm(log, ticket); - - /* - * Release ticket if not permanent reservation or a specific - * request has been made to release a permanent reservation. - */ - xlog_ungrant_log_space(log, ticket); - } else { - trace_xfs_log_done_perm(log, ticket); - - xlog_regrant_reserve_log_space(log, ticket); - /* If this ticket was a permanent reservation and we aren't - * trying to release it, reset the inited flags; so next time - * we write, a start record will be written out. - */ - ticket->t_flags |= XLOG_TIC_INITED; - } - - xfs_log_ticket_put(ticket); - return lsn; -} - static bool __xlog_state_release_iclog( struct xlog *log, @@ -597,26 +509,21 @@ xlog_state_release_iclog( return 0; } -int +void xfs_log_release_iclog( - struct xfs_mount *mp, struct xlog_in_core *iclog) { - struct xlog *log = mp->m_log; - bool sync; - - if (iclog->ic_state == XLOG_STATE_IOERROR) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - return -EIO; - } + struct xlog *log = iclog->ic_log; + bool sync = false; if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) { - sync = __xlog_state_release_iclog(log, iclog); + if (iclog->ic_state != XLOG_STATE_IOERROR) + sync = __xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); - if (sync) - xlog_sync(log, iclog); } - return 0; + + if (sync) + xlog_sync(log, iclog); } /* @@ -855,32 +762,69 @@ xfs_log_mount_cancel( } /* - * Final log writes as part of unmount. - * - * Mark the filesystem clean as unmount happens. Note that during relocation - * this routine needs to be executed as part of source-bag while the - * deallocation must not be done until source-end. + * Wait for the iclog to be written to disk, or return an error if the log has been + * shut down. */ +static int +xlog_wait_on_iclog( + struct xlog_in_core *iclog) + __releases(iclog->ic_log->l_icloglock) +{ + struct xlog *log = iclog->ic_log; -/* Actually write the unmount record to disk. 
*/ -static void -xfs_log_write_unmount_record( - struct xfs_mount *mp) + if (!XLOG_FORCED_SHUTDOWN(log) && + iclog->ic_state != XLOG_STATE_ACTIVE && + iclog->ic_state != XLOG_STATE_DIRTY) { + XFS_STATS_INC(log->l_mp, xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + return 0; +} + +/* + * Write out an unmount record using the ticket provided. We have to account for + * the data space used in the unmount ticket as this write is not done from a + * transaction context that has already done the accounting for us. + */ +static int +xlog_write_unmount_record( + struct xlog *log, + struct xlog_ticket *ticket, + xfs_lsn_t *lsn, + uint flags) { - /* the data section must be 32 bit size aligned */ - struct xfs_unmount_log_format magic = { + struct xfs_unmount_log_format ulf = { .magic = XLOG_UNMOUNT_TYPE, }; struct xfs_log_iovec reg = { - .i_addr = &magic, - .i_len = sizeof(magic), + .i_addr = &ulf, + .i_len = sizeof(ulf), .i_type = XLOG_REG_TYPE_UNMOUNT, }; struct xfs_log_vec vec = { .lv_niovecs = 1, .lv_iovecp = ®, }; - struct xlog *log = mp->m_log; + + /* account for space used by record data */ + ticket->t_curr_res -= sizeof(ulf); + return xlog_write(log, &vec, ticket, lsn, NULL, flags, false); +} + +/* + * Mark the filesystem clean by writing an unmount record to the head of the + * log. + */ +static void +xlog_unmount_write( + struct xlog *log) +{ + struct xfs_mount *mp = log->l_mp; struct xlog_in_core *iclog; struct xlog_ticket *tic = NULL; xfs_lsn_t lsn; @@ -891,23 +835,7 @@ xfs_log_write_unmount_record( if (error) goto out_err; - /* - * If we think the summary counters are bad, clear the unmount header - * flag in the unmount record so that the summary counters will be - * recalculated during log recovery at next mount. Refer to - * xlog_check_unmount_rec for more details. - */ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { - xfs_alert(mp, "%s: will fix summary counters at next mount", - __func__); - flags &= ~XLOG_UNMOUNT_TRANS; - } - - /* remove inited flag, and account for space used */ - tic->t_flags = 0; - tic->t_curr_res -= sizeof(magic); - error = xlog_write(log, &vec, tic, &lsn, NULL, flags); + error = xlog_write_unmount_record(log, tic, &lsn, flags); /* * At this point, we're umounting anyway, so there's no point in * transitioning log state to IOERROR. Just continue... 
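The new xlog_wait_on_iclog() above sleeps through xlog_wait(), which queues the caller on the iclog's force waitqueue and releases l_icloglock before scheduling. A hedged sketch of that wait-and-drop-lock idiom in kernel style; this approximates the helper in fs/xfs/xfs_log_priv.h and the names here should not be read as the exact implementation:

#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

/* sketch of the wait-and-drop-lock idiom behind xlog_wait() */
static inline void demo_xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
	__releases(lock)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(wq, &wait);	/* queue before unlocking */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	spin_unlock(lock);			/* cannot sleep holding it */
	schedule();				/* woken by wake_up_all() */
	remove_wait_queue(wq, &wait);
}

Queuing on the waitqueue before dropping the spinlock is what closes the race against a wakeup that fires between the unlock and the sleep.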
@@ -919,28 +847,32 @@ out_err: spin_lock(&log->l_icloglock); iclog = log->l_iclog; atomic_inc(&iclog->ic_refcnt); - xlog_state_want_sync(log, iclog); + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, iclog, 0); + else + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR); error = xlog_state_release_iclog(log, iclog); - switch (iclog->ic_state) { - default: - if (!XLOG_FORCED_SHUTDOWN(log)) { - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - break; - } - /* fall through */ - case XLOG_STATE_ACTIVE: - case XLOG_STATE_DIRTY: - spin_unlock(&log->l_icloglock); - break; - } + xlog_wait_on_iclog(iclog); if (tic) { trace_xfs_log_umount_write(log, tic); - xlog_ungrant_log_space(log, tic); - xfs_log_ticket_put(tic); + xfs_log_ticket_ungrant(log, tic); } } +static void +xfs_log_unmount_verify_iclog( + struct xlog *log) +{ + struct xlog_in_core *iclog = log->l_iclog; + + do { + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + ASSERT(iclog->ic_offset == 0); + } while ((iclog = iclog->ic_next) != log->l_iclog); +} + /* * Unmount record used to have a string "Unmount filesystem--" in the * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). @@ -948,16 +880,11 @@ out_err: * currently architecture converted and "Unmount" is a bit foo. * As far as I know, there weren't any dependencies on the old behaviour. */ - -static int -xfs_log_unmount_write(xfs_mount_t *mp) +static void +xfs_log_unmount_write( + struct xfs_mount *mp) { - struct xlog *log = mp->m_log; - xlog_in_core_t *iclog; -#ifdef DEBUG - xlog_in_core_t *first_iclog; -#endif - int error; + struct xlog *log = mp->m_log; /* * Don't write out unmount record on norecovery mounts or ro devices. @@ -966,57 +893,30 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (mp->m_flags & XFS_MOUNT_NORECOVERY || xfs_readonly_buftarg(log->l_targ)) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); - return 0; + return; } - error = xfs_log_force(mp, XFS_LOG_SYNC); - ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); + xfs_log_force(mp, XFS_LOG_SYNC); -#ifdef DEBUG - first_iclog = iclog = log->l_iclog; - do { - if (iclog->ic_state != XLOG_STATE_IOERROR) { - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); - ASSERT(iclog->ic_offset == 0); - } - iclog = iclog->ic_next; - } while (iclog != first_iclog); -#endif - if (! (XLOG_FORCED_SHUTDOWN(log))) { - xfs_log_write_unmount_record(mp); - } else { - /* - * We're already in forced_shutdown mode, couldn't - * even attempt to write out the unmount transaction. - * - * Go through the motions of sync'ing and releasing - * the iclog, even though no I/O will actually happen, - * we need to wait for other log I/Os that may already - * be in progress. Do this as a separate section of - * code so we'll know if we ever get stuck here that - * we're in this odd situation of trying to unmount - * a file system that went into forced_shutdown as - * the result of an unmount.. 
- */ - spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - atomic_inc(&iclog->ic_refcnt); - xlog_state_want_sync(log, iclog); - error = xlog_state_release_iclog(log, iclog); - switch (iclog->ic_state) { - case XLOG_STATE_ACTIVE: - case XLOG_STATE_DIRTY: - case XLOG_STATE_IOERROR: - spin_unlock(&log->l_icloglock); - break; - default: - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - break; - } + if (XLOG_FORCED_SHUTDOWN(log)) + return; + + /* + * If we think the summary counters are bad, avoid writing the unmount + * record to force log recovery at next mount, after which the summary + * counters will be recalculated. Refer to xlog_check_unmount_rec for + * more details. + */ + if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, + XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + xfs_alert(mp, "%s: will fix summary counters at next mount", + __func__); + return; } - return error; -} /* xfs_log_unmount_write */ + xfs_log_unmount_verify_iclog(log); + xlog_unmount_write(log); +} /* * Empty the log for unmount/freeze. @@ -1279,7 +1179,6 @@ xlog_ioend_work( struct xlog_in_core *iclog = container_of(work, struct xlog_in_core, ic_end_io_work); struct xlog *log = iclog->ic_log; - bool aborted = false; int error; error = blk_status_to_errno(iclog->ic_bio.bi_status); @@ -1295,17 +1194,9 @@ xlog_ioend_work( if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); - /* - * This flag will be propagated to the trans-committed - * callback routines to let them know that the log-commit - * didn't succeed. - */ - aborted = true; - } else if (iclog->ic_state == XLOG_STATE_IOERROR) { - aborted = true; } - xlog_state_done_syncing(iclog, aborted); + xlog_state_done_syncing(iclog); bio_uninit(&iclog->ic_bio); /* @@ -1551,20 +1442,17 @@ out: return ERR_PTR(error); } /* xlog_alloc_log */ - /* * Write out the commit record of a transaction associated with the given - * ticket. Return the lsn of the commit record. + * ticket to close off a running log write. Return the lsn of the commit record. */ -STATIC int +int xlog_commit_record( struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp) + xfs_lsn_t *lsn) { - struct xfs_mount *mp = log->l_mp; - int error; struct xfs_log_iovec reg = { .i_addr = NULL, .i_len = 0, @@ -1574,12 +1462,15 @@ xlog_commit_record( .lv_niovecs = 1, .lv_iovecp = ®, }; + int error; - ASSERT_ALWAYS(iclog); - error = xlog_write(log, &vec, ticket, commitlsnp, iclog, - XLOG_COMMIT_TRANS); + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + + error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS, + false); if (error) - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -1739,7 +1630,7 @@ xlog_bio_end_io( &iclog->ic_end_io_work); } -static void +static int xlog_map_iclog_data( struct bio *bio, void *data, @@ -1750,11 +1641,14 @@ xlog_map_iclog_data( unsigned int off = offset_in_page(data); size_t len = min_t(size_t, count, PAGE_SIZE - off); - WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len); + if (bio_add_page(bio, page, len, off) != len) + return -EIO; data += len; count -= len; } while (count); + + return 0; } STATIC void @@ -1784,7 +1678,7 @@ xlog_write_iclog( * the buffer manually, the code needs to be kept in sync * with the I/O completion path. 
*/ - xlog_state_done_syncing(iclog, true); + xlog_state_done_syncing(iclog); up(&iclog->ic_sema); return; } @@ -1794,11 +1688,22 @@ iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; iclog->ic_bio.bi_end_io = xlog_bio_end_io; iclog->ic_bio.bi_private = iclog; - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA; + + /* + * We use REQ_SYNC | REQ_IDLE here to tell the block layer there are more + * IOs coming immediately after this one. This prevents the block layer + * writeback throttle from throttling log writes behind background + * metadata writeback and causing priority inversions. + */ + iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | + REQ_IDLE | REQ_FUA; if (need_flush) iclog->ic_bio.bi_opf |= REQ_PREFLUSH; - xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count); + if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + return; + } if (is_vmalloc_addr(iclog->ic_data)) flush_kernel_vmap_range(iclog->ic_data, count); @@ -2011,7 +1916,7 @@ xlog_dealloc_log( log->l_mp->m_log = NULL; destroy_workqueue(log->l_ioend_workqueue); kmem_free(log); -} /* xlog_dealloc_log */ +} /* * Update counters atomically now that memcpy is done. @@ -2148,23 +2053,21 @@ xlog_print_trans( } /* - * Calculate the potential space needed by the log vector. Each region gets - * its own xlog_op_header_t and may need to be double word aligned. + * Calculate the potential space needed by the log vector. We may need a start + * record, and each region gets its own struct xlog_op_header and may need to be + * double word aligned. */ static int xlog_write_calc_vec_length( struct xlog_ticket *ticket, - struct xfs_log_vec *log_vector) + struct xfs_log_vec *log_vector, + bool need_start_rec) { struct xfs_log_vec *lv; - int headers = 0; + int headers = need_start_rec ? 1 : 0; int len = 0; int i; - /* acct for start rec of xact */ - if (ticket->t_flags & XLOG_TIC_INITED) - headers++; - for (lv = log_vector; lv; lv = lv->lv_next) { /* we don't write ordered log vectors */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) @@ -2186,27 +2089,16 @@ return len; } -/* - * If first write for transaction, insert start record We can't be trying to - * commit if we are inited. We can't have any "partial_copy" if we are inited. 
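Setting the return value change aside, the loop in xlog_map_iclog_data() is the standard walk of a buffer in page-sized chunks: the first chunk is shortened by the buffer's offset within its page, later chunks are at most a page. A user-space model of the chunking arithmetic, with PAGE_SIZE fixed at 4096 for illustration:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096u	/* stand-in for PAGE_SIZE */

/* model of the chunking in xlog_map_iclog_data(): the first chunk is
 * shortened by the start offset within its page, the rest are page-sized */
static void walk_pages(uintptr_t data, size_t count)
{
	do {
		size_t off = data % DEMO_PAGE_SIZE;	/* offset_in_page() */
		size_t len = count < DEMO_PAGE_SIZE - off ?
				count : DEMO_PAGE_SIZE - off;

		/* the kernel would bio_add_page() the page backing 'data'
		 * here, and now fails with -EIO if the bio is full */
		printf("chunk %#lx len %zu\n", (unsigned long)data, len);

		data += len;
		count -= len;
	} while (count);
}

int main(void)
{
	walk_pages(0x1234, 10000);	/* three chunks: 3532, 4096, 2372 */
	return 0;
}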
- */ -static int +static void xlog_write_start_rec( struct xlog_op_header *ophdr, struct xlog_ticket *ticket) { - if (!(ticket->t_flags & XLOG_TIC_INITED)) - return 0; - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); ophdr->oh_clientid = ticket->t_clientid; ophdr->oh_len = 0; ophdr->oh_flags = XLOG_START_TRANS; ophdr->oh_res2 = 0; - - ticket->t_flags &= ~XLOG_TIC_INITED; - - return sizeof(struct xlog_op_header); } static xlog_op_header_t * @@ -2328,7 +2220,11 @@ xlog_write_copy_finish( *record_cnt = 0; *data_cnt = 0; - xlog_state_want_sync(log, iclog); + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, iclog, 0); + else + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR); if (!commit_iclog) goto release_iclog; spin_unlock(&log->l_icloglock); @@ -2391,13 +2287,14 @@ xlog_write( struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, - uint flags) + uint flags, + bool need_start_rec) { struct xlog_in_core *iclog = NULL; - struct xfs_log_iovec *vecp; - struct xfs_log_vec *lv; + struct xfs_log_vec *lv = log_vector; + struct xfs_log_iovec *vecp = lv->lv_iovecp; + int index = 0; int len; - int index; int partial_copy = 0; int partial_copy_len = 0; int contwr = 0; @@ -2405,25 +2302,13 @@ xlog_write( int data_cnt = 0; int error = 0; - *start_lsn = 0; - - len = xlog_write_calc_vec_length(ticket, log_vector); - /* - * Region headers and bytes are already accounted for. - * We only need to take into account start records and - * split regions in this function. + * If this is a commit or unmount transaction, we don't need a start + * record to be written. We do, however, have to account for the + * commit or unmount header that gets written. Hence we always have + * to account for an extra xlog_op_header here. */ - if (ticket->t_flags & XLOG_TIC_INITED) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - - /* - * Commit record headers need to be accounted for. These - * come in as separate writes so are easy to detect. - */ - if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - + ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); @@ -2431,9 +2316,8 @@ xlog_write( xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); } - index = 0; - lv = log_vector; - vecp = lv->lv_iovecp; + len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec); + *start_lsn = 0; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2457,7 +2341,6 @@ xlog_write( while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { struct xfs_log_iovec *reg; struct xlog_op_header *ophdr; - int start_rec_copy; int copy_len; int copy_off; bool ordered = false; @@ -2473,11 +2356,15 @@ xlog_write( ASSERT(reg->i_len % sizeof(int32_t) == 0); ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); - start_rec_copy = xlog_write_start_rec(ptr, ticket); - if (start_rec_copy) { - record_cnt++; + /* + * Before we start formatting log vectors, we need to + * write a start record. Only do this for the first + * iclog we write to. 
+ */ + if (need_start_rec) { + xlog_write_start_rec(ptr, ticket); + xlog_write_adv_cnt(&ptr, &len, &log_offset, - start_rec_copy); + sizeof(struct xlog_op_header)); } ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); @@ -2509,8 +2396,13 @@ xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); } - copy_len += start_rec_copy + sizeof(xlog_op_header_t); + copy_len += sizeof(struct xlog_op_header); record_cnt++; + if (need_start_rec) { + copy_len += sizeof(struct xlog_op_header); + record_cnt++; + need_start_rec = false; + } data_cnt += contwr ? copy_len : 0; error = xlog_write_copy_finish(log, iclog, flags, @@ -2567,119 +2459,106 @@ next_lv: return error; } +static void +xlog_state_activate_iclog( + struct xlog_in_core *iclog, + int *iclogs_changed) +{ + ASSERT(list_empty_careful(&iclog->ic_callbacks)); -/***************************************************************************** - * - * State Machine functions - * - ***************************************************************************** - */ + /* + * If the number of ops in this iclog indicates it just contains the + * dummy transaction, we can change state into IDLE (the second time + * around). Otherwise we should change the state into NEED a dummy. + * We don't need to cover the dummy. + */ + if (*iclogs_changed == 0 && + iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { + *iclogs_changed = 1; + } else { + /* + * We have two dirty iclogs so start over. This could also be + * num of ops indicating this is not the dummy going out. + */ + *iclogs_changed = 2; + } + + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_offset = 0; + iclog->ic_header.h_num_logops = 0; + memset(iclog->ic_header.h_cycle_data, 0, + sizeof(iclog->ic_header.h_cycle_data)); + iclog->ic_header.h_lsn = 0; +} /* - * An iclog has just finished IO completion processing, so we need to update - * the iclog state and propagate that up into the overall log state. Hence we - * prepare the iclog for cleaning, and then clean all the pending dirty iclogs - * starting from the head, and then wake up any threads that are waiting for the - * iclog to be marked clean. - * - * The ordering of marking iclogs ACTIVE must be maintained, so an iclog - * doesn't become ACTIVE beyond one that is SYNCING. This is also required to - * maintain the notion that we use a ordered wait queue to hold off would be - * writers to the log when every iclog is trying to sync to disk. - * - * Caller must hold the icloglock before calling us. - * - * State Change: !IOERROR -> DIRTY -> ACTIVE + * Loop through all iclogs and mark all iclogs currently marked DIRTY as + * ACTIVE after iclog I/O has completed. */ -STATIC void -xlog_state_clean_iclog( +static void +xlog_state_activate_iclogs( struct xlog *log, - struct xlog_in_core *dirty_iclog) + int *iclogs_changed) { - struct xlog_in_core *iclog; - int changed = 0; - - /* Prepare the completed iclog. */ - if (dirty_iclog->ic_state != XLOG_STATE_IOERROR) - dirty_iclog->ic_state = XLOG_STATE_DIRTY; + struct xlog_in_core *iclog = log->l_iclog; - /* Walk all the iclogs to update the ordered active state. */ - iclog = log->l_iclog; do { - if (iclog->ic_state == XLOG_STATE_DIRTY) { - iclog->ic_state = XLOG_STATE_ACTIVE; - iclog->ic_offset = 0; - ASSERT(list_empty_careful(&iclog->ic_callbacks)); - /* - * If the number of ops in this iclog indicate it just - * contains the dummy transaction, we can - * change state into IDLE (the second time around). - * Otherwise we should change the state into - * NEED a dummy. 
- * We don't need to cover the dummy. - */ - if (!changed && - (be32_to_cpu(iclog->ic_header.h_num_logops) == - XLOG_COVER_OPS)) { - changed = 1; - } else { - /* - * We have two dirty iclogs so start over - * This could also be num of ops indicates - * this is not the dummy going out. - */ - changed = 2; - } - iclog->ic_header.h_num_logops = 0; - memset(iclog->ic_header.h_cycle_data, 0, - sizeof(iclog->ic_header.h_cycle_data)); - iclog->ic_header.h_lsn = 0; - } else if (iclog->ic_state == XLOG_STATE_ACTIVE) - /* do nothing */; - else - break; /* stop cleaning */ - iclog = iclog->ic_next; - } while (iclog != log->l_iclog); - + do { + if (iclog->ic_state == XLOG_STATE_DIRTY) + xlog_state_activate_iclog(iclog, iclogs_changed); + /* + * The ordering of marking iclogs ACTIVE must be maintained, so + * an iclog doesn't become ACTIVE beyond one that is SYNCING. + */ + else if (iclog->ic_state != XLOG_STATE_ACTIVE) + break; + } while ((iclog = iclog->ic_next) != log->l_iclog); +} +static int +xlog_covered_state( + int prev_state, + int iclogs_changed) +{ + /* - * Wake up threads waiting in xfs_log_force() for the dirty iclog - * to be cleaned. + * We usually go to NEED. But we go to NEED2 if the change count indicates we + * are done writing the dummy record. If we are done with the second + * dummy record (DONE2), then we go to IDLE. 
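To make the covering progression concrete, here is a standalone rendition of the new xlog_covered_state() logic plus a tiny driver. It mirrors the diff with shortened state names and is a demonstration only, not the kernel source:

#include <stdio.h>

enum cover { IDLE, NEED, DONE, NEED2, DONE2 };

static enum cover covered_state(enum cover prev, int iclogs_changed)
{
	switch (prev) {
	case IDLE:
	case NEED:
	case NEED2:
		break;
	case DONE:
		if (iclogs_changed == 1)	/* first dummy record done */
			return NEED2;
		break;
	case DONE2:
		if (iclogs_changed == 1)	/* second dummy record done */
			return IDLE;
		break;
	}
	return NEED;
}

int main(void)
{
	/* DONE -> NEED2 and DONE2 -> IDLE only when exactly one iclog
	 * (the dummy record) changed; anything else restarts at NEED */
	printf("%d %d %d\n",
	       covered_state(DONE, 1),	/* NEED2 */
	       covered_state(DONE2, 1),	/* IDLE  */
	       covered_state(DONE, 2));	/* NEED  */
	return 0;
}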
- */ - if (changed) { - switch (log->l_covered_state) { - case XLOG_STATE_COVER_IDLE: - case XLOG_STATE_COVER_NEED: - case XLOG_STATE_COVER_NEED2: - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; + return XLOG_STATE_COVER_NEED; +} - case XLOG_STATE_COVER_DONE: - if (changed == 1) - log->l_covered_state = XLOG_STATE_COVER_NEED2; - else - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; +STATIC void +xlog_state_clean_iclog( + struct xlog *log, + struct xlog_in_core *dirty_iclog) +{ + int iclogs_changed = 0; - case XLOG_STATE_COVER_DONE2: - if (changed == 1) - log->l_covered_state = XLOG_STATE_COVER_IDLE; - else - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; + dirty_iclog->ic_state = XLOG_STATE_DIRTY; - default: - ASSERT(0); - } + xlog_state_activate_iclogs(log, &iclogs_changed); + wake_up_all(&dirty_iclog->ic_force_wait); + + if (iclogs_changed) { + log->l_covered_state = xlog_covered_state(log->l_covered_state, + iclogs_changed); } } @@ -2808,8 +2687,7 @@ xlog_state_iodone_process_iclog( static void xlog_state_do_iclog_callbacks( struct xlog *log, - struct xlog_in_core *iclog, - bool aborted) + struct xlog_in_core *iclog) __releases(&log->l_icloglock) __acquires(&log->l_icloglock) { @@ -2821,7 +2699,7 @@ xlog_state_do_iclog_callbacks( list_splice_init(&iclog->ic_callbacks, &tmp); spin_unlock(&iclog->ic_callback_lock); - xlog_cil_process_committed(&tmp, aborted); + xlog_cil_process_committed(&tmp); spin_lock(&iclog->ic_callback_lock); } @@ -2836,8 +2714,7 @@ xlog_state_do_iclog_callbacks( STATIC void xlog_state_do_callback( - struct xlog *log, - bool aborted) + struct xlog *log) { struct xlog_in_core *iclog; struct xlog_in_core *first_iclog; @@ -2878,9 +2755,11 @@ xlog_state_do_callback( * we'll have to run at least one more complete loop. */ cycled_icloglock = true; - xlog_state_do_iclog_callbacks(log, iclog, aborted); - - xlog_state_clean_iclog(log, iclog); + xlog_state_do_iclog_callbacks(log, iclog); + if (XLOG_FORCED_SHUTDOWN(log)) + wake_up_all(&iclog->ic_force_wait); + else + xlog_state_clean_iclog(log, iclog); iclog = iclog->ic_next; } while (first_iclog != iclog); @@ -2916,25 +2795,22 @@ xlog_state_do_callback( */ STATIC void xlog_state_done_syncing( - struct xlog_in_core *iclog, - bool aborted) + struct xlog_in_core *iclog) { struct xlog *log = iclog->ic_log; spin_lock(&log->l_icloglock); - ASSERT(atomic_read(&iclog->ic_refcnt) == 0); /* * If we got an error, either on the first buffer, or in the case of - * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, - * and none should ever be attempted to be written to disk - * again. + * split log writes, on the second, we shut down the file system and + * no iclogs should ever be attempted to be written to disk again. 
*/ - if (iclog->ic_state == XLOG_STATE_SYNCING) + if (!XLOG_FORCED_SHUTDOWN(log)) { + ASSERT(iclog->ic_state == XLOG_STATE_SYNCING); iclog->ic_state = XLOG_STATE_DONE_SYNC; - else - ASSERT(iclog->ic_state == XLOG_STATE_IOERROR); + } /* * Someone could be sleeping prior to writing out the next @@ -2943,9 +2819,8 @@ xlog_state_done_syncing( */ wake_up_all(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); - xlog_state_do_callback(log, aborted); /* also cleans log */ -} /* xlog_state_done_syncing */ - + xlog_state_do_callback(log); +} /* * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must @@ -3064,21 +2939,21 @@ restart: *logoffsetp = log_offset; return 0; -} /* xlog_state_get_iclog_space */ - -/* The first cnt-1 times through here we don't need to - * move the grant write head because the permanent - * reservation has reserved cnt times the unit amount. - * Release part of current permanent unit reservation and - * reset current reservation to be one units worth. Also - * move grant reservation head forward. +} + +/* + * The first cnt-1 times a ticket goes through here we don't need to move the + * grant write head because the permanent reservation has reserved cnt times the + * unit amount. Release part of current permanent unit reservation and reset + * current reservation to be one units worth. Also move grant reservation head + * forward. */ -STATIC void -xlog_regrant_reserve_log_space( +void +xfs_log_ticket_regrant( struct xlog *log, struct xlog_ticket *ticket) { - trace_xfs_log_regrant_reserve_enter(log, ticket); + trace_xfs_log_ticket_regrant(log, ticket); if (ticket->t_cnt > 0) ticket->t_cnt--; @@ -3090,21 +2965,20 @@ xlog_regrant_reserve_log_space( ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); - trace_xfs_log_regrant_reserve_sub(log, ticket); + trace_xfs_log_ticket_regrant_sub(log, ticket); /* just return if we still have some of the pre-reserved space */ - if (ticket->t_cnt > 0) - return; - - xlog_grant_add_space(log, &log->l_reserve_head.grant, - ticket->t_unit_res); - - trace_xfs_log_regrant_reserve_exit(log, ticket); + if (!ticket->t_cnt) { + xlog_grant_add_space(log, &log->l_reserve_head.grant, + ticket->t_unit_res); + trace_xfs_log_ticket_regrant_exit(log, ticket); - ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); -} /* xlog_regrant_reserve_log_space */ + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); + } + xfs_log_ticket_put(ticket); +} /* * Give back the space left from a reservation. @@ -3120,18 +2994,19 @@ xlog_regrant_reserve_log_space( * space, the count will stay at zero and the only space remaining will be * in the current reservation field. 
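For reference, the renamed xfs_log_ticket_ungrant() that follows hands back the unused part of the current reservation plus, for permanent tickets, all remaining whole reservation units. A minimal sketch of that byte accounting; the field names are simplified stand-ins for the xlog_ticket members:

#include <stdbool.h>
#include <stdio.h>

/* simplified stand-ins for the xlog_ticket fields used by ungrant */
struct demo_ticket {
	int curr_res;	/* unused part of the current reservation */
	int unit_res;	/* size of one reservation unit */
	int cnt;	/* whole units still held */
	bool permanent;	/* XLOG_TIC_PERM_RESERV */
};

/* bytes handed back to both grant heads, as in xfs_log_ticket_ungrant() */
static int ungrant_bytes(struct demo_ticket *tic)
{
	int bytes;

	if (tic->cnt > 0)
		tic->cnt--;	/* drop the unit being ungranted now */

	bytes = tic->curr_res;
	if (tic->permanent)
		bytes += tic->unit_res * tic->cnt; /* remaining whole units */
	return bytes;
}

int main(void)
{
	struct demo_ticket t = { .curr_res = 100, .unit_res = 1000,
				 .cnt = 3, .permanent = true };

	printf("%d\n", ungrant_bytes(&t));	/* 100 + 2 * 1000 = 2100 */
	return 0;
}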
*/ -STATIC void -xlog_ungrant_log_space( +void +xfs_log_ticket_ungrant( struct xlog *log, struct xlog_ticket *ticket) { - int bytes; + int bytes; + + trace_xfs_log_ticket_ungrant(log, ticket); if (ticket->t_cnt > 0) ticket->t_cnt--; - trace_xfs_log_ungrant_enter(log, ticket); - trace_xfs_log_ungrant_sub(log, ticket); + trace_xfs_log_ticket_ungrant_sub(log, ticket); /* * If this is a permanent reservation ticket, we may be able to free @@ -3146,17 +3021,15 @@ xlog_ungrant_log_space( xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); - trace_xfs_log_ungrant_exit(log, ticket); + trace_xfs_log_ticket_ungrant_exit(log, ticket); xfs_log_space_wake(log->l_mp); + xfs_log_ticket_put(ticket); } /* - * This routine will mark the current iclog in the ring as WANT_SYNC - * and move the current iclog pointer to the next iclog in the ring. - * When this routine is called from xlog_state_get_iclog_space(), the - * exact size of the iclog has not yet been determined. All we know is - * that every data block. We have run out of space in this log record. + * This routine will mark the current iclog in the ring as WANT_SYNC and move + * the current iclog pointer to the next iclog in the ring. */ STATIC void xlog_state_switch_iclogs( @@ -3165,6 +3038,8 @@ xlog_state_switch_iclogs( int eventual_size) { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + assert_spin_locked(&log->l_icloglock); + if (!eventual_size) eventual_size = iclog->ic_offset; iclog->ic_state = XLOG_STATE_WANT_SYNC; @@ -3199,7 +3074,7 @@ xlog_state_switch_iclogs( } ASSERT(iclog == log->l_iclog); log->l_iclog = iclog->ic_next; -} /* xlog_state_switch_iclogs */ +} /* * Write out all data in the in-core log as of this exact moment in time. @@ -3259,9 +3134,6 @@ xfs_log_force( * previous iclog and go to sleep. */ iclog = iclog->ic_prev; - if (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY) - goto out_unlock; } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { if (atomic_read(&iclog->ic_refcnt) == 0) { /* @@ -3277,8 +3149,7 @@ xfs_log_force( if (xlog_state_release_iclog(log, iclog)) goto out_error; - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn || - iclog->ic_state == XLOG_STATE_DIRTY) + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) goto out_unlock; } else { /* @@ -3298,17 +3169,8 @@ xfs_log_force( ; } - if (!(flags & XFS_LOG_SYNC)) - goto out_unlock; - - if (iclog->ic_state == XLOG_STATE_IOERROR) - goto out_error; - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state == XLOG_STATE_IOERROR) - return -EIO; - return 0; - + if (flags & XFS_LOG_SYNC) + return xlog_wait_on_iclog(iclog); out_unlock: spin_unlock(&log->l_icloglock); return 0; @@ -3339,9 +3201,6 @@ __xfs_log_force_lsn( goto out_unlock; } - if (iclog->ic_state == XLOG_STATE_DIRTY) - goto out_unlock; - if (iclog->ic_state == XLOG_STATE_ACTIVE) { /* * We sleep here if we haven't already slept (e.g. 
this is the @@ -3375,20 +3234,8 @@ __xfs_log_force_lsn( *log_flushed = 1; } - if (!(flags & XFS_LOG_SYNC) || - (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY)) - goto out_unlock; - - if (iclog->ic_state == XLOG_STATE_IOERROR) - goto out_error; - - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state == XLOG_STATE_IOERROR) - return -EIO; - return 0; - + if (flags & XFS_LOG_SYNC) + return xlog_wait_on_iclog(iclog); out_unlock: spin_unlock(&log->l_icloglock); return 0; @@ -3435,33 +3282,6 @@ xfs_log_force_lsn( } /* - * Called when we want to mark the current iclog as being ready to sync to - * disk. - */ -STATIC void -xlog_state_want_sync( - struct xlog *log, - struct xlog_in_core *iclog) -{ - assert_spin_locked(&log->l_icloglock); - - if (iclog->ic_state == XLOG_STATE_ACTIVE) { - xlog_state_switch_iclogs(log, iclog, 0); - } else { - ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state == XLOG_STATE_IOERROR); - } -} - - -/***************************************************************************** - * - * TICKET functions - * - ***************************************************************************** - */ - -/* * Free a used ticket when its refcount falls to zero. */ void @@ -3588,15 +3408,12 @@ xlog_ticket_alloc( int unit_bytes, int cnt, char client, - bool permanent, - xfs_km_flags_t alloc_flags) + bool permanent) { struct xlog_ticket *tic; int unit_res; - tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); - if (!tic) - return NULL; + tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL); unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); @@ -3609,7 +3426,6 @@ xlog_ticket_alloc( tic->t_ocnt = cnt; tic->t_tid = prandom_u32(); tic->t_clientid = client; - tic->t_flags = XLOG_TIC_INITED; if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; @@ -3618,13 +3434,6 @@ xlog_ticket_alloc( return tic; } - -/****************************************************************************** - * - * Log debug routines - * - ****************************************************************************** - */ #if defined(DEBUG) /* * Make sure that the destination ptr is within the valid data region of @@ -3710,7 +3519,7 @@ xlog_verify_tail_lsn( if (blocks < BTOBB(iclog->ic_offset) + 1) xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); } -} /* xlog_verify_tail_lsn */ +} /* * Perform a number of checks on the iclog before writing to disk. 
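With the changes above, both xfs_log_force() and __xfs_log_force_lsn() funnel synchronous waiters through xlog_wait_on_iclog() and return its result. A caller-side sketch using the public prototype from xfs_log.h; the wrapper name here is hypothetical:

/* force everything up to 'lsn' to stable storage and wait for it */
static int demo_force_and_wait(struct xfs_mount *mp, xfs_lsn_t lsn)
{
	int log_flushed = 0;

	/* XFS_LOG_SYNC makes the force wait via xlog_wait_on_iclog() */
	return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
}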
@@ -3813,7 +3622,7 @@ xlog_verify_iclog( } ptr += sizeof(xlog_op_header_t) + op_len; } -} /* xlog_verify_iclog */ +} #endif /* @@ -3937,7 +3746,7 @@ xfs_log_force_umount( spin_lock(&log->l_cilp->xc_push_lock); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); - xlog_state_do_callback(log, true); + xlog_state_do_callback(log); /* return non-zero if log IOERROR transition had already happened */ return retval; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 84e06805160f..1412d6993f1e 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -105,10 +105,6 @@ struct xfs_log_item; struct xfs_item_ops; struct xfs_trans; -xfs_lsn_t xfs_log_done(struct xfs_mount *mp, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - bool regrant); int xfs_log_force(struct xfs_mount *mp, uint flags); int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, int *log_forced); @@ -121,8 +117,7 @@ void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); -int xfs_log_release_iclog(struct xfs_mount *mp, - struct xlog_in_core *iclog); +void xfs_log_release_iclog(struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, int length, int count, @@ -138,7 +133,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket); void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, bool regrant); -void xlog_cil_process_committed(struct list_head *list, bool aborted); +void xlog_cil_process_committed(struct list_head *list); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 48435cf2aa16..b0ef071b3cb5 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -37,8 +37,7 @@ xlog_cil_ticket_alloc( { struct xlog_ticket *tic; - tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, - KM_NOFS); + tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0); /* * set the current reservation to zero so we know to steal the basic @@ -240,7 +239,7 @@ xfs_cil_prepare_item( * this CIL context and so we need to pin it. If we are replacing the * old_lv, then remove the space it accounts for and make it the shadow * buffer for later freeing. In both cases we are now switching to the - * shadow buffer, so update the the pointer to it appropriately. + * shadow buffer, so update the pointer to it appropriately. */ if (!old_lv) { if (lv->lv_item->li_ops->iop_pin) @@ -574,10 +573,10 @@ xlog_discard_busy_extents( */ static void xlog_cil_committed( - struct xfs_cil_ctx *ctx, - bool abort) + struct xfs_cil_ctx *ctx) { struct xfs_mount *mp = ctx->cil->xc_log->l_mp; + bool abort = XLOG_FORCED_SHUTDOWN(ctx->cil->xc_log); /* * If the I/O failed, we're aborting the commit and already shutdown. @@ -613,37 +612,38 @@ xlog_cil_committed( void xlog_cil_process_committed( - struct list_head *list, - bool aborted) + struct list_head *list) { struct xfs_cil_ctx *ctx; while ((ctx = list_first_entry_or_null(list, struct xfs_cil_ctx, iclog_entry))) { list_del(&ctx->iclog_entry); - xlog_cil_committed(ctx, aborted); + xlog_cil_committed(ctx); } } /* - * Push the Committed Item List to the log. If @push_seq flag is zero, then it - * is a background flush and so we can chose to ignore it. Otherwise, if the - * current sequence is the same as @push_seq we need to do a flush. 
If - * @push_seq is less than the current sequence, then it has already been + * Push the Committed Item List to the log. + * + * If the current sequence is the same as xc_push_seq we need to do a flush. If + * xc_push_seq is less than the current sequence, then it has already been * flushed and we don't need to do anything - the caller will wait for it to * complete if necessary. * - * @push_seq is a value rather than a flag because that allows us to do an - * unlocked check of the sequence number for a match. Hence we can allows log - * forces to run racily and not issue pushes for the same sequence twice. If we - * get a race between multiple pushes for the same sequence they will block on - * the first one and then abort, hence avoiding needless pushes. + * xc_push_seq is checked unlocked against the sequence number for a match. + * Hence we can allow log forces to run racily and not issue pushes for the + * same sequence twice. If we get a race between multiple pushes for the same + * sequence they will block on the first one and then abort, hence avoiding + * needless pushes. */ -STATIC int -xlog_cil_push( - struct xlog *log) +static void +xlog_cil_push_work( + struct work_struct *work) { - struct xfs_cil *cil = log->l_cilp; + struct xfs_cil *cil = + container_of(work, struct xfs_cil, xc_push_work); + struct xlog *log = cil->xc_log; struct xfs_log_vec *lv; struct xfs_cil_ctx *ctx; struct xfs_cil_ctx *new_ctx; @@ -657,9 +657,6 @@ xlog_cil_push( xfs_lsn_t commit_lsn; xfs_lsn_t push_seq; - if (!cil) - return 0; - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -671,6 +668,12 @@ xlog_cil_push( ASSERT(push_seq <= ctx->sequence); /* + * Wake up any background push waiters now this context is being pushed. + */ + if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) + wake_up_all(&cil->xc_push_wait); + + /* * Check if we've anything to push. If there is nothing, then we don't * move on to a new sequence number and so we have to be able to push * this sequence again later. @@ -740,7 +743,7 @@ xlog_cil_push( /* * initialise the new context and attach it to the CIL. Then attach - * the current context to the CIL committing lsit so it can be found + * the current context to the CIL committing list so it can be found * during log forces to extract the commit lsn of the sequence that * needs to be forced. */ @@ -803,7 +806,7 @@ xlog_cil_push( lvhdr.lv_iovecp = &lhdr; lvhdr.lv_next = ctx->lv_chain; - error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true); if (error) goto out_abort_free_ticket; @@ -841,10 +844,11 @@ restart: } spin_unlock(&cil->xc_push_lock); - /* xfs_log_done always frees the ticket on error. */ - commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false); - if (commit_lsn == -1) - goto out_abort; + error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn); + if (error) + goto out_abort_free_ticket; + + xfs_log_ticket_ungrant(log, tic); spin_lock(&commit_iclog->ic_callback_lock); if (commit_iclog->ic_state == XLOG_STATE_IOERROR) { @@ -867,28 +871,20 @@ restart: spin_unlock(&cil->xc_push_lock); /* release the hounds! 
*/ - return xfs_log_release_iclog(log->l_mp, commit_iclog); + xfs_log_release_iclog(commit_iclog); + return; out_skip: up_write(&cil->xc_ctx_lock); xfs_log_ticket_put(new_ctx->ticket); kmem_free(new_ctx); - return 0; + return; out_abort_free_ticket: - xfs_log_ticket_put(tic); + xfs_log_ticket_ungrant(log, tic); out_abort: - xlog_cil_committed(ctx, true); - return -EIO; -} - -static void -xlog_cil_push_work( - struct work_struct *work) -{ - struct xfs_cil *cil = container_of(work, struct xfs_cil, - xc_push_work); - xlog_cil_push(cil->xc_log); + ASSERT(XLOG_FORCED_SHUTDOWN(log)); + xlog_cil_committed(ctx); } /* @@ -900,7 +896,7 @@ xlog_cil_push_work( */ static void xlog_cil_push_background( - struct xlog *log) + struct xlog *log) __releases(cil->xc_ctx_lock) { struct xfs_cil *cil = log->l_cilp; @@ -914,14 +910,36 @@ xlog_cil_push_background( * don't do a background push if we haven't used up all the * space available yet. */ - if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) { + up_read(&cil->xc_ctx_lock); return; + } spin_lock(&cil->xc_push_lock); if (cil->xc_push_seq < cil->xc_current_sequence) { cil->xc_push_seq = cil->xc_current_sequence; queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); } + + /* + * Drop the context lock now, we can't hold that if we need to sleep + * because we are over the blocking threshold. The push_lock is still + * held, so blocking threshold sleep/wakeup is still correctly + * serialised here. + */ + up_read(&cil->xc_ctx_lock); + + /* + * If we are well over the space limit, throttle the work that is being + * done until the push work on this context has begun. + */ + if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) { + trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); + ASSERT(cil->xc_ctx->space_used < log->l_logsize); + xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock); + return; + } + spin_unlock(&cil->xc_push_lock); } @@ -1017,7 +1035,10 @@ xfs_log_commit_cil( if (commit_lsn) *commit_lsn = xc_commit_lsn; - xfs_log_done(mp, tp->t_ticket, NULL, regrant); + if (regrant && !XLOG_FORCED_SHUTDOWN(log)) + xfs_log_ticket_regrant(log, tp->t_ticket); + else + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; xfs_trans_unreserve_and_mod_sb(tp); @@ -1038,9 +1059,9 @@ xfs_log_commit_cil( if (lip->li_ops->iop_committing) lip->li_ops->iop_committing(lip, xc_commit_lsn); } - xlog_cil_push_background(log); - up_read(&cil->xc_ctx_lock); + /* xlog_cil_push_background() releases cil->xc_ctx_lock */ + xlog_cil_push_background(log); } /* @@ -1194,6 +1215,7 @@ xlog_cil_init( INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); spin_lock_init(&cil->xc_push_lock); + init_waitqueue_head(&cil->xc_push_wait); init_rwsem(&cil->xc_ctx_lock); init_waitqueue_head(&cil->xc_commit_wait); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b192c5a9f9fd..1c6fdbf3d506 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -51,13 +51,11 @@ enum xlog_iclog_state { }; /* - * Flags to log ticket + * Log ticket flags */ -#define XLOG_TIC_INITED 0x1 /* has been initialized */ -#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ +#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */ #define XLOG_TIC_FLAGS \ - { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } /* @@ -275,6 +273,7 @@ struct xfs_cil { wait_queue_head_t xc_commit_wait; xfs_lsn_t xc_current_sequence; struct work_struct xc_push_work; + 
wait_queue_head_t xc_push_wait; /* background push throttle */ } ____cacheline_aligned_in_smp; /* @@ -318,13 +317,53 @@ struct xfs_cil { * tries to keep 25% of the log free, so we need to keep below that limit or we * risk running out of free log space to start any new transactions. * - * In order to keep background CIL push efficient, we will set a lower - * threshold at which background pushing is attempted without blocking current - * transaction commits. A separate, higher bound defines when CIL pushes are - * enforced to ensure we stay within our maximum checkpoint size bounds. - * threshold, yet give us plenty of space for aggregation on large logs. + * In order to keep background CIL push efficient, we only need to ensure the + * CIL is large enough to maintain sufficient in-memory relogging to avoid + * repeated physical writes of frequently modified metadata. If we allow the CIL + * to grow to a substantial fraction of the log, then we may be pinning hundreds + * of megabytes of metadata in memory until the CIL flushes. This can cause + * issues when we are running low on memory - pinned memory cannot be reclaimed, + * and the CIL consumes a lot of memory. Hence we need to set an upper physical + * size limit for the CIL that limits the maximum amount of memory pinned by the + * CIL but does not limit performance by reducing relogging efficiency + * significantly. + * + * As such, the CIL push threshold ends up being the smaller of two thresholds: + * - a threshold large enough that it allows CIL to be pushed and progress to be + * made without excessive blocking of incoming transaction commits. This is + * defined to be 12.5% of the log space - half the 25% push threshold of the + * AIL. + * - small enough that it doesn't pin excessive amounts of memory but maintains + * close to peak relogging efficiency. This is defined to be 16x the iclog + * buffer window (32MB) as measurements have shown this to be roughly the + * point of diminishing performance increases under highly concurrent + * modification workloads. + * + * To prevent the CIL from overflowing upper commit size bounds, we introduce a + * new threshold at which we block committing transactions until the background + * CIL commit commences and switches to a new context. While this is not a hard + * limit, it forces the process committing a transaction to the CIL to block and + * yeild the CPU, giving the CIL push work a chance to be scheduled and start + * work. This prevents a process running lots of transactions from overfilling + * the CIL because it is not yielding the CPU. We set the blocking limit at + * twice the background push space threshold so we keep in line with the AIL + * push thresholds. + * + * Note: this is not a -hard- limit as blocking is applied after the transaction + * is inserted into the CIL and the push has been triggered. It is largely a + * throttling mechanism that allows the CIL push to be scheduled and run. A hard + * limit will be difficult to implement without introducing global serialisation + * in the CIL commit fast path, and it's not at all clear that we actually need + * such hard limits given the ~7 years we've run without a hard limit before + * finding the first situation where a checkpoint size overflow actually + * occurred. Hence the simple throttle, and an ASSERT check to tell us that + * we've overrun the max size. 
*/ -#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) +#define XLOG_CIL_SPACE_LIMIT(log) \ + min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4) + +#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \ + (XLOG_CIL_SPACE_LIMIT(log) * 2) /* * ticket grant locks, queues and accounting have their own cachlines @@ -402,7 +441,8 @@ struct xlog { #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) -#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) +#define XLOG_FORCED_SHUTDOWN(log) \ + (unlikely((log)->l_flags & XLOG_IO_ERROR)) /* common routines */ extern int @@ -424,9 +464,7 @@ xlog_ticket_alloc( int unit_bytes, int count, char client, - bool permanent, - xfs_km_flags_t alloc_flags); - + bool permanent); static inline void xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) @@ -438,14 +476,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); -int -xlog_write( - struct xlog *log, - struct xfs_log_vec *log_vector, - struct xlog_ticket *tic, - xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, - uint flags); +int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, uint flags, + bool need_start_rec); +int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, + struct xlog_in_core **iclog, xfs_lsn_t *lsn); +void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); +void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); /* * When we crack an atomic LSN, we sample it first so that the value will not @@ -525,12 +563,6 @@ xlog_cil_force(struct xlog *log) } /* - * Unmount record type is used as a pseudo transaction type for the ticket. - * It's value must be outside the range of XFS_TRANS_* values. - */ -#define XLOG_UNMOUNT_REC_TYPE (-1U) - -/* * Wrapper function for waiting on a wait queue serialised against wakeups * by a spinlock. This matches the semantics of all the wait queues used in the * log code. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 25cfc85dbaa7..e2ec91b2d0f4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -18,21 +18,13 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" -#include "xfs_inode_item.h" -#include "xfs_extfree_item.h" #include "xfs_trans_priv.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" -#include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_icache.h" -#include "xfs_bmap_btree.h" #include "xfs_error.h" -#include "xfs_dir2.h" -#include "xfs_rmap_item.h" #include "xfs_buf_item.h" -#include "xfs_refcount_item.h" -#include "xfs_bmap_item.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -56,17 +48,6 @@ xlog_do_recovery_pass( struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); /* - * This structure is used during recovery to record the buf log items which - * have been canceled and should not be replayed. 
- */ -struct xfs_buf_cancel { - xfs_daddr_t bc_blkno; - uint bc_len; - int bc_refcount; - struct list_head bc_list; -}; - -/* * Sector aligned buffer routines for buffer create/read/write/access */ @@ -284,7 +265,7 @@ xlog_header_check_mount( return 0; } -STATIC void +void xlog_recover_iodone( struct xfs_buf *bp) { @@ -306,9 +287,8 @@ xlog_recover_iodone( if (bp->b_log_item) xfs_buf_item_relse(bp); ASSERT(bp->b_log_item == NULL); - - bp->b_iodone = NULL; - xfs_buf_ioend(bp); + bp->b_flags &= ~_XBF_LOGRECOVERY; + xfs_buf_ioend_finish(bp); } /* @@ -1120,7 +1100,7 @@ xlog_verify_head( * * Note that xlog_find_tail() clears the blocks at the new head * (i.e., the records with invalid CRC) if the cycle number - * matches the the current cycle. + * matches the current cycle. */ found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, buffer, rhead_blk, rhead, wrapped); @@ -1779,12 +1759,72 @@ xlog_clear_stale_blocks( return 0; } +/* + * Release the recovered intent item in the AIL that matches the given intent + * type and intent id. + */ +void +xlog_recover_release_intent( + struct xlog *log, + unsigned short intent_type, + uint64_t intent_id) +{ + struct xfs_ail_cursor cur; + struct xfs_log_item *lip; + struct xfs_ail *ailp = log->l_ailp; + + spin_lock(&ailp->ail_lock); + for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; + lip = xfs_trans_ail_cursor_next(ailp, &cur)) { + if (lip->li_type != intent_type) + continue; + if (!lip->li_ops->iop_match(lip, intent_id)) + continue; + + spin_unlock(&ailp->ail_lock); + lip->li_ops->iop_release(lip); + spin_lock(&ailp->ail_lock); + break; + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->ail_lock); +} + /****************************************************************************** * * Log recover routines * ****************************************************************************** */ +static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { + &xlog_buf_item_ops, + &xlog_inode_item_ops, + &xlog_dquot_item_ops, + &xlog_quotaoff_item_ops, + &xlog_icreate_item_ops, + &xlog_efi_item_ops, + &xlog_efd_item_ops, + &xlog_rui_item_ops, + &xlog_rud_item_ops, + &xlog_cui_item_ops, + &xlog_cud_item_ops, + &xlog_bui_item_ops, + &xlog_bud_item_ops, +}; + +static const struct xlog_recover_item_ops * +xlog_find_item_ops( + struct xlog_recover_item *item) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++) + if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type) + return xlog_recover_item_ops[i]; + + return NULL; +} /* * Sort the log items in the transaction. 
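The ops table above replaces the long ITEM_TYPE() switch in xlog_recover_reorder_trans() (next hunk): each log item type now exports a struct xlog_recover_item_ops keyed by ->item_type, with an optional ->reorder() callback deciding which sort list the item lands on. A sketch of the pattern for one simple type, assuming the ICREATE definition mirrors the old switch (which moved XFS_LI_ICREATE items to the buffer list); illustrative, not the verbatim upstream definition:

static enum xlog_recover_reorder
xlog_icreate_reorder(
	struct xlog_recover_item *item)
{
	/* The old switch moved XFS_LI_ICREATE items to the buffer list. */
	return XLOG_REORDER_BUFFER_LIST;
}

const struct xlog_recover_item_ops xlog_icreate_item_ops = {
	.item_type	= XFS_LI_ICREATE,
	.reorder	= xlog_icreate_reorder,
};

Types that supply no ->reorder() callback fall through to the XLOG_REORDER_ITEM_LIST default, matching the old inode/dquot/intent-item cases.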
@@ -1841,54 +1881,23 @@ xlog_recover_reorder_trans( struct xlog_recover *trans, int pass) { - xlog_recover_item_t *item, *n; + struct xlog_recover_item *item, *n; int error = 0; LIST_HEAD(sort_list); LIST_HEAD(cancel_list); LIST_HEAD(buffer_list); LIST_HEAD(inode_buffer_list); - LIST_HEAD(inode_list); + LIST_HEAD(item_list); list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { - xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST; - switch (ITEM_TYPE(item)) { - case XFS_LI_ICREATE: - list_move_tail(&item->ri_list, &buffer_list); - break; - case XFS_LI_BUF: - if (buf_f->blf_flags & XFS_BLF_CANCEL) { - trace_xfs_log_recover_item_reorder_head(log, - trans, item, pass); - list_move(&item->ri_list, &cancel_list); - break; - } - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { - list_move(&item->ri_list, &inode_buffer_list); - break; - } - list_move_tail(&item->ri_list, &buffer_list); - break; - case XFS_LI_INODE: - case XFS_LI_DQUOT: - case XFS_LI_QUOTAOFF: - case XFS_LI_EFD: - case XFS_LI_EFI: - case XFS_LI_RUI: - case XFS_LI_RUD: - case XFS_LI_CUI: - case XFS_LI_CUD: - case XFS_LI_BUI: - case XFS_LI_BUD: - trace_xfs_log_recover_item_reorder_tail(log, - trans, item, pass); - list_move_tail(&item->ri_list, &inode_list); - break; - default: + item->ri_ops = xlog_find_item_ops(item); + if (!item->ri_ops) { xfs_warn(log->l_mp, - "%s: unrecognized type of log operation", - __func__); + "%s: unrecognized type of log operation (%d)", + __func__, ITEM_TYPE(item)); ASSERT(0); /* * return the remaining items back to the transaction @@ -1896,16 +1905,38 @@ xlog_recover_reorder_trans( */ if (!list_empty(&sort_list)) list_splice_init(&sort_list, &trans->r_itemq); - error = -EIO; - goto out; + error = -EFSCORRUPTED; + break; + } + + if (item->ri_ops->reorder) + fate = item->ri_ops->reorder(item); + + switch (fate) { + case XLOG_REORDER_BUFFER_LIST: + list_move_tail(&item->ri_list, &buffer_list); + break; + case XLOG_REORDER_CANCEL_LIST: + trace_xfs_log_recover_item_reorder_head(log, + trans, item, pass); + list_move(&item->ri_list, &cancel_list); + break; + case XLOG_REORDER_INODE_BUFFER_LIST: + list_move(&item->ri_list, &inode_buffer_list); + break; + case XLOG_REORDER_ITEM_LIST: + trace_xfs_log_recover_item_reorder_tail(log, + trans, item, pass); + list_move_tail(&item->ri_list, &item_list); + break; } } -out: + ASSERT(list_empty(&sort_list)); if (!list_empty(&buffer_list)) list_splice(&buffer_list, &trans->r_itemq); - if (!list_empty(&inode_list)) - list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&item_list)) + list_splice_tail(&item_list, &trans->r_itemq); if (!list_empty(&inode_buffer_list)) list_splice_tail(&inode_buffer_list, &trans->r_itemq); if (!list_empty(&cancel_list)) @@ -1913,2152 +1944,15 @@ out: return error; } -/* - * Build up the table of buf cancel records so that we don't replay - * cancelled data in the second pass. For buffer records that are - * not cancel records, there is nothing to do here so we just return. - * - * If we get a cancel record which is already in the table, this indicates - * that the buffer was cancelled multiple times. In order to ensure - * that during pass 2 we keep the record in the table until we reach its - * last occurrence in the log, we keep a reference count in the cancel - * record in the table to tell us how many times we expect to see this - * record during the second pass. 
- */ -STATIC int -xlog_recover_buffer_pass1( - struct xlog *log, - struct xlog_recover_item *item) -{ - xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; - struct list_head *bucket; - struct xfs_buf_cancel *bcp; - - if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { - xfs_err(log->l_mp, "bad buffer log item size (%d)", - item->ri_buf[0].i_len); - return -EFSCORRUPTED; - } - - /* - * If this isn't a cancel buffer item, then just return. - */ - if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { - trace_xfs_log_recover_buf_not_cancel(log, buf_f); - return 0; - } - - /* - * Insert an xfs_buf_cancel record into the hash table of them. - * If there is already an identical record, bump its reference count. - */ - bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); - list_for_each_entry(bcp, bucket, bc_list) { - if (bcp->bc_blkno == buf_f->blf_blkno && - bcp->bc_len == buf_f->blf_len) { - bcp->bc_refcount++; - trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); - return 0; - } - } - - bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); - bcp->bc_blkno = buf_f->blf_blkno; - bcp->bc_len = buf_f->blf_len; - bcp->bc_refcount = 1; - list_add_tail(&bcp->bc_list, bucket); - - trace_xfs_log_recover_buf_cancel_add(log, buf_f); - return 0; -} - -/* - * Check to see whether the buffer being recovered has a corresponding - * entry in the buffer cancel record table. If it is, return the cancel - * buffer structure to the caller. - */ -STATIC struct xfs_buf_cancel * -xlog_peek_buffer_cancelled( +void +xlog_buf_readahead( struct xlog *log, xfs_daddr_t blkno, uint len, - unsigned short flags) + const struct xfs_buf_ops *ops) { - struct list_head *bucket; - struct xfs_buf_cancel *bcp; - - if (!log->l_buf_cancel_table) { - /* empty table means no cancelled buffers in the log */ - ASSERT(!(flags & XFS_BLF_CANCEL)); - return NULL; - } - - bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); - list_for_each_entry(bcp, bucket, bc_list) { - if (bcp->bc_blkno == blkno && bcp->bc_len == len) - return bcp; - } - - /* - * We didn't find a corresponding entry in the table, so return 0 so - * that the buffer is NOT cancelled. - */ - ASSERT(!(flags & XFS_BLF_CANCEL)); - return NULL; -} - -/* - * If the buffer is being cancelled then return 1 so that it will be cancelled, - * otherwise return 0. If the buffer is actually a buffer cancel item - * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the - * table and remove it from the table if this is the last reference. - * - * We remove the cancel record from the table when we encounter its last - * occurrence in the log so that if the same buffer is re-used again after its - * last cancellation we actually replay the changes made at that point. - */ -STATIC int -xlog_check_buffer_cancelled( - struct xlog *log, - xfs_daddr_t blkno, - uint len, - unsigned short flags) -{ - struct xfs_buf_cancel *bcp; - - bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags); - if (!bcp) - return 0; - - /* - * We've go a match, so return 1 so that the recovery of this buffer - * is cancelled. If this buffer is actually a buffer cancel log - * item, then decrement the refcount on the one in the table and - * remove it if this is the last reference. - */ - if (flags & XFS_BLF_CANCEL) { - if (--bcp->bc_refcount == 0) { - list_del(&bcp->bc_list); - kmem_free(bcp); - } - } - return 1; -} - -/* - * Perform recovery for a buffer full of inodes. 
In these buffers, the only - * data which should be recovered is that which corresponds to the - * di_next_unlinked pointers in the on disk inode structures. The rest of the - * data for the inodes is always logged through the inodes themselves rather - * than the inode buffer and is recovered in xlog_recover_inode_pass2(). - * - * The only time when buffers full of inodes are fully recovered is when the - * buffer is full of newly allocated inodes. In this case the buffer will - * not be marked as an inode buffer and so will be sent to - * xlog_recover_do_reg_buffer() below during recovery. - */ -STATIC int -xlog_recover_do_inode_buffer( - struct xfs_mount *mp, - xlog_recover_item_t *item, - struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f) -{ - int i; - int item_index = 0; - int bit = 0; - int nbits = 0; - int reg_buf_offset = 0; - int reg_buf_bytes = 0; - int next_unlinked_offset; - int inodes_per_buf; - xfs_agino_t *logged_nextp; - xfs_agino_t *buffer_nextp; - - trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - - /* - * Post recovery validation only works properly on CRC enabled - * filesystems. - */ - if (xfs_sb_version_hascrc(&mp->m_sb)) - bp->b_ops = &xfs_inode_buf_ops; - - inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; - for (i = 0; i < inodes_per_buf; i++) { - next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + - offsetof(xfs_dinode_t, di_next_unlinked); - - while (next_unlinked_offset >= - (reg_buf_offset + reg_buf_bytes)) { - /* - * The next di_next_unlinked field is beyond - * the current logged region. Find the next - * logged region that contains or is beyond - * the current di_next_unlinked field. - */ - bit += nbits; - bit = xfs_next_bit(buf_f->blf_data_map, - buf_f->blf_map_size, bit); - - /* - * If there are no more logged regions in the - * buffer, then we're done. - */ - if (bit == -1) - return 0; - - nbits = xfs_contig_bits(buf_f->blf_data_map, - buf_f->blf_map_size, bit); - ASSERT(nbits > 0); - reg_buf_offset = bit << XFS_BLF_SHIFT; - reg_buf_bytes = nbits << XFS_BLF_SHIFT; - item_index++; - } - - /* - * If the current logged region starts after the current - * di_next_unlinked field, then move on to the next - * di_next_unlinked field. - */ - if (next_unlinked_offset < reg_buf_offset) - continue; - - ASSERT(item->ri_buf[item_index].i_addr != NULL); - ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); - ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); - - /* - * The current logged region contains a copy of the - * current di_next_unlinked field. Extract its value - * and copy it to the buffer copy. - */ - logged_nextp = item->ri_buf[item_index].i_addr + - next_unlinked_offset - reg_buf_offset; - if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { - xfs_alert(mp, - "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). " - "Trying to replay bad (0) inode di_next_unlinked field.", - item, bp); - return -EFSCORRUPTED; - } - - buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); - *buffer_nextp = *logged_nextp; - - /* - * If necessary, recalculate the CRC in the on-disk inode. We - * have to leave the inode in a consistent state for whoever - * reads it next.... - */ - xfs_dinode_calc_crc(mp, - xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); - - } - - return 0; -} - -/* - * V5 filesystems know the age of the buffer on disk being recovered. 
We can - * have newer objects on disk than we are replaying, and so for these cases we - * don't want to replay the current change as that will make the buffer contents - * temporarily invalid on disk. - * - * The magic number might not match the buffer type we are going to recover - * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence - * extract the LSN of the existing object in the buffer based on it's current - * magic number. If we don't recognise the magic number in the buffer, then - * return a LSN of -1 so that the caller knows it was an unrecognised block and - * so can recover the buffer. - * - * Note: we cannot rely solely on magic number matches to determine that the - * buffer has a valid LSN - we also need to verify that it belongs to this - * filesystem, so we need to extract the object's LSN and compare it to that - * which we read from the superblock. If the UUIDs don't match, then we've got a - * stale metadata block from an old filesystem instance that we need to recover - * over the top of. - */ -static xfs_lsn_t -xlog_recover_get_buf_lsn( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - uint32_t magic32; - uint16_t magic16; - uint16_t magicda; - void *blk = bp->b_addr; - uuid_t *uuid; - xfs_lsn_t lsn = -1; - - /* v4 filesystems always recover immediately */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) - goto recover_immediately; - - magic32 = be32_to_cpu(*(__be32 *)blk); - switch (magic32) { - case XFS_ABTB_CRC_MAGIC: - case XFS_ABTC_CRC_MAGIC: - case XFS_ABTB_MAGIC: - case XFS_ABTC_MAGIC: - case XFS_RMAP_CRC_MAGIC: - case XFS_REFC_CRC_MAGIC: - case XFS_IBT_CRC_MAGIC: - case XFS_IBT_MAGIC: { - struct xfs_btree_block *btb = blk; - - lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); - uuid = &btb->bb_u.s.bb_uuid; - break; - } - case XFS_BMAP_CRC_MAGIC: - case XFS_BMAP_MAGIC: { - struct xfs_btree_block *btb = blk; - - lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); - uuid = &btb->bb_u.l.bb_uuid; - break; - } - case XFS_AGF_MAGIC: - lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); - uuid = &((struct xfs_agf *)blk)->agf_uuid; - break; - case XFS_AGFL_MAGIC: - lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); - uuid = &((struct xfs_agfl *)blk)->agfl_uuid; - break; - case XFS_AGI_MAGIC: - lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); - uuid = &((struct xfs_agi *)blk)->agi_uuid; - break; - case XFS_SYMLINK_MAGIC: - lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); - uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; - break; - case XFS_DIR3_BLOCK_MAGIC: - case XFS_DIR3_DATA_MAGIC: - case XFS_DIR3_FREE_MAGIC: - lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); - uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; - break; - case XFS_ATTR3_RMT_MAGIC: - /* - * Remote attr blocks are written synchronously, rather than - * being logged. That means they do not contain a valid LSN - * (i.e. transactionally ordered) in them, and hence any time we - * see a buffer to replay over the top of a remote attribute - * block we should simply do so. - */ - goto recover_immediately; - case XFS_SB_MAGIC: - /* - * superblock uuids are magic. We may or may not have a - * sb_meta_uuid on disk, but it will be set in the in-core - * superblock. We set the uuid pointer for verification - * according to the superblock feature mask to ensure we check - * the relevant UUID in the superblock. 
- */ - lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); - if (xfs_sb_version_hasmetauuid(&mp->m_sb)) - uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; - else - uuid = &((struct xfs_dsb *)blk)->sb_uuid; - break; - default: - break; - } - - if (lsn != (xfs_lsn_t)-1) { - if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) - goto recover_immediately; - return lsn; - } - - magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); - switch (magicda) { - case XFS_DIR3_LEAF1_MAGIC: - case XFS_DIR3_LEAFN_MAGIC: - case XFS_DA3_NODE_MAGIC: - lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); - uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; - break; - default: - break; - } - - if (lsn != (xfs_lsn_t)-1) { - if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) - goto recover_immediately; - return lsn; - } - - /* - * We do individual object checks on dquot and inode buffers as they - * have their own individual LSN records. Also, we could have a stale - * buffer here, so we have to at least recognise these buffer types. - * - * A notd complexity here is inode unlinked list processing - it logs - * the inode directly in the buffer, but we don't know which inodes have - * been modified, and there is no global buffer LSN. Hence we need to - * recover all inode buffer types immediately. This problem will be - * fixed by logical logging of the unlinked list modifications. - */ - magic16 = be16_to_cpu(*(__be16 *)blk); - switch (magic16) { - case XFS_DQUOT_MAGIC: - case XFS_DINODE_MAGIC: - goto recover_immediately; - default: - break; - } - - /* unknown buffer contents, recover immediately */ - -recover_immediately: - return (xfs_lsn_t)-1; - -} - -/* - * Validate the recovered buffer is of the correct type and attach the - * appropriate buffer operations to them for writeback. Magic numbers are in a - * few places: - * the first 16 bits of the buffer (inode buffer, dquot buffer), - * the first 32 bits of the buffer (most blocks), - * inside a struct xfs_da_blkinfo at the start of the buffer. - */ -static void -xlog_recover_validate_buf_type( - struct xfs_mount *mp, - struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f, - xfs_lsn_t current_lsn) -{ - struct xfs_da_blkinfo *info = bp->b_addr; - uint32_t magic32; - uint16_t magic16; - uint16_t magicda; - char *warnmsg = NULL; - - /* - * We can only do post recovery validation on items on CRC enabled - * fielsystems as we need to know when the buffer was written to be able - * to determine if we should have replayed the item. If we replay old - * metadata over a newer buffer, then it will enter a temporarily - * inconsistent state resulting in verification failures. 
Hence for now - * just avoid the verification stage for non-crc filesystems - */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; - - magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); - magic16 = be16_to_cpu(*(__be16*)bp->b_addr); - magicda = be16_to_cpu(info->magic); - switch (xfs_blft_from_flags(buf_f)) { - case XFS_BLFT_BTREE_BUF: - switch (magic32) { - case XFS_ABTB_CRC_MAGIC: - case XFS_ABTB_MAGIC: - bp->b_ops = &xfs_bnobt_buf_ops; - break; - case XFS_ABTC_CRC_MAGIC: - case XFS_ABTC_MAGIC: - bp->b_ops = &xfs_cntbt_buf_ops; - break; - case XFS_IBT_CRC_MAGIC: - case XFS_IBT_MAGIC: - bp->b_ops = &xfs_inobt_buf_ops; - break; - case XFS_FIBT_CRC_MAGIC: - case XFS_FIBT_MAGIC: - bp->b_ops = &xfs_finobt_buf_ops; - break; - case XFS_BMAP_CRC_MAGIC: - case XFS_BMAP_MAGIC: - bp->b_ops = &xfs_bmbt_buf_ops; - break; - case XFS_RMAP_CRC_MAGIC: - bp->b_ops = &xfs_rmapbt_buf_ops; - break; - case XFS_REFC_CRC_MAGIC: - bp->b_ops = &xfs_refcountbt_buf_ops; - break; - default: - warnmsg = "Bad btree block magic!"; - break; - } - break; - case XFS_BLFT_AGF_BUF: - if (magic32 != XFS_AGF_MAGIC) { - warnmsg = "Bad AGF block magic!"; - break; - } - bp->b_ops = &xfs_agf_buf_ops; - break; - case XFS_BLFT_AGFL_BUF: - if (magic32 != XFS_AGFL_MAGIC) { - warnmsg = "Bad AGFL block magic!"; - break; - } - bp->b_ops = &xfs_agfl_buf_ops; - break; - case XFS_BLFT_AGI_BUF: - if (magic32 != XFS_AGI_MAGIC) { - warnmsg = "Bad AGI block magic!"; - break; - } - bp->b_ops = &xfs_agi_buf_ops; - break; - case XFS_BLFT_UDQUOT_BUF: - case XFS_BLFT_PDQUOT_BUF: - case XFS_BLFT_GDQUOT_BUF: -#ifdef CONFIG_XFS_QUOTA - if (magic16 != XFS_DQUOT_MAGIC) { - warnmsg = "Bad DQUOT block magic!"; - break; - } - bp->b_ops = &xfs_dquot_buf_ops; -#else - xfs_alert(mp, - "Trying to recover dquots without QUOTA support built in!"); - ASSERT(0); -#endif - break; - case XFS_BLFT_DINO_BUF: - if (magic16 != XFS_DINODE_MAGIC) { - warnmsg = "Bad INODE block magic!"; - break; - } - bp->b_ops = &xfs_inode_buf_ops; - break; - case XFS_BLFT_SYMLINK_BUF: - if (magic32 != XFS_SYMLINK_MAGIC) { - warnmsg = "Bad symlink block magic!"; - break; - } - bp->b_ops = &xfs_symlink_buf_ops; - break; - case XFS_BLFT_DIR_BLOCK_BUF: - if (magic32 != XFS_DIR2_BLOCK_MAGIC && - magic32 != XFS_DIR3_BLOCK_MAGIC) { - warnmsg = "Bad dir block magic!"; - break; - } - bp->b_ops = &xfs_dir3_block_buf_ops; - break; - case XFS_BLFT_DIR_DATA_BUF: - if (magic32 != XFS_DIR2_DATA_MAGIC && - magic32 != XFS_DIR3_DATA_MAGIC) { - warnmsg = "Bad dir data magic!"; - break; - } - bp->b_ops = &xfs_dir3_data_buf_ops; - break; - case XFS_BLFT_DIR_FREE_BUF: - if (magic32 != XFS_DIR2_FREE_MAGIC && - magic32 != XFS_DIR3_FREE_MAGIC) { - warnmsg = "Bad dir3 free magic!"; - break; - } - bp->b_ops = &xfs_dir3_free_buf_ops; - break; - case XFS_BLFT_DIR_LEAF1_BUF: - if (magicda != XFS_DIR2_LEAF1_MAGIC && - magicda != XFS_DIR3_LEAF1_MAGIC) { - warnmsg = "Bad dir leaf1 magic!"; - break; - } - bp->b_ops = &xfs_dir3_leaf1_buf_ops; - break; - case XFS_BLFT_DIR_LEAFN_BUF: - if (magicda != XFS_DIR2_LEAFN_MAGIC && - magicda != XFS_DIR3_LEAFN_MAGIC) { - warnmsg = "Bad dir leafn magic!"; - break; - } - bp->b_ops = &xfs_dir3_leafn_buf_ops; - break; - case XFS_BLFT_DA_NODE_BUF: - if (magicda != XFS_DA_NODE_MAGIC && - magicda != XFS_DA3_NODE_MAGIC) { - warnmsg = "Bad da node magic!"; - break; - } - bp->b_ops = &xfs_da3_node_buf_ops; - break; - case XFS_BLFT_ATTR_LEAF_BUF: - if (magicda != XFS_ATTR_LEAF_MAGIC && - magicda != XFS_ATTR3_LEAF_MAGIC) { - warnmsg = "Bad attr leaf magic!"; - break; - } - bp->b_ops = 
&xfs_attr3_leaf_buf_ops; - break; - case XFS_BLFT_ATTR_RMT_BUF: - if (magic32 != XFS_ATTR3_RMT_MAGIC) { - warnmsg = "Bad attr remote magic!"; - break; - } - bp->b_ops = &xfs_attr3_rmt_buf_ops; - break; - case XFS_BLFT_SB_BUF: - if (magic32 != XFS_SB_MAGIC) { - warnmsg = "Bad SB block magic!"; - break; - } - bp->b_ops = &xfs_sb_buf_ops; - break; -#ifdef CONFIG_XFS_RT - case XFS_BLFT_RTBITMAP_BUF: - case XFS_BLFT_RTSUMMARY_BUF: - /* no magic numbers for verification of RT buffers */ - bp->b_ops = &xfs_rtbuf_ops; - break; -#endif /* CONFIG_XFS_RT */ - default: - xfs_warn(mp, "Unknown buffer type %d!", - xfs_blft_from_flags(buf_f)); - break; - } - - /* - * Nothing else to do in the case of a NULL current LSN as this means - * the buffer is more recent than the change in the log and will be - * skipped. - */ - if (current_lsn == NULLCOMMITLSN) - return; - - if (warnmsg) { - xfs_warn(mp, warnmsg); - ASSERT(0); - } - - /* - * We must update the metadata LSN of the buffer as it is written out to - * ensure that older transactions never replay over this one and corrupt - * the buffer. This can occur if log recovery is interrupted at some - * point after the current transaction completes, at which point a - * subsequent mount starts recovery from the beginning. - * - * Write verifiers update the metadata LSN from log items attached to - * the buffer. Therefore, initialize a bli purely to carry the LSN to - * the verifier. We'll clean it up in our ->iodone() callback. - */ - if (bp->b_ops) { - struct xfs_buf_log_item *bip; - - ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); - bp->b_iodone = xlog_recover_iodone; - xfs_buf_item_init(bp, mp); - bip = bp->b_log_item; - bip->bli_item.li_lsn = current_lsn; - } -} - -/* - * Perform a 'normal' buffer recovery. Each logged region of the - * buffer should be copied over the corresponding region in the - * given buffer. The bitmap in the buf log format structure indicates - * where to place the logged data. - */ -STATIC void -xlog_recover_do_reg_buffer( - struct xfs_mount *mp, - xlog_recover_item_t *item, - struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f, - xfs_lsn_t current_lsn) -{ - int i; - int bit; - int nbits; - xfs_failaddr_t fa; - const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); - - trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); - - bit = 0; - i = 1; /* 0 is the buf format structure */ - while (1) { - bit = xfs_next_bit(buf_f->blf_data_map, - buf_f->blf_map_size, bit); - if (bit == -1) - break; - nbits = xfs_contig_bits(buf_f->blf_data_map, - buf_f->blf_map_size, bit); - ASSERT(nbits > 0); - ASSERT(item->ri_buf[i].i_addr != NULL); - ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); - ASSERT(BBTOB(bp->b_length) >= - ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); - - /* - * The dirty regions logged in the buffer, even though - * contiguous, may span multiple chunks. This is because the - * dirty region may span a physical page boundary in a buffer - * and hence be split into two separate vectors for writing into - * the log. Hence we need to trim nbits back to the length of - * the current region being copied out of the log. - */ - if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) - nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; - - /* - * Do a sanity check if this is a dquot buffer. Just checking - * the first dquot in the buffer should do. XXXThis is - * probably a good thing to do for other buf types also. 
- */ - fa = NULL; - if (buf_f->blf_flags & - (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { - if (item->ri_buf[i].i_addr == NULL) { - xfs_alert(mp, - "XFS: NULL dquot in %s.", __func__); - goto next; - } - if (item->ri_buf[i].i_len < size_disk_dquot) { - xfs_alert(mp, - "XFS: dquot too small (%d) in %s.", - item->ri_buf[i].i_len, __func__); - goto next; - } - fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, - -1, 0); - if (fa) { - xfs_alert(mp, - "dquot corrupt at %pS trying to replay into block 0x%llx", - fa, bp->b_bn); - goto next; - } - } - - memcpy(xfs_buf_offset(bp, - (uint)bit << XFS_BLF_SHIFT), /* dest */ - item->ri_buf[i].i_addr, /* source */ - nbits<<XFS_BLF_SHIFT); /* length */ - next: - i++; - bit += nbits; - } - - /* Shouldn't be any more regions */ - ASSERT(i == item->ri_total); - - xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); -} - -/* - * Perform a dquot buffer recovery. - * Simple algorithm: if we have found a QUOTAOFF log item of the same type - * (ie. USR or GRP), then just toss this buffer away; don't recover it. - * Else, treat it as a regular buffer and do recovery. - * - * Return false if the buffer was tossed and true if we recovered the buffer to - * indicate to the caller if the buffer needs writing. - */ -STATIC bool -xlog_recover_do_dquot_buffer( - struct xfs_mount *mp, - struct xlog *log, - struct xlog_recover_item *item, - struct xfs_buf *bp, - struct xfs_buf_log_format *buf_f) -{ - uint type; - - trace_xfs_log_recover_buf_dquot_buf(log, buf_f); - - /* - * Filesystems are required to send in quota flags at mount time. - */ - if (!mp->m_qflags) - return false; - - type = 0; - if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) - type |= XFS_DQ_USER; - if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) - type |= XFS_DQ_PROJ; - if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) - type |= XFS_DQ_GROUP; - /* - * This type of quotas was turned off, so ignore this buffer - */ - if (log->l_quotaoffs_flag & type) - return false; - - xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); - return true; -} - -/* - * This routine replays a modification made to a buffer at runtime. - * There are actually two types of buffer, regular and inode, which - * are handled differently. Inode buffers are handled differently - * in that we only recover a specific set of data from them, namely - * the inode di_next_unlinked fields. This is because all other inode - * data is actually logged via inode records and any data we replay - * here which overlaps that may be stale. - * - * When meta-data buffers are freed at run time we log a buffer item - * with the XFS_BLF_CANCEL bit set to indicate that previous copies - * of the buffer in the log should not be replayed at recovery time. - * This is so that if the blocks covered by the buffer are reused for - * file data before we crash we don't end up replaying old, freed - * meta-data into a user's file. - * - * To handle the cancellation of buffer log items, we make two passes - * over the log during recovery. During the first we build a table of - * those buffers which have been cancelled, and during the second we - * only replay those buffers which do not have corresponding cancel - * records in the table. See xlog_recover_buffer_pass[1,2] above - * for more details on the implementation of the table of cancel records. 
- */ -STATIC int -xlog_recover_buffer_pass2( - struct xlog *log, - struct list_head *buffer_list, - struct xlog_recover_item *item, - xfs_lsn_t current_lsn) -{ - xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; - xfs_mount_t *mp = log->l_mp; - xfs_buf_t *bp; - int error; - uint buf_flags; - xfs_lsn_t lsn; - - /* - * In this pass we only want to recover all the buffers which have - * not been cancelled and are not cancellation buffers themselves. - */ - if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, - buf_f->blf_len, buf_f->blf_flags)) { - trace_xfs_log_recover_buf_cancel(log, buf_f); - return 0; - } - - trace_xfs_log_recover_buf_recover(log, buf_f); - - buf_flags = 0; - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) - buf_flags |= XBF_UNMAPPED; - - error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags, &bp, NULL); - if (error) - return error; - - /* - * Recover the buffer only if we get an LSN from it and it's less than - * the lsn of the transaction we are replaying. - * - * Note that we have to be extremely careful of readahead here. - * Readahead does not attach verfiers to the buffers so if we don't - * actually do any replay after readahead because of the LSN we found - * in the buffer if more recent than that current transaction then we - * need to attach the verifier directly. Failure to do so can lead to - * future recovery actions (e.g. EFI and unlinked list recovery) can - * operate on the buffers and they won't get the verifier attached. This - * can lead to blocks on disk having the correct content but a stale - * CRC. - * - * It is safe to assume these clean buffers are currently up to date. - * If the buffer is dirtied by a later transaction being replayed, then - * the verifier will be reset to match whatever recover turns that - * buffer into. - */ - lsn = xlog_recover_get_buf_lsn(mp, bp); - if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { - trace_xfs_log_recover_buf_skip(log, buf_f); - xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); - goto out_release; - } - - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { - error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); - if (error) - goto out_release; - } else if (buf_f->blf_flags & - (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { - bool dirty; - - dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); - if (!dirty) - goto out_release; - } else { - xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); - } - - /* - * Perform delayed write on the buffer. Asynchronous writes will be - * slower when taking into account all the buffers to be flushed. - * - * Also make sure that only inode buffers with good sizes stay in - * the buffer cache. The kernel moves inodes in buffers of 1 block - * or inode_cluster_size bytes, whichever is bigger. The inode - * buffers in the log can be a different size if the log was generated - * by an older kernel using unclustered inode buffers or a newer kernel - * running with a different inode cluster size. Regardless, if the - * the inode buffer size isn't max(blocksize, inode_cluster_size) - * for *our* value of inode_cluster_size, then we need to keep - * the buffer out of the buffer cache so that the buffer won't - * overlap with future reads of those inodes. 
- */ - if (XFS_DINODE_MAGIC == - be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && - (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) { - xfs_buf_stale(bp); - error = xfs_bwrite(bp); - } else { - ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp, buffer_list); - } - -out_release: - xfs_buf_relse(bp); - return error; -} - -/* - * Inode fork owner changes - * - * If we have been told that we have to reparent the inode fork, it's because an - * extent swap operation on a CRC enabled filesystem has been done and we are - * replaying it. We need to walk the BMBT of the appropriate fork and change the - * owners of it. - * - * The complexity here is that we don't have an inode context to work with, so - * after we've replayed the inode we need to instantiate one. This is where the - * fun begins. - * - * We are in the middle of log recovery, so we can't run transactions. That - * means we cannot use cache coherent inode instantiation via xfs_iget(), as - * that will result in the corresponding iput() running the inode through - * xfs_inactive(). If we've just replayed an inode core that changes the link - * count to zero (i.e. it's been unlinked), then xfs_inactive() will run - * transactions (bad!). - * - * So, to avoid this, we instantiate an inode directly from the inode core we've - * just recovered. We have the buffer still locked, and all we really need to - * instantiate is the inode core and the forks being modified. We can do this - * manually, then run the inode btree owner change, and then tear down the - * xfs_inode without having to run any transactions at all. - * - * Also, because we don't have a transaction context available here but need to - * gather all the buffers we modify for writeback so we pass the buffer_list - * instead for the operation to use. 
- */ - -STATIC int -xfs_recover_inode_owner_change( - struct xfs_mount *mp, - struct xfs_dinode *dip, - struct xfs_inode_log_format *in_f, - struct list_head *buffer_list) -{ - struct xfs_inode *ip; - int error; - - ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); - - ip = xfs_inode_alloc(mp, in_f->ilf_ino); - if (!ip) - return -ENOMEM; - - /* instantiate the inode */ - xfs_inode_from_disk(ip, dip); - ASSERT(ip->i_d.di_version >= 3); - - error = xfs_iformat_fork(ip, dip); - if (error) - goto out_free_ip; - - if (!xfs_inode_verify_forks(ip)) { - error = -EFSCORRUPTED; - goto out_free_ip; - } - - if (in_f->ilf_fields & XFS_ILOG_DOWNER) { - ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); - error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, - ip->i_ino, buffer_list); - if (error) - goto out_free_ip; - } - - if (in_f->ilf_fields & XFS_ILOG_AOWNER) { - ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); - error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, - ip->i_ino, buffer_list); - if (error) - goto out_free_ip; - } - -out_free_ip: - xfs_inode_free(ip); - return error; -} - -STATIC int -xlog_recover_inode_pass2( - struct xlog *log, - struct list_head *buffer_list, - struct xlog_recover_item *item, - xfs_lsn_t current_lsn) -{ - struct xfs_inode_log_format *in_f; - xfs_mount_t *mp = log->l_mp; - xfs_buf_t *bp; - xfs_dinode_t *dip; - int len; - char *src; - char *dest; - int error; - int attr_index; - uint fields; - struct xfs_log_dinode *ldip; - uint isize; - int need_free = 0; - - if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { - in_f = item->ri_buf[0].i_addr; - } else { - in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); - need_free = 1; - error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); - if (error) - goto error; - } - - /* - * Inode buffers can be freed, look out for it, - * and do not replay the inode. - */ - if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, - in_f->ilf_len, 0)) { - error = 0; - trace_xfs_log_recover_inode_cancel(log, in_f); - goto error; - } - trace_xfs_log_recover_inode_recover(log, in_f); - - error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, - 0, &bp, &xfs_inode_buf_ops); - if (error) - goto error; - ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); - dip = xfs_buf_offset(bp, in_f->ilf_boffset); - - /* - * Make sure the place we're flushing out to really looks - * like an inode! - */ - if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { - xfs_alert(mp, - "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", - __func__, dip, bp, in_f->ilf_ino); - error = -EFSCORRUPTED; - goto out_release; - } - ldip = item->ri_buf[1].i_addr; - if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { - xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", - __func__, item, in_f->ilf_ino); - error = -EFSCORRUPTED; - goto out_release; - } - - /* - * If the inode has an LSN in it, recover the inode only if it's less - * than the lsn of the transaction we are replaying. Note: we still - * need to replay an owner change even though the inode is more recent - * than the transaction as there is no guarantee that all the btree - * blocks are more recent than this transaction, too. - */ - if (dip->di_version >= 3) { - xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); - - if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { - trace_xfs_log_recover_inode_skip(log, in_f); - error = 0; - goto out_owner_change; - } - } - - /* - * di_flushiter is only valid for v1/2 inodes. 
All changes for v3 inodes - * are transactional and if ordering is necessary we can determine that - * more accurately by the LSN field in the V3 inode core. Don't trust - * the inode versions we might be changing them here - use the - * superblock flag to determine whether we need to look at di_flushiter - * to skip replay when the on disk inode is newer than the log one - */ - if (!xfs_sb_version_hascrc(&mp->m_sb) && - ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { - /* - * Deal with the wrap case, DI_MAX_FLUSH is less - * than smaller numbers - */ - if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && - ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { - /* do nothing */ - } else { - trace_xfs_log_recover_inode_skip(log, in_f); - error = 0; - goto out_release; - } - } - - /* Take the opportunity to reset the flush iteration count */ - ldip->di_flushiter = 0; - - if (unlikely(S_ISREG(ldip->di_mode))) { - if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && - (ldip->di_format != XFS_DINODE_FMT_BTREE)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad regular inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); - error = -EFSCORRUPTED; - goto out_release; - } - } else if (unlikely(S_ISDIR(ldip->di_mode))) { - if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && - (ldip->di_format != XFS_DINODE_FMT_BTREE) && - (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad dir inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); - error = -EFSCORRUPTED; - goto out_release; - } - } - if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", - __func__, item, dip, bp, in_f->ilf_ino, - ldip->di_nextents + ldip->di_anextents, - ldip->di_nblocks); - error = -EFSCORRUPTED; - goto out_release; - } - if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, - item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); - error = -EFSCORRUPTED; - goto out_release; - } - isize = xfs_log_dinode_size(ldip->di_version); - if (unlikely(item->ri_buf[1].i_len > isize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad inode log record length %d, rec ptr "PTR_FMT, - __func__, item->ri_buf[1].i_len, item); - error = -EFSCORRUPTED; - goto out_release; - } - - /* recover the log dinode inode into the on disk inode */ - xfs_log_dinode_to_disk(ldip, dip); - - fields = in_f->ilf_fields; - if (fields & XFS_ILOG_DEV) - xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); - - if (in_f->ilf_size == 2) - goto out_owner_change; - len = item->ri_buf[2].i_len; - src = item->ri_buf[2].i_addr; - ASSERT(in_f->ilf_size <= 4); - ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); - ASSERT(!(fields & 
XFS_ILOG_DFORK) || - (len == in_f->ilf_dsize)); - - switch (fields & XFS_ILOG_DFORK) { - case XFS_ILOG_DDATA: - case XFS_ILOG_DEXT: - memcpy(XFS_DFORK_DPTR(dip), src, len); - break; - - case XFS_ILOG_DBROOT: - xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, - (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), - XFS_DFORK_DSIZE(dip, mp)); - break; - - default: - /* - * There are no data fork flags set. - */ - ASSERT((fields & XFS_ILOG_DFORK) == 0); - break; - } - - /* - * If we logged any attribute data, recover it. There may or - * may not have been any other non-core data logged in this - * transaction. - */ - if (in_f->ilf_fields & XFS_ILOG_AFORK) { - if (in_f->ilf_fields & XFS_ILOG_DFORK) { - attr_index = 3; - } else { - attr_index = 2; - } - len = item->ri_buf[attr_index].i_len; - src = item->ri_buf[attr_index].i_addr; - ASSERT(len == in_f->ilf_asize); - - switch (in_f->ilf_fields & XFS_ILOG_AFORK) { - case XFS_ILOG_ADATA: - case XFS_ILOG_AEXT: - dest = XFS_DFORK_APTR(dip); - ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); - memcpy(dest, src, len); - break; - - case XFS_ILOG_ABROOT: - dest = XFS_DFORK_APTR(dip); - xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, - len, (xfs_bmdr_block_t*)dest, - XFS_DFORK_ASIZE(dip, mp)); - break; - - default: - xfs_warn(log->l_mp, "%s: Invalid flag", __func__); - ASSERT(0); - error = -EFSCORRUPTED; - goto out_release; - } - } - -out_owner_change: - /* Recover the swapext owner change unless inode has been deleted */ - if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) && - (dip->di_mode != 0)) - error = xfs_recover_inode_owner_change(mp, dip, in_f, - buffer_list); - /* re-generate the checksum. */ - xfs_dinode_calc_crc(log->l_mp, dip); - - ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp, buffer_list); - -out_release: - xfs_buf_relse(bp); -error: - if (need_free) - kmem_free(in_f); - return error; -} - -/* - * Recover QUOTAOFF records. We simply make a note of it in the xlog - * structure, so that we know not to do any dquot item or dquot buffer recovery, - * of that type. - */ -STATIC int -xlog_recover_quotaoff_pass1( - struct xlog *log, - struct xlog_recover_item *item) -{ - xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; - ASSERT(qoff_f); - - /* - * The logitem format's flag tells us if this was user quotaoff, - * group/project quotaoff or both. - */ - if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_USER; - if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_PROJ; - if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_GROUP; - - return 0; -} - -/* - * Recover a dquot record - */ -STATIC int -xlog_recover_dquot_pass2( - struct xlog *log, - struct list_head *buffer_list, - struct xlog_recover_item *item, - xfs_lsn_t current_lsn) -{ - xfs_mount_t *mp = log->l_mp; - xfs_buf_t *bp; - struct xfs_disk_dquot *ddq, *recddq; - xfs_failaddr_t fa; - int error; - xfs_dq_logformat_t *dq_f; - uint type; - - - /* - * Filesystems are required to send in quota flags at mount time. - */ - if (mp->m_qflags == 0) - return 0; - - recddq = item->ri_buf[1].i_addr; - if (recddq == NULL) { - xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); - return -EFSCORRUPTED; - } - if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) { - xfs_alert(log->l_mp, "dquot too small (%d) in %s.", - item->ri_buf[1].i_len, __func__); - return -EFSCORRUPTED; - } - - /* - * This type of quotas was turned off, so ignore this record. 
- */ - type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); - ASSERT(type); - if (log->l_quotaoffs_flag & type) - return 0; - - /* - * At this point we know that quota was _not_ turned off. - * Since the mount flags are not indicating to us otherwise, this - * must mean that quota is on, and the dquot needs to be replayed. - * Remember that we may not have fully recovered the superblock yet, - * so we can't do the usual trick of looking at the SB quota bits. - * - * The other possibility, of course, is that the quota subsystem was - * removed since the last mount - ENOSYS. - */ - dq_f = item->ri_buf[0].i_addr; - ASSERT(dq_f); - fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0); - if (fa) { - xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", - dq_f->qlf_id, fa); - return -EFSCORRUPTED; - } - ASSERT(dq_f->qlf_len == 1); - - /* - * At this point we are assuming that the dquots have been allocated - * and hence the buffer has valid dquots stamped in it. It should, - * therefore, pass verifier validation. If the dquot is bad, then the - * we'll return an error here, so we don't need to specifically check - * the dquot in the buffer after the verifier has run. - */ - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, - &xfs_dquot_buf_ops); - if (error) - return error; - - ASSERT(bp); - ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); - - /* - * If the dquot has an LSN in it, recover the dquot only if it's less - * than the lsn of the transaction we are replaying. - */ - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; - xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); - - if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { - goto out_release; - } - } - - memcpy(ddq, recddq, item->ri_buf[1].i_len); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), - XFS_DQUOT_CRC_OFF); - } - - ASSERT(dq_f->qlf_size == 2); - ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp, buffer_list); - -out_release: - xfs_buf_relse(bp); - return 0; -} - -/* - * This routine is called to create an in-core extent free intent - * item from the efi format structure which was logged on disk. - * It allocates an in-core efi, copies the extents from the format - * structure into it, and adds the efi to the AIL with the given - * LSN. - */ -STATIC int -xlog_recover_efi_pass2( - struct xlog *log, - struct xlog_recover_item *item, - xfs_lsn_t lsn) -{ - int error; - struct xfs_mount *mp = log->l_mp; - struct xfs_efi_log_item *efip; - struct xfs_efi_log_format *efi_formatp; - - efi_formatp = item->ri_buf[0].i_addr; - - efip = xfs_efi_init(mp, efi_formatp->efi_nextents); - error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); - if (error) { - xfs_efi_item_free(efip); - return error; - } - atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); - - spin_lock(&log->l_ailp->ail_lock); - /* - * The EFI has two references. One for the EFD and one for EFI to ensure - * it makes it into the AIL. Insert the EFI into the AIL directly and - * drop the EFI reference. Note that xfs_trans_ail_update() drops the - * AIL lock. - */ - xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); - xfs_efi_release(efip); - return 0; -} - - -/* - * This routine is called when an EFD format structure is found in a committed - * transaction in the log. Its purpose is to cancel the corresponding EFI if it - * was still in the log. 
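
[Editor's note] The dquot path above uses the same LSN-based replay gate as the inode path: on v5 filesystems each on-disk object is stamped with the LSN of the last transaction that touched it, and recovery must not overwrite it with older log contents. A simplified model, with the cycle/block comparison done by the real XFS_LSN_CMP() reduced to a flat integer compare for illustration:

#include <stdbool.h>
#include <stdint.h>

typedef int64_t xfs_lsn_t;

/* Replay only if the object's stamped LSN is strictly older than the
 * transaction being replayed. Zero and -1 mean the object was never
 * stamped, so it must always be replayed. */
static bool
needs_replay(xfs_lsn_t disk_lsn, xfs_lsn_t current_lsn)
{
	if (disk_lsn == 0 || disk_lsn == -1)
		return true;
	return disk_lsn < current_lsn;
}
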
To do this it searches the AIL for the EFI with an id - * equal to that in the EFD format structure. If we find it we drop the EFD - * reference, which removes the EFI from the AIL and frees it. - */ -STATIC int -xlog_recover_efd_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - xfs_efd_log_format_t *efd_formatp; - xfs_efi_log_item_t *efip = NULL; - struct xfs_log_item *lip; - uint64_t efi_id; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp = log->l_ailp; - - efd_formatp = item->ri_buf[0].i_addr; - ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || - (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); - efi_id = efd_formatp->efd_efi_id; - - /* - * Search for the EFI with the id in the EFD format structure in the - * AIL. - */ - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (lip->li_type == XFS_LI_EFI) { - efip = (xfs_efi_log_item_t *)lip; - if (efip->efi_format.efi_id == efi_id) { - /* - * Drop the EFD reference to the EFI. This - * removes the EFI from the AIL and frees it. - */ - spin_unlock(&ailp->ail_lock); - xfs_efi_release(efip); - spin_lock(&ailp->ail_lock); - break; - } - } - lip = xfs_trans_ail_cursor_next(ailp, &cur); - } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); - - return 0; -} - -/* - * This routine is called to create an in-core extent rmap update - * item from the rui format structure which was logged on disk. - * It allocates an in-core rui, copies the extents from the format - * structure into it, and adds the rui to the AIL with the given - * LSN. - */ -STATIC int -xlog_recover_rui_pass2( - struct xlog *log, - struct xlog_recover_item *item, - xfs_lsn_t lsn) -{ - int error; - struct xfs_mount *mp = log->l_mp; - struct xfs_rui_log_item *ruip; - struct xfs_rui_log_format *rui_formatp; - - rui_formatp = item->ri_buf[0].i_addr; - - ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); - error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format); - if (error) { - xfs_rui_item_free(ruip); - return error; - } - atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); - - spin_lock(&log->l_ailp->ail_lock); - /* - * The RUI has two references. One for the RUD and one for RUI to ensure - * it makes it into the AIL. Insert the RUI into the AIL directly and - * drop the RUI reference. Note that xfs_trans_ail_update() drops the - * AIL lock. - */ - xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn); - xfs_rui_release(ruip); - return 0; -} - - -/* - * This routine is called when an RUD format structure is found in a committed - * transaction in the log. Its purpose is to cancel the corresponding RUI if it - * was still in the log. To do this it searches the AIL for the RUI with an id - * equal to that in the RUD format structure. If we find it we drop the RUD - * reference, which removes the RUI from the AIL and frees it. 
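
[Editor's note] The EFD handler above and the RUD/CUD/BUD handlers that follow all share one shape: walk the AIL, find the pending intent whose id matches the done item, and release it. Stripped of the cursor and locking machinery, the search is a plain linear scan; struct intent and find_intent() below are stand-ins for this sketch, not kernel types:

#include <stddef.h>
#include <stdint.h>

struct intent {
	struct intent	*next;	/* next item in the (simplified) AIL */
	int		type;	/* XFS_LI_EFI, XFS_LI_RUI, ... */
	uint64_t	id;	/* id logged in the intent format */
};

/* Find the first intent of the given type whose id matches the done
 * item; the caller then drops the reference, which unpins and frees it. */
static struct intent *
find_intent(struct intent *head, int type, uint64_t id)
{
	struct intent *ip;

	for (ip = head; ip != NULL; ip = ip->next)
		if (ip->type == type && ip->id == id)
			return ip;
	return NULL;
}
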
- */ -STATIC int -xlog_recover_rud_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_rud_log_format *rud_formatp; - struct xfs_rui_log_item *ruip = NULL; - struct xfs_log_item *lip; - uint64_t rui_id; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp = log->l_ailp; - - rud_formatp = item->ri_buf[0].i_addr; - ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format)); - rui_id = rud_formatp->rud_rui_id; - - /* - * Search for the RUI with the id in the RUD format structure in the - * AIL. - */ - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (lip->li_type == XFS_LI_RUI) { - ruip = (struct xfs_rui_log_item *)lip; - if (ruip->rui_format.rui_id == rui_id) { - /* - * Drop the RUD reference to the RUI. This - * removes the RUI from the AIL and frees it. - */ - spin_unlock(&ailp->ail_lock); - xfs_rui_release(ruip); - spin_lock(&ailp->ail_lock); - break; - } - } - lip = xfs_trans_ail_cursor_next(ailp, &cur); - } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); - - return 0; -} - -/* - * Copy an CUI format buffer from the given buf, and into the destination - * CUI format structure. The CUI/CUD items were designed not to need any - * special alignment handling. - */ -static int -xfs_cui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_cui_log_format *dst_cui_fmt) -{ - struct xfs_cui_log_format *src_cui_fmt; - uint len; - - src_cui_fmt = buf->i_addr; - len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); - - if (buf->i_len == len) { - memcpy(dst_cui_fmt, src_cui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; -} - -/* - * This routine is called to create an in-core extent refcount update - * item from the cui format structure which was logged on disk. - * It allocates an in-core cui, copies the extents from the format - * structure into it, and adds the cui to the AIL with the given - * LSN. - */ -STATIC int -xlog_recover_cui_pass2( - struct xlog *log, - struct xlog_recover_item *item, - xfs_lsn_t lsn) -{ - int error; - struct xfs_mount *mp = log->l_mp; - struct xfs_cui_log_item *cuip; - struct xfs_cui_log_format *cui_formatp; - - cui_formatp = item->ri_buf[0].i_addr; - - cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); - error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); - if (error) { - xfs_cui_item_free(cuip); - return error; - } - atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); - - spin_lock(&log->l_ailp->ail_lock); - /* - * The CUI has two references. One for the CUD and one for CUI to ensure - * it makes it into the AIL. Insert the CUI into the AIL directly and - * drop the CUI reference. Note that xfs_trans_ail_update() drops the - * AIL lock. - */ - xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn); - xfs_cui_release(cuip); - return 0; -} - - -/* - * This routine is called when an CUD format structure is found in a committed - * transaction in the log. Its purpose is to cancel the corresponding CUI if it - * was still in the log. To do this it searches the AIL for the CUI with an id - * equal to that in the CUD format structure. If we find it we drop the CUD - * reference, which removes the CUI from the AIL and frees it. 
- */
-STATIC int
-xlog_recover_cud_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_cud_log_format *cud_formatp;
- struct xfs_cui_log_item *cuip = NULL;
- struct xfs_log_item *lip;
- uint64_t cui_id;
- struct xfs_ail_cursor cur;
- struct xfs_ail *ailp = log->l_ailp;
-
- cud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
- return -EFSCORRUPTED;
- }
- cui_id = cud_formatp->cud_cui_id;
-
- /*
- * Search for the CUI with the id in the CUD format structure in the
- * AIL.
- */
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- while (lip != NULL) {
- if (lip->li_type == XFS_LI_CUI) {
- cuip = (struct xfs_cui_log_item *)lip;
- if (cuip->cui_format.cui_id == cui_id) {
- /*
- * Drop the CUD reference to the CUI. This
- * removes the CUI from the AIL and frees it.
- */
- spin_unlock(&ailp->ail_lock);
- xfs_cui_release(cuip);
- spin_lock(&ailp->ail_lock);
- break;
- }
- }
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
- }
-
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
-
- return 0;
-}
-
-/*
- * Copy a BUI format buffer from the given buf, and into the destination
- * BUI format structure. The BUI/BUD items were designed not to need any
- * special alignment handling.
- */
-static int
-xfs_bui_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_bui_log_format *dst_bui_fmt)
-{
- struct xfs_bui_log_format *src_bui_fmt;
- uint len;
-
- src_bui_fmt = buf->i_addr;
- len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
-
- if (buf->i_len == len) {
- memcpy(dst_bui_fmt, src_bui_fmt, len);
- return 0;
- }
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
- return -EFSCORRUPTED;
-}
-
-/*
- * This routine is called to create an in-core extent bmap update
- * item from the bui format structure which was logged on disk.
- * It allocates an in-core bui, copies the extents from the format
- * structure into it, and adds the bui to the AIL with the given
- * LSN.
- */
-STATIC int
-xlog_recover_bui_pass2(
- struct xlog *log,
- struct xlog_recover_item *item,
- xfs_lsn_t lsn)
-{
- int error;
- struct xfs_mount *mp = log->l_mp;
- struct xfs_bui_log_item *buip;
- struct xfs_bui_log_format *bui_formatp;
-
- bui_formatp = item->ri_buf[0].i_addr;
-
- if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
- return -EFSCORRUPTED;
- }
- buip = xfs_bui_init(mp);
- error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
- if (error) {
- xfs_bui_item_free(buip);
- return error;
- }
- atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
-
- spin_lock(&log->l_ailp->ail_lock);
- /*
- * The BUI has two references. One for the BUD and one for BUI to ensure
- * it makes it into the AIL. Insert the BUI into the AIL directly and
- * drop the BUI reference. Note that xfs_trans_ail_update() drops the
- * AIL lock.
- */
- xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn);
- xfs_bui_release(buip);
- return 0;
-}
-
-
-/*
- * This routine is called when a BUD format structure is found in a committed
- * transaction in the log. Its purpose is to cancel the corresponding BUI if it
- * was still in the log. To do this it searches the AIL for the BUI with an id
- * equal to that in the BUD format structure. If we find it we drop the BUD
- * reference, which removes the BUI from the AIL and frees it.
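
[Editor's note] Every one of these *_pass2 constructors performs the same two-reference dance described in the comments: the intent is created with two references, the AIL keeps one, and recovery immediately drops its own. A compressed sketch of the invariant, with struct intent_item as an illustrative stand-in:

#include <assert.h>

struct intent_item {
	int refcount;	/* one for creation, one held by the AIL */
};

/* Insert a recovered intent into the AIL: the item starts with two
 * references, the AIL takes ownership of one, and the creation
 * reference is dropped right away so only the done item (or AIL
 * removal) can free it. */
static void
insert_recovered_intent(struct intent_item *ip)
{
	ip->refcount = 2;		/* creation + done-item reference */
	/* ...insert into the AIL, which now owns one reference... */
	ip->refcount--;			/* drop the creation reference */
	assert(ip->refcount == 1);	/* only the AIL reference remains */
}
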
- */
-STATIC int
-xlog_recover_bud_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_bud_log_format *bud_formatp;
- struct xfs_bui_log_item *buip = NULL;
- struct xfs_log_item *lip;
- uint64_t bui_id;
- struct xfs_ail_cursor cur;
- struct xfs_ail *ailp = log->l_ailp;
-
- bud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
- return -EFSCORRUPTED;
- }
- bui_id = bud_formatp->bud_bui_id;
-
- /*
- * Search for the BUI with the id in the BUD format structure in the
- * AIL.
- */
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- while (lip != NULL) {
- if (lip->li_type == XFS_LI_BUI) {
- buip = (struct xfs_bui_log_item *)lip;
- if (buip->bui_format.bui_id == bui_id) {
- /*
- * Drop the BUD reference to the BUI. This
- * removes the BUI from the AIL and frees it.
- */
- spin_unlock(&ailp->ail_lock);
- xfs_bui_release(buip);
- spin_lock(&ailp->ail_lock);
- break;
- }
- }
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
- }
-
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
-
- return 0;
-}
-
-/*
- * This routine is called when an inode create format structure is found in a
- * committed transaction in the log. Its purpose is to initialise the inodes
- * being allocated on disk. This requires us to get inode cluster buffers that
- * match the range to be initialised, stamped with inode templates and written
- * by delayed write so that subsequent modifications will hit the cached buffer
- * and only need writing out at the end of recovery.
- */
-STATIC int
-xlog_recover_do_icreate_pass2(
- struct xlog *log,
- struct list_head *buffer_list,
- xlog_recover_item_t *item)
-{
- struct xfs_mount *mp = log->l_mp;
- struct xfs_icreate_log *icl;
- struct xfs_ino_geometry *igeo = M_IGEO(mp);
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- unsigned int count;
- unsigned int isize;
- xfs_agblock_t length;
- int bb_per_cluster;
- int cancel_count;
- int nbufs;
- int i;
-
- icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
- if (icl->icl_type != XFS_LI_ICREATE) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
- return -EINVAL;
- }
-
- if (icl->icl_size != 1) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
- return -EINVAL;
- }
-
- agno = be32_to_cpu(icl->icl_ag);
- if (agno >= mp->m_sb.sb_agcount) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
- return -EINVAL;
- }
- agbno = be32_to_cpu(icl->icl_agbno);
- if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
- return -EINVAL;
- }
- isize = be32_to_cpu(icl->icl_isize);
- if (isize != mp->m_sb.sb_inodesize) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
- return -EINVAL;
- }
- count = be32_to_cpu(icl->icl_count);
- if (!count) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
- return -EINVAL;
- }
- length = be32_to_cpu(icl->icl_length);
- if (!length || length >= mp->m_sb.sb_agblocks) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
- return -EINVAL;
- }
-
- /*
- * The inode chunk is either full or sparse and we only support
- * m_ino_geo.ialloc_min_blks sized sparse allocations at this time.
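
[Editor's note] After the per-field checks above, the function cross-checks the logged inode count against the chunk length. A minimal sketch of that arithmetic; icreate_geometry_ok() is a hypothetical helper name:

#include <stdbool.h>
#include <stdint.h>

/* inopblog is log2(inodes per block), so the logged inode count
 * shifted down by it must equal the chunk length in filesystem
 * blocks. With 512-byte inodes in 4096-byte blocks, inopblog is 3
 * and a 64-inode chunk must cover exactly 8 blocks. */
static bool
icreate_geometry_ok(uint32_t count, uint32_t length, uint8_t inopblog)
{
	return (count >> inopblog) == length;
}
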
- */ - if (length != igeo->ialloc_blks && - length != igeo->ialloc_min_blks) { - xfs_warn(log->l_mp, - "%s: unsupported chunk length", __FUNCTION__); - return -EINVAL; - } - - /* verify inode count is consistent with extent length */ - if ((count >> mp->m_sb.sb_inopblog) != length) { - xfs_warn(log->l_mp, - "%s: inconsistent inode count and chunk length", - __FUNCTION__); - return -EINVAL; - } - - /* - * The icreate transaction can cover multiple cluster buffers and these - * buffers could have been freed and reused. Check the individual - * buffers for cancellation so we don't overwrite anything written after - * a cancellation. - */ - bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); - nbufs = length / igeo->blocks_per_cluster; - for (i = 0, cancel_count = 0; i < nbufs; i++) { - xfs_daddr_t daddr; - - daddr = XFS_AGB_TO_DADDR(mp, agno, - agbno + i * igeo->blocks_per_cluster); - if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0)) - cancel_count++; - } - - /* - * We currently only use icreate for a single allocation at a time. This - * means we should expect either all or none of the buffers to be - * cancelled. Be conservative and skip replay if at least one buffer is - * cancelled, but warn the user that something is awry if the buffers - * are not consistent. - * - * XXX: This must be refined to only skip cancelled clusters once we use - * icreate for multiple chunk allocations. - */ - ASSERT(!cancel_count || cancel_count == nbufs); - if (cancel_count) { - if (cancel_count != nbufs) - xfs_warn(mp, - "WARNING: partial inode chunk cancellation, skipped icreate."); - trace_xfs_log_recover_icreate_cancel(log, icl); - return 0; - } - - trace_xfs_log_recover_icreate_recover(log, icl); - return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, - length, be32_to_cpu(icl->icl_gen)); -} - -STATIC void -xlog_recover_buffer_ra_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; - struct xfs_mount *mp = log->l_mp; - - if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno, - buf_f->blf_len, buf_f->blf_flags)) { - return; - } - - xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno, - buf_f->blf_len, NULL); -} - -STATIC void -xlog_recover_inode_ra_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_inode_log_format ilf_buf; - struct xfs_inode_log_format *ilfp; - struct xfs_mount *mp = log->l_mp; - int error; - - if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { - ilfp = item->ri_buf[0].i_addr; - } else { - ilfp = &ilf_buf; - memset(ilfp, 0, sizeof(*ilfp)); - error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp); - if (error) - return; - } - - if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0)) - return; - - xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno, - ilfp->ilf_len, &xfs_inode_buf_ra_ops); -} - -STATIC void -xlog_recover_dquot_ra_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_mount *mp = log->l_mp; - struct xfs_disk_dquot *recddq; - struct xfs_dq_logformat *dq_f; - uint type; - int len; - - - if (mp->m_qflags == 0) - return; - - recddq = item->ri_buf[1].i_addr; - if (recddq == NULL) - return; - if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) - return; - - type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); - ASSERT(type); - if (log->l_quotaoffs_flag & type) - return; - - dq_f = item->ri_buf[0].i_addr; - ASSERT(dq_f); - ASSERT(dq_f->qlf_len == 1); - - len = 
XFS_FSB_TO_BB(mp, dq_f->qlf_len); - if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0)) - return; - - xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len, - &xfs_dquot_buf_ra_ops); -} - -STATIC void -xlog_recover_ra_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - xlog_recover_buffer_ra_pass2(log, item); - break; - case XFS_LI_INODE: - xlog_recover_inode_ra_pass2(log, item); - break; - case XFS_LI_DQUOT: - xlog_recover_dquot_ra_pass2(log, item); - break; - case XFS_LI_EFI: - case XFS_LI_EFD: - case XFS_LI_QUOTAOFF: - case XFS_LI_RUI: - case XFS_LI_RUD: - case XFS_LI_CUI: - case XFS_LI_CUD: - case XFS_LI_BUI: - case XFS_LI_BUD: - default: - break; - } -} - -STATIC int -xlog_recover_commit_pass1( - struct xlog *log, - struct xlog_recover *trans, - struct xlog_recover_item *item) -{ - trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); - - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - return xlog_recover_buffer_pass1(log, item); - case XFS_LI_QUOTAOFF: - return xlog_recover_quotaoff_pass1(log, item); - case XFS_LI_INODE: - case XFS_LI_EFI: - case XFS_LI_EFD: - case XFS_LI_DQUOT: - case XFS_LI_ICREATE: - case XFS_LI_RUI: - case XFS_LI_RUD: - case XFS_LI_CUI: - case XFS_LI_CUD: - case XFS_LI_BUI: - case XFS_LI_BUD: - /* nothing to do in pass 1 */ - return 0; - default: - xfs_warn(log->l_mp, "%s: invalid item type (%d)", - __func__, ITEM_TYPE(item)); - ASSERT(0); - return -EFSCORRUPTED; - } -} - -STATIC int -xlog_recover_commit_pass2( - struct xlog *log, - struct xlog_recover *trans, - struct list_head *buffer_list, - struct xlog_recover_item *item) -{ - trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); - - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - return xlog_recover_buffer_pass2(log, buffer_list, item, - trans->r_lsn); - case XFS_LI_INODE: - return xlog_recover_inode_pass2(log, buffer_list, item, - trans->r_lsn); - case XFS_LI_EFI: - return xlog_recover_efi_pass2(log, item, trans->r_lsn); - case XFS_LI_EFD: - return xlog_recover_efd_pass2(log, item); - case XFS_LI_RUI: - return xlog_recover_rui_pass2(log, item, trans->r_lsn); - case XFS_LI_RUD: - return xlog_recover_rud_pass2(log, item); - case XFS_LI_CUI: - return xlog_recover_cui_pass2(log, item, trans->r_lsn); - case XFS_LI_CUD: - return xlog_recover_cud_pass2(log, item); - case XFS_LI_BUI: - return xlog_recover_bui_pass2(log, item, trans->r_lsn); - case XFS_LI_BUD: - return xlog_recover_bud_pass2(log, item); - case XFS_LI_DQUOT: - return xlog_recover_dquot_pass2(log, buffer_list, item, - trans->r_lsn); - case XFS_LI_ICREATE: - return xlog_recover_do_icreate_pass2(log, buffer_list, item); - case XFS_LI_QUOTAOFF: - /* nothing to do in pass2 */ - return 0; - default: - xfs_warn(log->l_mp, "%s: invalid item type (%d)", - __func__, ITEM_TYPE(item)); - ASSERT(0); - return -EFSCORRUPTED; - } + if (!xlog_is_buffer_cancelled(log, blkno, len)) + xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops); } STATIC int @@ -4072,8 +1966,12 @@ xlog_recover_items_pass2( int error = 0; list_for_each_entry(item, item_list, ri_list) { - error = xlog_recover_commit_pass2(log, trans, - buffer_list, item); + trace_xfs_log_recover_item_recover(log, trans, item, + XLOG_RECOVER_PASS2); + + if (item->ri_ops->commit_pass2) + error = item->ri_ops->commit_pass2(log, buffer_list, + item, trans->r_lsn); if (error) return error; } @@ -4110,12 +2008,16 @@ xlog_recover_commit_trans( return error; list_for_each_entry_safe(item, next, 
&trans->r_itemq, ri_list) { + trace_xfs_log_recover_item_recover(log, trans, item, pass); + switch (pass) { case XLOG_RECOVER_PASS1: - error = xlog_recover_commit_pass1(log, trans, item); + if (item->ri_ops->commit_pass1) + error = item->ri_ops->commit_pass1(log, item); break; case XLOG_RECOVER_PASS2: - xlog_recover_ra_pass2(log, item); + if (item->ri_ops->ra_pass2) + item->ri_ops->ra_pass2(log, item); list_move_tail(&item->ri_list, &ra_list); items_queued++; if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { @@ -4152,9 +2054,9 @@ STATIC void xlog_recover_add_item( struct list_head *head) { - xlog_recover_item_t *item; + struct xlog_recover_item *item; - item = kmem_zalloc(sizeof(xlog_recover_item_t), 0); + item = kmem_zalloc(sizeof(struct xlog_recover_item), 0); INIT_LIST_HEAD(&item->ri_list); list_add_tail(&item->ri_list, head); } @@ -4166,7 +2068,7 @@ xlog_recover_add_to_cont_trans( char *dp, int len) { - xlog_recover_item_t *item; + struct xlog_recover_item *item; char *ptr, *old_ptr; int old_len; @@ -4189,7 +2091,8 @@ xlog_recover_add_to_cont_trans( } /* take the tail entry */ - item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + item = list_entry(trans->r_itemq.prev, struct xlog_recover_item, + ri_list); old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; @@ -4223,7 +2126,7 @@ xlog_recover_add_to_trans( int len) { struct xfs_inode_log_format *in_f; /* any will do */ - xlog_recover_item_t *item; + struct xlog_recover_item *item; char *ptr; if (!len) @@ -4259,13 +2162,14 @@ xlog_recover_add_to_trans( in_f = (struct xfs_inode_log_format *)ptr; /* take the tail entry */ - item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + item = list_entry(trans->r_itemq.prev, struct xlog_recover_item, + ri_list); if (item->ri_total != 0 && item->ri_total == item->ri_cnt) { /* tail item is in use, get a new one */ xlog_recover_add_item(&trans->r_itemq); item = list_entry(trans->r_itemq.prev, - xlog_recover_item_t, ri_list); + struct xlog_recover_item, ri_list); } if (item->ri_total == 0) { /* first region to be added */ @@ -4311,7 +2215,7 @@ STATIC void xlog_recover_free_trans( struct xlog_recover *trans) { - xlog_recover_item_t *item, *n; + struct xlog_recover_item *item, *n; int i; hlist_del_init(&trans->r_list); @@ -4563,180 +2467,6 @@ xlog_recover_process_data( return 0; } -/* Recover the EFI if necessary. */ -STATIC int -xlog_recover_process_efi( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_efi_log_item *efip; - int error; - - /* - * Skip EFIs that we've already processed. - */ - efip = container_of(lip, struct xfs_efi_log_item, efi_item); - if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) - return 0; - - spin_unlock(&ailp->ail_lock); - error = xfs_efi_recover(mp, efip); - spin_lock(&ailp->ail_lock); - - return error; -} - -/* Release the EFI since we're cancelling everything. */ -STATIC void -xlog_recover_cancel_efi( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_efi_log_item *efip; - - efip = container_of(lip, struct xfs_efi_log_item, efi_item); - - spin_unlock(&ailp->ail_lock); - xfs_efi_release(efip); - spin_lock(&ailp->ail_lock); -} - -/* Recover the RUI if necessary. */ -STATIC int -xlog_recover_process_rui( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_rui_log_item *ruip; - int error; - - /* - * Skip RUIs that we've already processed. 
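
[Editor's note] The reworked commit loop above batches up to XLOG_RECOVER_COMMIT_QUEUE_MAX items: readahead is issued for the whole window via ri_ops->ra_pass2 before any item in it is recovered, so pass-2 work hits warm buffers. A compressed userspace model of that control flow; the item type and the two helpers are assumptions for this sketch, not kernel APIs:

#include <stddef.h>

#define QUEUE_MAX 100	/* stands in for XLOG_RECOVER_COMMIT_QUEUE_MAX */

struct item { int id; };

void start_readahead(struct item *it);	/* models ri_ops->ra_pass2 */
int recover_item(struct item *it);	/* models ri_ops->commit_pass2 */

/* Kick off readahead for a window of items, then recover the whole
 * window before moving on to the next one. */
static int
recover_batched(struct item *items, size_t n)
{
	size_t done = 0, i;

	while (done < n) {
		size_t end = done + QUEUE_MAX < n ? done + QUEUE_MAX : n;

		for (i = done; i < end; i++)
			start_readahead(&items[i]);
		for (i = done; i < end; i++) {
			int error = recover_item(&items[i]);
			if (error)
				return error;
		}
		done = end;
	}
	return 0;
}
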
- */ - ruip = container_of(lip, struct xfs_rui_log_item, rui_item); - if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags)) - return 0; - - spin_unlock(&ailp->ail_lock); - error = xfs_rui_recover(mp, ruip); - spin_lock(&ailp->ail_lock); - - return error; -} - -/* Release the RUI since we're cancelling everything. */ -STATIC void -xlog_recover_cancel_rui( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_rui_log_item *ruip; - - ruip = container_of(lip, struct xfs_rui_log_item, rui_item); - - spin_unlock(&ailp->ail_lock); - xfs_rui_release(ruip); - spin_lock(&ailp->ail_lock); -} - -/* Recover the CUI if necessary. */ -STATIC int -xlog_recover_process_cui( - struct xfs_trans *parent_tp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_cui_log_item *cuip; - int error; - - /* - * Skip CUIs that we've already processed. - */ - cuip = container_of(lip, struct xfs_cui_log_item, cui_item); - if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)) - return 0; - - spin_unlock(&ailp->ail_lock); - error = xfs_cui_recover(parent_tp, cuip); - spin_lock(&ailp->ail_lock); - - return error; -} - -/* Release the CUI since we're cancelling everything. */ -STATIC void -xlog_recover_cancel_cui( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_cui_log_item *cuip; - - cuip = container_of(lip, struct xfs_cui_log_item, cui_item); - - spin_unlock(&ailp->ail_lock); - xfs_cui_release(cuip); - spin_lock(&ailp->ail_lock); -} - -/* Recover the BUI if necessary. */ -STATIC int -xlog_recover_process_bui( - struct xfs_trans *parent_tp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_bui_log_item *buip; - int error; - - /* - * Skip BUIs that we've already processed. - */ - buip = container_of(lip, struct xfs_bui_log_item, bui_item); - if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)) - return 0; - - spin_unlock(&ailp->ail_lock); - error = xfs_bui_recover(parent_tp, buip); - spin_lock(&ailp->ail_lock); - - return error; -} - -/* Release the BUI since we're cancelling everything. */ -STATIC void -xlog_recover_cancel_bui( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_bui_log_item *buip; - - buip = container_of(lip, struct xfs_bui_log_item, bui_item); - - spin_unlock(&ailp->ail_lock); - xfs_bui_release(buip); - spin_lock(&ailp->ail_lock); -} - -/* Is this log item a deferred action intent? */ -static inline bool xlog_item_is_intent(struct xfs_log_item *lip) -{ - switch (lip->li_type) { - case XFS_LI_EFI: - case XFS_LI_RUI: - case XFS_LI_CUI: - case XFS_LI_BUI: - return true; - default: - return false; - } -} - /* Take all the collected deferred ops and finish them in order. */ static int xlog_finish_defer_ops( @@ -4771,6 +2501,13 @@ xlog_finish_defer_ops( return xfs_trans_commit(tp); } +/* Is this log item a deferred action intent? */ +static inline bool xlog_item_is_intent(struct xfs_log_item *lip) +{ + return lip->li_ops->iop_recover != NULL && + lip->li_ops->iop_match != NULL; +} + /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. 
What we do now @@ -4841,23 +2578,14 @@ xlog_recover_process_intents( /* * NOTE: If your intent processing routine can create more - * deferred ops, you /must/ attach them to the dfops in this - * routine or else those subsequent intents will get + * deferred ops, you /must/ attach them to the transaction in + * this routine or else those subsequent intents will get * replayed in the wrong order! */ - switch (lip->li_type) { - case XFS_LI_EFI: - error = xlog_recover_process_efi(log->l_mp, ailp, lip); - break; - case XFS_LI_RUI: - error = xlog_recover_process_rui(log->l_mp, ailp, lip); - break; - case XFS_LI_CUI: - error = xlog_recover_process_cui(parent_tp, ailp, lip); - break; - case XFS_LI_BUI: - error = xlog_recover_process_bui(parent_tp, ailp, lip); - break; + if (!test_and_set_bit(XFS_LI_RECOVERED, &lip->li_flags)) { + spin_unlock(&ailp->ail_lock); + error = lip->li_ops->iop_recover(lip, parent_tp); + spin_lock(&ailp->ail_lock); } if (error) goto out; @@ -4901,21 +2629,9 @@ xlog_recover_cancel_intents( break; } - switch (lip->li_type) { - case XFS_LI_EFI: - xlog_recover_cancel_efi(log->l_mp, ailp, lip); - break; - case XFS_LI_RUI: - xlog_recover_cancel_rui(log->l_mp, ailp, lip); - break; - case XFS_LI_CUI: - xlog_recover_cancel_cui(log->l_mp, ailp, lip); - break; - case XFS_LI_BUI: - xlog_recover_cancel_bui(log->l_mp, ailp, lip); - break; - } - + spin_unlock(&ailp->ail_lock); + lip->li_ops->iop_release(lip); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_next(ailp, &cur); } @@ -4947,7 +2663,7 @@ xlog_recover_clear_agi_bucket( if (error) goto out_abort; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); @@ -4987,7 +2703,7 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0); if (error) goto fail_iput; @@ -5083,7 +2799,7 @@ xlog_recover_process_iunlinks( * buffer reference though, so that it stays pinned in memory * while we need the buffer. 
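
[Editor's note] The large per-type switch statements deleted above are replaced by the ops-vtable dispatch visible in the new hunks: an item is an intent if its li_ops provide iop_recover and iop_match, and the XFS_LI_RECOVERED flag keeps it from being replayed twice. A simplified model of that pattern, with the structs reduced to the essentials:

#include <stdbool.h>

struct log_item;

struct item_ops {
	int  (*iop_recover)(struct log_item *lip);
	void (*iop_release)(struct log_item *lip);
};

struct log_item {
	const struct item_ops	*ops;
	bool			recovered;	/* models XFS_LI_RECOVERED */
};

/* Recover one AIL item: non-intents and already-recovered intents are
 * skipped, everything else is dispatched through its ops vector. */
static int
process_intent(struct log_item *lip)
{
	if (!lip->ops->iop_recover)
		return 0;		/* not an intent item */
	if (lip->recovered)
		return 0;		/* already processed */
	lip->recovered = true;
	return lip->ops->iop_recover(lip);
}
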
*/ - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; xfs_buf_unlock(agibp); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { @@ -5636,7 +3352,7 @@ xlog_do_recover( /* Convert superblock from on-disk format */ sbp = &mp->m_sb; - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, bp->b_addr); xfs_buf_relse(bp); /* re-initialise in-core superblock and geometry structures */ @@ -5809,7 +3525,6 @@ xlog_recover_check_summary( struct xlog *log) { xfs_mount_t *mp; - xfs_agf_t *agfp; xfs_buf_t *agfbp; xfs_buf_t *agibp; xfs_agnumber_t agno; @@ -5829,7 +3544,8 @@ xlog_recover_check_summary( xfs_alert(mp, "%s agf read failed agno %d error %d", __func__, agno, error); } else { - agfp = XFS_BUF_TO_AGF(agfbp); + struct xfs_agf *agfp = agfbp->b_addr; + freeblks += be32_to_cpu(agfp->agf_freeblks) + be32_to_cpu(agfp->agf_flcount); xfs_buf_relse(agfbp); @@ -5840,7 +3556,7 @@ xlog_recover_check_summary( xfs_alert(mp, "%s agi read failed agno %d error %d", __func__, agno, error); } else { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + struct xfs_agi *agi = agibp->b_addr; itotal += be32_to_cpu(agi->agi_count); ifree += be32_to_cpu(agi->agi_freecount); diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index e0f9d3b6abe9..bc66d95c8d4c 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -117,3 +117,25 @@ xfs_hex_dump(const void *p, int length) { print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); } + +void +xfs_buf_alert_ratelimited( + struct xfs_buf *bp, + const char *rlmsg, + const char *fmt, + ...) +{ + struct xfs_mount *mp = bp->b_mount; + struct va_format vaf; + va_list args; + + /* use the more aggressive per-target rate limit for buffers */ + if (!___ratelimit(&bp->b_target->bt_ioerror_rl, rlmsg)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + __xfs_printk(KERN_ALERT, mp, &vaf); + va_end(args); +} diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 0b05e10995a0..4d9bd6bb63ca 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -31,15 +31,27 @@ void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) } #endif -#define xfs_printk_ratelimited(func, dev, fmt, ...) \ +#define xfs_printk_ratelimited(func, dev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ if (__ratelimit(&_rs)) \ - func(dev, fmt, ##__VA_ARGS__); \ + func(dev, fmt, ##__VA_ARGS__); \ } while (0) +#define xfs_printk_once(func, dev, fmt, ...) \ +({ \ + static bool __section(.data.once) __print_once; \ + bool __ret_print_once = !__print_once; \ + \ + if (!__print_once) { \ + __print_once = true; \ + func(dev, fmt, ##__VA_ARGS__); \ + } \ + unlikely(__ret_print_once); \ +}) + #define xfs_emerg_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__) #define xfs_alert_ratelimited(dev, fmt, ...) \ @@ -57,9 +69,17 @@ do { \ #define xfs_debug_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_once(dev, fmt, ...) \ + xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__) +#define xfs_notice_once(dev, fmt, ...) 
\ + xfs_printk_once(xfs_notice, dev, fmt, ##__VA_ARGS__) + void assfail(struct xfs_mount *mp, char *expr, char *f, int l); void asswarn(struct xfs_mount *mp, char *expr, char *f, int l); extern void xfs_hex_dump(const void *p, int length); +void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg, + const char *fmt, ...); + #endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 56efe140c923..c8ae49a1e99c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -148,7 +148,6 @@ xfs_free_perag( ASSERT(atomic_read(&pag->pag_ref) == 0); xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); - mutex_destroy(&pag->pag_ici_reclaim_lock); call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -200,7 +199,6 @@ xfs_initialize_perag( pag->pag_agno = index; pag->pag_mount = mp; spin_lock_init(&pag->pag_ici_lock); - mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); if (xfs_buf_hash_init(pag)) goto out_free_pag; @@ -242,7 +240,6 @@ xfs_initialize_perag( out_hash_destroy: xfs_buf_hash_destroy(pag); out_free_pag: - mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ @@ -252,7 +249,6 @@ out_unwind_new_pags: break; xfs_buf_hash_destroy(pag); xfs_iunlink_destroy(pag); - mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); } return error; @@ -310,7 +306,7 @@ reread: /* * Initialize the mount structure from the superblock. */ - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, bp->b_addr); /* * If we haven't validated the superblock, do so now before we try @@ -1015,7 +1011,7 @@ xfs_mountfs( * quota inodes. */ cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp, SYNC_WAIT); + xfs_reclaim_inodes(mp); xfs_health_unmount(mp); out_log_dealloc: mp->m_flags |= XFS_MOUNT_UNMOUNTING; @@ -1092,13 +1088,12 @@ xfs_unmountfs( xfs_ail_push_all_sync(mp->m_ail); /* - * And reclaim all inodes. At this point there should be no dirty - * inodes and none should be pinned or locked, but use synchronous - * reclaim just to be sure. We can stop background inode reclaim - * here as well if it is still running. + * Reclaim all inodes. At this point there should be no dirty inodes and + * none should be pinned or locked. Stop background inode reclaim here + * if it is still running. */ cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp, SYNC_WAIT); + xfs_reclaim_inodes(mp); xfs_health_unmount(mp); xfs_qm_unmount(mp); @@ -1190,39 +1185,6 @@ xfs_log_sbcount(xfs_mount_t *mp) } /* - * Deltas for the inode count are +/-64, hence we use a large batch size - * of 128 so we don't need to take the counter lock on every update. - */ -#define XFS_ICOUNT_BATCH 128 -int -xfs_mod_icount( - struct xfs_mount *mp, - int64_t delta) -{ - percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH); - if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) { - ASSERT(0); - percpu_counter_add(&mp->m_icount, -delta); - return -EINVAL; - } - return 0; -} - -int -xfs_mod_ifree( - struct xfs_mount *mp, - int64_t delta) -{ - percpu_counter_add(&mp->m_ifree, delta); - if (percpu_counter_compare(&mp->m_ifree, 0) < 0) { - ASSERT(0); - percpu_counter_add(&mp->m_ifree, -delta); - return -EINVAL; - } - return 0; -} - -/* * Deltas for the block count can vary from 1 to very large, but lock contention * only occurs on frequent small block count updates such as in the delayed * allocation path for buffered writes (page a time updates). 
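
[Editor's note] The removed xfs_mod_icount()/xfs_mod_ifree() helpers wrapped a percpu counter with an underflow check: apply the delta with a large batch size so frequent small updates stay CPU-local, then roll back and fail if the counter would have gone negative. A userspace model with the percpu counter flattened to a plain int64_t:

#include <errno.h>
#include <stdint.h>

static int64_t icount;	/* stand-in for the percpu m_icount counter */

/* Apply the delta, then undo it and fail if the count would have gone
 * negative. The real code uses percpu_counter_add_batch() with a batch
 * of 128 so the usual +/-64 deltas rarely touch the shared total. */
static int
mod_icount(int64_t delta)
{
	icount += delta;
	if (icount < 0) {
		icount -= delta;	/* roll back the bad update */
		return -EINVAL;
	}
	return 0;
}
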
Hence we set @@ -1300,10 +1262,9 @@ xfs_mod_fdblocks( spin_unlock(&mp->m_sb_lock); return 0; } - printk_once(KERN_WARNING - "Filesystem \"%s\": reserve blocks depleted! " - "Consider increasing reserve pool size.", - mp->m_super->s_id); + xfs_warn_once(mp, +"Reserve blocks depleted! Consider increasing reserve pool size."); + fdblocks_enospc: spin_unlock(&mp->m_sb_lock); return -ENOSPC; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 88ab09ed29e7..a72cfcaa4ad1 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -55,61 +55,25 @@ struct xfs_error_cfg { long retry_timeout; /* in jiffies, -1 = infinite */ }; +/* + * The struct xfsmount layout is optimised to separate read-mostly variables + * from variables that are frequently modified. We put the read-mostly variables + * first, then place all the other variables at the end. + * + * Typically, read-mostly variables are those that are set at mount time and + * never changed again, or only change rarely as a result of things like sysfs + * knobs being tweaked. + */ typedef struct xfs_mount { + struct xfs_sb m_sb; /* copy of fs superblock */ struct super_block *m_super; - - /* - * Bitsets of per-fs metadata that have been checked and/or are sick. - * Callers must hold m_sb_lock to access these two fields. - */ - uint8_t m_fs_checked; - uint8_t m_fs_sick; - /* - * Bitsets of rt metadata that have been checked and/or are sick. - * Callers must hold m_sb_lock to access this field. - */ - uint8_t m_rt_checked; - uint8_t m_rt_sick; - struct xfs_ail *m_ail; /* fs active log item list */ - - struct xfs_sb m_sb; /* copy of fs superblock */ - spinlock_t m_sb_lock; /* sb counter lock */ - struct percpu_counter m_icount; /* allocated inodes counter */ - struct percpu_counter m_ifree; /* free inodes counter */ - struct percpu_counter m_fdblocks; /* free block counter */ - /* - * Count of data device blocks reserved for delayed allocations, - * including indlen blocks. Does not include allocated CoW staging - * extents or anything related to the rt device. - */ - struct percpu_counter m_delalloc_blks; - struct xfs_buf *m_sb_bp; /* buffer for superblock */ char *m_rtname; /* realtime device name */ char *m_logname; /* external log device name */ - int m_bsize; /* fs logical block size */ - xfs_agnumber_t m_agfrotor; /* last ag where space found */ - xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ - spinlock_t m_agirotor_lock;/* .. and lock protecting it */ - xfs_agnumber_t m_maxagi; /* highest inode alloc group */ - uint m_allocsize_log;/* min write size log bytes */ - uint m_allocsize_blocks; /* min write size blocks */ struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ struct xlog *m_log; /* log specific stuff */ - struct xfs_ino_geometry m_ino_geo; /* inode geometry */ - int m_logbufs; /* number of log buffers */ - int m_logbsize; /* size of each log buffer */ - uint m_rsumlevels; /* rt summary levels */ - uint m_rsumsize; /* size of rt summary, bytes */ - /* - * Optional cache of rt summary level per bitmap block with the - * invariant that m_rsum_cache[bbno] <= the minimum i for which - * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip - * inode lock. 
- */
- uint8_t *m_rsum_cache;
 struct xfs_inode *m_rbmip; /* pointer to bitmap inode */
 struct xfs_inode *m_rsumip; /* pointer to summary inode */
 struct xfs_inode *m_rootip; /* pointer to root directory */
@@ -117,9 +81,26 @@ typedef struct xfs_mount {
 xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
 xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
 xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
+ /*
+ * Optional cache of rt summary level per bitmap block with the
+ * invariant that m_rsum_cache[bbno] <= the minimum i for which
+ * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
+ * inode lock.
+ */
+ uint8_t *m_rsum_cache;
+ struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
+ struct workqueue_struct *m_buf_workqueue;
+ struct workqueue_struct *m_unwritten_workqueue;
+ struct workqueue_struct *m_cil_workqueue;
+ struct workqueue_struct *m_reclaim_workqueue;
+ struct workqueue_struct *m_eofblocks_workqueue;
+ struct workqueue_struct *m_sync_workqueue;
+
+ int m_bsize; /* fs logical block size */
 uint8_t m_blkbit_log; /* blocklog + NBBY */
 uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
 uint8_t m_agno_log; /* log #ag's */
+ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
 uint m_blockmask; /* sb_blocksize-1 */
 uint m_blockwsize; /* sb_blocksize in words */
 uint m_blockwmask; /* blockwsize-1 */
@@ -138,42 +119,82 @@ typedef struct xfs_mount {
 xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
 uint m_alloc_set_aside; /* space we can't use */
 uint m_ag_max_usable; /* max space per AG */
- struct radix_tree_root m_perag_tree; /* per-ag accounting info */
- spinlock_t m_perag_lock; /* lock for m_perag_tree */
- struct mutex m_growlock; /* growfs mutex */
+ int m_dalign; /* stripe unit */
+ int m_swidth; /* stripe width */
+ xfs_agnumber_t m_maxagi; /* highest inode alloc group */
+ uint m_allocsize_log;/* min write size log bytes */
+ uint m_allocsize_blocks; /* min write size blocks */
+ int m_logbufs; /* number of log buffers */
+ int m_logbsize; /* size of each log buffer */
+ uint m_rsumlevels; /* rt summary levels */
+ uint m_rsumsize; /* size of rt summary, bytes */
 int m_fixedfsid[2]; /* unchanged for life of FS */
- uint64_t m_flags; /* global mount flags */
- bool m_finobt_nores; /* no per-AG finobt resv. */
 uint m_qflags; /* quota status flags */
+ uint64_t m_flags; /* global mount flags */
+ int64_t m_low_space[XFS_LOWSP_MAX];
+ struct xfs_ino_geometry m_ino_geo; /* inode geometry */
 struct xfs_trans_resv m_resv; /* precomputed res values */
+ /* low free space thresholds */
+ bool m_always_cow;
+ bool m_fail_unmount;
+ bool m_finobt_nores; /* no per-AG finobt resv. */
+ bool m_update_sb; /* sb needs update in mount */
+
+ /*
+ * Bitsets of per-fs metadata that have been checked and/or are sick.
+ * Callers must hold m_sb_lock to access these two fields.
+ */
+ uint8_t m_fs_checked;
+ uint8_t m_fs_sick;
+ /*
+ * Bitsets of rt metadata that have been checked and/or are sick.
+ * Callers must hold m_sb_lock to access this field.
+ */
+ uint8_t m_rt_checked;
+ uint8_t m_rt_sick;
+
+ /*
+ * End of read-mostly variables. Frequently written variables and locks
+ * should be placed below this comment from now on. The first variable
+ * here is marked as cacheline aligned so that it is separated from
+ * the read-mostly variables.
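
[Editor's note] The layout rule this reorganisation enforces is worth spelling out: keep read-mostly fields together and start the frequently written fields on their own cache line, so writers do not invalidate the cache lines that readers depend on. A minimal illustration; the alignment macro mirrors the kernel's ____cacheline_aligned and the field names are invented for the example:

#include <stdint.h>

#define CACHELINE_ALIGNED __attribute__((aligned(64)))

struct example_mount {
	/* read-mostly: set at mount time, rarely changed */
	uint32_t	block_size;
	uint64_t	flags;

	/* frequently written: forced onto a new cache line so updates
	 * here never invalidate the read-mostly line above */
	CACHELINE_ALIGNED int64_t free_blocks;
	int64_t		reserved_blocks;
};
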
+ */ + + spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */ + struct percpu_counter m_icount; /* allocated inodes counter */ + struct percpu_counter m_ifree; /* free inodes counter */ + struct percpu_counter m_fdblocks; /* free block counter */ + /* + * Count of data device blocks reserved for delayed allocations, + * including indlen blocks. Does not include allocated CoW staging + * extents or anything related to the rt device. + */ + struct percpu_counter m_delalloc_blks; + + struct radix_tree_root m_perag_tree; /* per-ag accounting info */ + spinlock_t m_perag_lock; /* lock for m_perag_tree */ uint64_t m_resblks; /* total reserved blocks */ uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ - int m_dalign; /* stripe unit */ - int m_swidth; /* stripe width */ - uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ - atomic_t m_active_trans; /* number trans frozen */ - struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ struct delayed_work m_eofblocks_work; /* background eof blocks trimming */ struct delayed_work m_cowblocks_work; /* background cow blocks trimming */ - bool m_update_sb; /* sb needs update in mount */ - int64_t m_low_space[XFS_LOWSP_MAX]; - /* low free space thresholds */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ + xfs_agnumber_t m_agfrotor; /* last ag where space found */ + xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ + spinlock_t m_agirotor_lock;/* .. and lock protecting it */ - struct workqueue_struct *m_buf_workqueue; - struct workqueue_struct *m_unwritten_workqueue; - struct workqueue_struct *m_cil_workqueue; - struct workqueue_struct *m_reclaim_workqueue; - struct workqueue_struct *m_eofblocks_workqueue; - struct workqueue_struct *m_sync_workqueue; + /* + * Workqueue item so that we can coalesce multiple inode flush attempts + * into a single flush. + */ + struct work_struct m_flush_inodes_work; /* * Generation of the filesysyem layout. This is incremented by each @@ -185,9 +206,8 @@ typedef struct xfs_mount { * to various other kinds of pain inflicted on the pNFS server. */ uint32_t m_generation; + struct mutex m_growlock; /* growfs mutex */ - bool m_always_cow; - bool m_fail_unmount; #ifdef DEBUG /* * Frequency with which errors are injected. Replaces xfs_etest; the @@ -232,8 +252,8 @@ typedef struct xfs_mount { #define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams allocator */ #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ - -#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! 
*/ +#define XFS_MOUNT_DAX_ALWAYS (1ULL << 26) +#define XFS_MOUNT_DAX_NEVER (1ULL << 27) /* * Max and min values for mount-option defined I/O @@ -254,8 +274,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, #define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ #define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ #define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ -#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ -#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ /* * Flags for xfs_mountfs @@ -336,7 +354,6 @@ typedef struct xfs_perag { spinlock_t pag_ici_lock; /* incore inode cache lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ int pag_ici_reclaimable; /* reclaimable inodes */ - struct mutex pag_ici_reclaim_lock; /* serialisation point */ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ /* buffer cache index */ @@ -389,8 +406,6 @@ extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, xfs_agnumber_t *maxagi); extern void xfs_unmountfs(xfs_mount_t *); -extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta); -extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta); extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index bb3008d390aa..b101feb2aab4 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -58,9 +58,8 @@ xfs_fs_get_uuid( { struct xfs_mount *mp = XFS_M(sb); - printk_once(KERN_NOTICE -"XFS (%s): using experimental pNFS feature, use at your own risk!\n", - mp->m_super->s_id); + xfs_notice_once(mp, +"Using experimental pNFS feature, use at your own risk!"); if (*len < sizeof(uuid_t)) return -EINVAL; diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c index 4bcc3e61056c..b03333f1c84a 100644 --- a/fs/xfs/xfs_pwork.c +++ b/fs/xfs/xfs_pwork.c @@ -132,5 +132,5 @@ xfs_pwork_guess_datadev_parallelism( * For now we'll go with the most conservative setting possible, * which is two threads for an SSD and 1 thread everywhere else. */ - return blk_queue_nonrot(btp->bt_bdev->bd_queue) ? 2 : 1; + return blk_queue_nonrot(btp->bt_bdev->bd_disk->queue) ? 2 : 1; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 0b0909657bad..be67570badf8 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -47,7 +47,7 @@ STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); STATIC int xfs_qm_dquot_walk( struct xfs_mount *mp, - int type, + xfs_dqtype_t type, int (*execute)(struct xfs_dquot *dqp, void *data), void *data) { @@ -79,7 +79,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_dquot *dqp = batch[i]; - next_index = be32_to_cpu(dqp->q_core.d_id) + 1; + next_index = dqp->q_id + 1; error = execute(batch[i], data); if (error == -EAGAIN) { @@ -121,14 +121,13 @@ xfs_qm_dqpurge( { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; + int error = -EAGAIN; xfs_dqlock(dqp); - if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { - xfs_dqunlock(dqp); - return -EAGAIN; - } + if ((dqp->q_flags & XFS_DQFLAG_FREEING) || dqp->q_nrefs != 0) + goto out_unlock; - dqp->dq_flags |= XFS_DQ_FREEING; + dqp->q_flags |= XFS_DQFLAG_FREEING; xfs_dqflock(dqp); @@ -139,7 +138,6 @@ xfs_qm_dqpurge( */ if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; - int error; /* * We don't care about getting disk errors here. 
We need @@ -149,6 +147,9 @@ xfs_qm_dqpurge( if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); + } else if (error == -EAGAIN) { + dqp->q_flags &= ~XFS_DQFLAG_FREEING; + goto out_unlock; } xfs_dqflock(dqp); } @@ -160,8 +161,7 @@ xfs_qm_dqpurge( xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), - be32_to_cpu(dqp->q_core.d_id)); + radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id); qi->qi_dquots--; /* @@ -174,6 +174,10 @@ xfs_qm_dqpurge( xfs_qm_dqdestroy(dqp); return 0; + +out_unlock: + xfs_dqunlock(dqp); + return error; } /* @@ -185,11 +189,11 @@ xfs_qm_dqpurge_all( uint flags) { if (flags & XFS_QMOPT_UQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_GQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_PQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL); } /* @@ -246,7 +250,7 @@ STATIC int xfs_qm_dqattach_one( struct xfs_inode *ip, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, bool doalloc, struct xfs_dquot **IO_idqpp) { @@ -326,23 +330,23 @@ xfs_qm_dqattach_locked( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, - doalloc, &ip->i_udquot); + error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)), + XFS_DQTYPE_USER, doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); } if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, - doalloc, &ip->i_gdquot); + error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)), + XFS_DQTYPE_GROUP, doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, + error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQTYPE_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; @@ -469,7 +473,7 @@ xfs_qm_dquot_isolate( /* * Prevent lookups now that we are past the point of no return. 
*/ - dqp->dq_flags |= XFS_DQ_FREEING; + dqp->q_flags |= XFS_DQFLAG_FREEING; xfs_dqunlock(dqp); ASSERT(dqp->q_nrefs == 0); @@ -541,31 +545,29 @@ xfs_qm_shrink_count( STATIC void xfs_qm_set_defquota( struct xfs_mount *mp, - uint type, + xfs_dqtype_t type, struct xfs_quotainfo *qinf) { struct xfs_dquot *dqp; struct xfs_def_quota *defq; - struct xfs_disk_dquot *ddqp; int error; error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); if (error) return; - ddqp = &dqp->q_core; - defq = xfs_get_defquota(dqp, qinf); + defq = xfs_get_defquota(qinf, xfs_dquot_type(dqp)); /* * Timers and warnings have been already set, let's just set the * default limits for this quota type */ - defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); - defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); - defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); - defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); - defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); - defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + defq->blk.hard = dqp->q_blk.hardlimit; + defq->blk.soft = dqp->q_blk.softlimit; + defq->ino.hard = dqp->q_ino.hardlimit; + defq->ino.soft = dqp->q_ino.softlimit; + defq->rtb.hard = dqp->q_rtb.hardlimit; + defq->rtb.soft = dqp->q_rtb.softlimit; xfs_qm_dqdestroy(dqp); } @@ -573,19 +575,21 @@ xfs_qm_set_defquota( static void xfs_qm_init_timelimits( struct xfs_mount *mp, - struct xfs_quotainfo *qinf) + xfs_dqtype_t type) { - struct xfs_disk_dquot *ddqp; + struct xfs_quotainfo *qinf = mp->m_quotainfo; + struct xfs_def_quota *defq; struct xfs_dquot *dqp; - uint type; int error; - qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; - qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; - qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; - qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; - qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; - qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; + defq = xfs_get_defquota(qinf, type); + + defq->blk.time = XFS_QM_BTIMELIMIT; + defq->ino.time = XFS_QM_ITIMELIMIT; + defq->rtb.time = XFS_QM_RTBTIMELIMIT; + defq->blk.warn = XFS_QM_BWARNLIMIT; + defq->ino.warn = XFS_QM_IWARNLIMIT; + defq->rtb.warn = XFS_QM_RTBWARNLIMIT; /* * We try to get the limits from the superuser's limits fields. @@ -593,39 +597,28 @@ xfs_qm_init_timelimits( * * Since we may not have done a quotacheck by this point, just read * the dquot without attaching it to any hashtables or lists. - * - * Timers and warnings are globally set by the first timer found in - * user/group/proj quota types, otherwise a default value is used. - * This should be split into different fields per quota type. */ - if (XFS_IS_UQUOTA_RUNNING(mp)) - type = XFS_DQ_USER; - else if (XFS_IS_GQUOTA_RUNNING(mp)) - type = XFS_DQ_GROUP; - else - type = XFS_DQ_PROJ; error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); if (error) return; - ddqp = &dqp->q_core; /* * The warnings and timers set the grace period given to * a user or group before he or she can not perform any * more writing. If it is zero, a default is used. 
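
[Editor's note] The rework of xfs_qm_init_timelimits() above replaces one shared set of grace periods with a per-quota-type set: each type starts from compiled-in defaults, then any nonzero value stored in that type's id-0 dquot overrides them. A sketch of that shape; the constants are illustrative, not the kernel's XFS_QM_* values:

#include <stdint.h>

struct def_limits {
	uint32_t time;	/* grace period, seconds */
	uint16_t warn;	/* maximum warning count */
};

/* Initialise one quota type's defaults, letting the administrator's
 * stored id-0 values (dq_timer/dq_warns) override the built-ins. */
static void
init_timelimits(struct def_limits *defq, uint32_t dq_timer,
		uint16_t dq_warns)
{
	defq->time = 7 * 24 * 60 * 60;	/* one week, for illustration */
	defq->warn = 5;

	if (dq_timer)
		defq->time = dq_timer;
	if (dq_warns)
		defq->warn = dq_warns;
}
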
*/ - if (ddqp->d_btimer) - qinf->qi_btimelimit = be32_to_cpu(ddqp->d_btimer); - if (ddqp->d_itimer) - qinf->qi_itimelimit = be32_to_cpu(ddqp->d_itimer); - if (ddqp->d_rtbtimer) - qinf->qi_rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer); - if (ddqp->d_bwarns) - qinf->qi_bwarnlimit = be16_to_cpu(ddqp->d_bwarns); - if (ddqp->d_iwarns) - qinf->qi_iwarnlimit = be16_to_cpu(ddqp->d_iwarns); - if (ddqp->d_rtbwarns) - qinf->qi_rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns); + if (dqp->q_blk.timer) + defq->blk.time = dqp->q_blk.timer; + if (dqp->q_ino.timer) + defq->ino.time = dqp->q_ino.timer; + if (dqp->q_rtb.timer) + defq->rtb.time = dqp->q_rtb.timer; + if (dqp->q_blk.warnings) + defq->blk.warn = dqp->q_blk.warnings; + if (dqp->q_ino.warnings) + defq->ino.warn = dqp->q_ino.warnings; + if (dqp->q_rtb.warnings) + defq->rtb.warn = dqp->q_rtb.warnings; xfs_qm_dqdestroy(dqp); } @@ -671,14 +664,16 @@ xfs_qm_init_quotainfo( mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); - xfs_qm_init_timelimits(mp, qinf); + xfs_qm_init_timelimits(mp, XFS_DQTYPE_USER); + xfs_qm_init_timelimits(mp, XFS_DQTYPE_GROUP); + xfs_qm_init_timelimits(mp, XFS_DQTYPE_PROJ); if (XFS_IS_UQUOTA_RUNNING(mp)) - xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf); + xfs_qm_set_defquota(mp, XFS_DQTYPE_USER, qinf); if (XFS_IS_GQUOTA_RUNNING(mp)) - xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf); + xfs_qm_set_defquota(mp, XFS_DQTYPE_GROUP, qinf); if (XFS_IS_PQUOTA_RUNNING(mp)) - xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf); + xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf); qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; @@ -776,7 +771,8 @@ xfs_qm_qino_alloc( } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, - XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp); + need_alloc ? XFS_QM_QINOCREATE_SPACE_RES(mp) : 0, + 0, 0, &tp); if (error) return error; @@ -827,14 +823,13 @@ xfs_qm_qino_alloc( STATIC void xfs_qm_reset_dqcounts( - xfs_mount_t *mp, - xfs_buf_t *bp, - xfs_dqid_t id, - uint type) + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_dqid_t id, + xfs_dqtype_t type) { struct xfs_dqblk *dqb; int j; - xfs_failaddr_t fa; trace_xfs_reset_dqcounts(bp, _RET_IP_); @@ -859,24 +854,32 @@ xfs_qm_reset_dqcounts( * find uninitialised dquot blks. See comment in * xfs_dquot_verify. */ - fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type); - if (fa) + if (xfs_dqblk_verify(mp, &dqb[j], id + j) || + (dqb[j].dd_diskdq.d_type & XFS_DQTYPE_REC_MASK) != type) xfs_dqblk_repair(mp, &dqb[j], id + j, type); /* * Reset type in case we are reusing group quota file for * project quotas or vice versa */ - ddq->d_flags = type; + ddq->d_type = type; ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; - ddq->d_btimer = 0; - ddq->d_itimer = 0; - ddq->d_rtbtimer = 0; - ddq->d_bwarns = 0; - ddq->d_iwarns = 0; - ddq->d_rtbwarns = 0; + + /* + * dquot id 0 stores the default grace period and the maximum + * warning limit that were set by the administrator, so we + * should not reset them. + */ + if (ddq->d_id != 0) { + ddq->d_btimer = 0; + ddq->d_itimer = 0; + ddq->d_rtbtimer = 0; + ddq->d_bwarns = 0; + ddq->d_iwarns = 0; + ddq->d_rtbwarns = 0; + } if (xfs_sb_version_hascrc(&mp->m_sb)) { xfs_update_cksum((char *)&dqb[j], @@ -892,17 +895,13 @@ xfs_qm_reset_dqcounts_all( xfs_dqid_t firstid, xfs_fsblock_t bno, xfs_filblks_t blkcnt, - uint flags, + xfs_dqtype_t type, struct list_head *buffer_list) { struct xfs_buf *bp; - int error; - int type; + int error = 0; ASSERT(blkcnt > 0); - type = flags & XFS_QMOPT_UQUOTA ? 
XFS_DQ_USER : - (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); - error = 0; /* * Blkcnt arg can be a very big number, and might even be @@ -962,7 +961,7 @@ STATIC int xfs_qm_reset_dqcounts_buf( struct xfs_mount *mp, struct xfs_inode *qip, - uint flags, + xfs_dqtype_t type, struct list_head *buffer_list) { struct xfs_bmbt_irec *map; @@ -1038,7 +1037,7 @@ xfs_qm_reset_dqcounts_buf( error = xfs_qm_reset_dqcounts_all(mp, firstid, map[i].br_startblock, map[i].br_blockcount, - flags, buffer_list); + type, buffer_list); if (error) goto out; } @@ -1060,7 +1059,7 @@ out: STATIC int xfs_qm_quotacheck_dqadjust( struct xfs_inode *ip, - uint type, + xfs_dqtype_t type, xfs_qcnt_t nblks, xfs_qcnt_t rtblks) { @@ -1086,15 +1085,15 @@ xfs_qm_quotacheck_dqadjust( * Adjust the inode count and the block count to reflect this inode's * resource usage. */ - be64_add_cpu(&dqp->q_core.d_icount, 1); - dqp->q_res_icount++; + dqp->q_ino.count++; + dqp->q_ino.reserved++; if (nblks) { - be64_add_cpu(&dqp->q_core.d_bcount, nblks); - dqp->q_res_bcount += nblks; + dqp->q_blk.count += nblks; + dqp->q_blk.reserved += nblks; } if (rtblks) { - be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks); - dqp->q_res_rtbcount += rtblks; + dqp->q_rtb.count += rtblks; + dqp->q_rtb.reserved += rtblks; } /* @@ -1102,12 +1101,12 @@ xfs_qm_quotacheck_dqadjust( * * There are no timers for the default values set in the root dquot. */ - if (dqp->q_core.d_id) { - xfs_qm_adjust_dqlimits(mp, dqp); - xfs_qm_adjust_dqtimers(mp, &dqp->q_core); + if (dqp->q_id) { + xfs_qm_adjust_dqlimits(dqp); + xfs_qm_adjust_dqtimers(dqp); } - dqp->dq_flags |= XFS_DQ_DIRTY; + dqp->q_flags |= XFS_DQFLAG_DIRTY; xfs_qm_dqput(dqp); return 0; } @@ -1177,21 +1176,21 @@ xfs_qm_dqusage_adjust( * and quotaoffs don't race. (Quotachecks happen at mount time only). */ if (XFS_IS_UQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks, + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_USER, nblks, rtblks); if (error) goto error0; } if (XFS_IS_GQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks, + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_GROUP, nblks, rtblks); if (error) goto error0; } if (XFS_IS_PQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks, + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_PROJ, nblks, rtblks); if (error) goto error0; @@ -1213,7 +1212,7 @@ xfs_qm_flush_one( int error = 0; xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_FREEING) + if (dqp->q_flags & XFS_DQFLAG_FREEING) goto out_unlock; if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; @@ -1282,7 +1281,7 @@ xfs_qm_quotacheck( * We don't log our changes till later. */ if (uip) { - error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_DQTYPE_USER, &buffer_list); if (error) goto error_return; @@ -1290,7 +1289,7 @@ xfs_qm_quotacheck( } if (gip) { - error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_DQTYPE_GROUP, &buffer_list); if (error) goto error_return; @@ -1298,7 +1297,7 @@ xfs_qm_quotacheck( } if (pip) { - error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_DQTYPE_PROJ, &buffer_list); if (error) goto error_return; @@ -1315,17 +1314,17 @@ xfs_qm_quotacheck( * down to disk buffers if everything was updated successfully. 
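xfs_qm_quotacheck_dqadjust() above swaps the be64_add_cpu() updates on q_core for plain arithmetic on the incore counters; usage and reservation are bumped in lockstep because quotacheck runs before anything else can hold reservations, and id 0 is skipped since it only carries the defaults. A sketch of that accounting step (invented names; the limit/timer adjustment is elided):

    #include <stdbool.h>
    #include <stdint.h>

    struct dquot_res { uint64_t count, reserved; };
    struct dquot {
        uint32_t         id;
        struct dquot_res blk, ino, rtb;
        bool             dirty;
    };

    static void quotacheck_adjust(struct dquot *dqp, uint64_t nblks,
                                  uint64_t rtblks)
    {
        /* during quotacheck nothing holds reservations beyond real usage,
         * so count and reserved move in lockstep */
        dqp->ino.count++;
        dqp->ino.reserved++;
        dqp->blk.count += nblks;
        dqp->blk.reserved += nblks;
        dqp->rtb.count += rtblks;
        dqp->rtb.reserved += rtblks;

        /* id 0 only carries defaults: no limits or timers to adjust */
        if (dqp->id != 0) {
            /* adjust_limits(dqp); adjust_timers(dqp); */
        }
        dqp->dirty = true;
    }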
*/ if (XFS_IS_UQUOTA_ON(mp)) { - error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, + error = xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_flush_one, &buffer_list); } if (XFS_IS_GQUOTA_ON(mp)) { - error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one, + error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_flush_one, &buffer_list); if (!error) error = error2; } if (XFS_IS_PQUOTA_ON(mp)) { - error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one, + error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_flush_one, &buffer_list); if (!error) error = error2; @@ -1588,8 +1587,7 @@ xfs_qm_dqfree_one( struct xfs_quotainfo *qi = mp->m_quotainfo; mutex_lock(&qi->qi_tree_lock); - radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), - be32_to_cpu(dqp->q_core.d_id)); + radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id); qi->qi_dquots--; mutex_unlock(&qi->qi_tree_lock); @@ -1613,8 +1611,8 @@ xfs_qm_dqfree_one( int xfs_qm_vop_dqalloc( struct xfs_inode *ip, - xfs_dqid_t uid, - xfs_dqid_t gid, + kuid_t uid, + kgid_t gid, prid_t prid, uint flags, struct xfs_dquot **O_udqpp, @@ -1622,6 +1620,8 @@ xfs_qm_vop_dqalloc( struct xfs_dquot **O_pdqpp) { struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + struct user_namespace *user_ns = inode->i_sb->s_user_ns; struct xfs_dquot *uq = NULL; struct xfs_dquot *gq = NULL; struct xfs_dquot *pq = NULL; @@ -1635,7 +1635,7 @@ xfs_qm_vop_dqalloc( xfs_ilock(ip, lockflags); if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) - gid = ip->i_d.di_gid; + gid = inode->i_gid; /* * Attach the dquot(s) to this inode, doing a dquot allocation @@ -1650,7 +1650,7 @@ xfs_qm_vop_dqalloc( } if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { - if (ip->i_d.di_uid != uid) { + if (!uid_eq(inode->i_uid, uid)) { /* * What we need is the dquot that has this uid, and * if we send the inode to dqget, the uid of the inode @@ -1661,7 +1661,8 @@ xfs_qm_vop_dqalloc( * holding ilock. 
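xfs_qm_vop_dqalloc() above now takes kernel-internal kuid_t/kgid_t values, compares them with uid_eq()/gid_eq(), and converts to on-disk quota IDs via from_kuid() against the superblock's user namespace only at the xfs_qm_dqget() boundary. The sketch below models that split with a toy one-to-one shifted mapping; the real from_kuid() is kernel API and the arithmetic here is purely illustrative:

    #include <stdbool.h>
    #include <stdint.h>

    typedef struct { uint32_t val; } kuid_t;    /* kernel-internal form */
    typedef uint32_t xfs_dqid_t;                /* on-disk quota ID */

    struct user_ns { uint32_t base; };          /* toy shifted 1:1 mapping */

    static bool uid_eq(kuid_t a, kuid_t b) { return a.val == b.val; }

    /* toy stand-in for from_kuid(): undo the namespace shift */
    static xfs_dqid_t from_kuid(const struct user_ns *ns, kuid_t uid)
    {
        return uid.val - ns->base;
    }

    /* decide whether a separate user dquot is needed and which on-disk
     * ID to look up; mirrors only the shape of the dqalloc logic */
    static bool needs_lookup(const struct user_ns *ns, kuid_t inode_uid,
                             kuid_t uid, xfs_dqid_t *id)
    {
        if (uid_eq(inode_uid, uid))
            return false;           /* reuse the inode's attached dquot */
        *id = from_kuid(ns, uid);   /* translate only at the boundary */
        return true;
    }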
*/ xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq); + error = xfs_qm_dqget(mp, from_kuid(user_ns, uid), + XFS_DQTYPE_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; @@ -1682,9 +1683,10 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { - if (ip->i_d.di_gid != gid) { + if (!gid_eq(inode->i_gid, gid)) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq); + error = xfs_qm_dqget(mp, from_kgid(user_ns, gid), + XFS_DQTYPE_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1700,8 +1702,8 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (ip->i_d.di_projid != prid) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ, - true, &pq); + error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, + XFS_DQTYPE_PROJ, true, &pq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1714,8 +1716,7 @@ xfs_qm_vop_dqalloc( pq = xfs_qm_dqhold(ip->i_pdquot); } } - if (uq) - trace_xfs_dquot_dqalloc(ip); + trace_xfs_dquot_dqalloc(ip); xfs_iunlock(ip, lockflags); if (O_udqpp) @@ -1792,7 +1793,7 @@ xfs_qm_vop_chown_reserve( { struct xfs_mount *mp = ip->i_mount; uint64_t delblks; - unsigned int blkflags, prjflags = 0; + unsigned int blkflags; struct xfs_dquot *udq_unres = NULL; struct xfs_dquot *gdq_unres = NULL; struct xfs_dquot *pdq_unres = NULL; @@ -1810,7 +1811,7 @@ xfs_qm_vop_chown_reserve( XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && - ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) { + i_uid_read(VFS_I(ip)) != udqp->q_id) { udq_delblks = udqp; /* * If there are delayed allocation blocks, then we have to @@ -1823,7 +1824,7 @@ xfs_qm_vop_chown_reserve( } } if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && - ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) { + i_gid_read(VFS_I(ip)) != gdqp->q_id) { gdq_delblks = gdqp; if (delblks) { ASSERT(ip->i_gdquot); @@ -1832,8 +1833,7 @@ xfs_qm_vop_chown_reserve( } if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && - ip->i_d.di_projid != be32_to_cpu(pdqp->q_core.d_id)) { - prjflags = XFS_QMOPT_ENOSPC; + ip->i_d.di_projid != pdqp->q_id) { pdq_delblks = pdqp; if (delblks) { ASSERT(ip->i_pdquot); @@ -1843,8 +1843,7 @@ xfs_qm_vop_chown_reserve( error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, udq_delblks, gdq_delblks, pdq_delblks, - ip->i_d.di_nblocks, 1, - flags | blkflags | prjflags); + ip->i_d.di_nblocks, 1, flags | blkflags); if (error) return error; @@ -1862,8 +1861,7 @@ xfs_qm_vop_chown_reserve( ASSERT(udq_unres || gdq_unres || pdq_unres); error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, udq_delblks, gdq_delblks, pdq_delblks, - (xfs_qcnt_t)delblks, 0, - flags | blkflags | prjflags); + (xfs_qcnt_t)delblks, 0, flags | blkflags); if (error) return error; xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, @@ -1916,24 +1914,24 @@ xfs_qm_vop_create_dqattach( return; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); - ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); - ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); + ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id); + ip->i_gdquot = xfs_qm_dqhold(gdqp); 
xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); - ASSERT(ip->i_d.di_projid == be32_to_cpu(pdqp->q_core.d_id)); + ASSERT(ip->i_d.di_projid == pdqp->q_id); ip->i_pdquot = xfs_qm_dqhold(pdqp); xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 4e57edca8bce..9c078c35d924 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -20,34 +20,28 @@ extern struct kmem_zone *xfs_qm_dqtrxzone; #define XFS_DQITER_MAP_SIZE 10 #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ - !dqp->q_core.d_blk_hardlimit && \ - !dqp->q_core.d_blk_softlimit && \ - !dqp->q_core.d_rtb_hardlimit && \ - !dqp->q_core.d_rtb_softlimit && \ - !dqp->q_core.d_ino_hardlimit && \ - !dqp->q_core.d_ino_softlimit && \ - !dqp->q_core.d_bcount && \ - !dqp->q_core.d_rtbcount && \ - !dqp->q_core.d_icount) - -/* - * This defines the unit of allocation of dquots. - * Currently, it is just one file system block, and a 4K blk contains 30 - * (136 * 30 = 4080) dquots. It's probably not worth trying to make - * this more dynamic. - * XXXsup However, if this number is changed, we have to make sure that we don't - * implicitly assume that we do allocations in chunks of a single filesystem - * block in the dquot/xqm code. - */ -#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 + !dqp->q_blk.hardlimit && \ + !dqp->q_blk.softlimit && \ + !dqp->q_rtb.hardlimit && \ + !dqp->q_rtb.softlimit && \ + !dqp->q_ino.hardlimit && \ + !dqp->q_ino.softlimit && \ + !dqp->q_blk.count && \ + !dqp->q_rtb.count && \ + !dqp->q_ino.count) + +struct xfs_quota_limits { + xfs_qcnt_t hard; /* default hard limit */ + xfs_qcnt_t soft; /* default soft limit */ + time64_t time; /* limit for timers */ + xfs_qwarncnt_t warn; /* limit for warnings */ +}; +/* Defaults for each quota type: time limits, warn limits, usage limits */ struct xfs_def_quota { - xfs_qcnt_t bhardlimit; /* default data blk hard limit */ - xfs_qcnt_t bsoftlimit; /* default data blk soft limit */ - xfs_qcnt_t ihardlimit; /* default inode count hard limit */ - xfs_qcnt_t isoftlimit; /* default inode count soft limit */ - xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */ - xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */ + struct xfs_quota_limits blk; + struct xfs_quota_limits ino; + struct xfs_quota_limits rtb; }; /* @@ -55,41 +49,35 @@ struct xfs_def_quota { * The mount structure keeps a pointer to this. 
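The rewritten XFS_IS_DQUOT_UNINITIALIZED macro above reads the native-endian incore resource fields instead of the big-endian q_core copy. Restated as a function over simplified types, the test is simply: a dquot with no limits set and no usage recorded has nothing worth preserving:

    #include <stdbool.h>
    #include <stdint.h>

    struct dquot_res { uint64_t hardlimit, softlimit, count; };
    struct dquot     { struct dquot_res blk, ino, rtb; };

    /* no limits and no usage: the dquot is effectively untouched */
    static bool dquot_uninitialized(const struct dquot *d)
    {
        return !d->blk.hardlimit && !d->blk.softlimit &&
               !d->rtb.hardlimit && !d->rtb.softlimit &&
               !d->ino.hardlimit && !d->ino.softlimit &&
               !d->blk.count && !d->rtb.count && !d->ino.count;
    }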
*/ struct xfs_quotainfo { - struct radix_tree_root qi_uquota_tree; - struct radix_tree_root qi_gquota_tree; - struct radix_tree_root qi_pquota_tree; - struct mutex qi_tree_lock; + struct radix_tree_root qi_uquota_tree; + struct radix_tree_root qi_gquota_tree; + struct radix_tree_root qi_pquota_tree; + struct mutex qi_tree_lock; struct xfs_inode *qi_uquotaip; /* user quota inode */ struct xfs_inode *qi_gquotaip; /* group quota inode */ struct xfs_inode *qi_pquotaip; /* project quota inode */ - struct list_lru qi_lru; - int qi_dquots; - time64_t qi_btimelimit; /* limit for blks timer */ - time64_t qi_itimelimit; /* limit for inodes timer */ - time64_t qi_rtbtimelimit;/* limit for rt blks timer */ - xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ - xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ - xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ - struct mutex qi_quotaofflock;/* to serialize quotaoff */ - xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ - uint qi_dqperchunk; /* # ondisk dqs in above chunk */ + struct list_lru qi_lru; + int qi_dquots; + struct mutex qi_quotaofflock;/* to serialize quotaoff */ + xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ + uint qi_dqperchunk; /* # ondisk dq in above chunk */ struct xfs_def_quota qi_usr_default; struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; - struct shrinker qi_shrinker; + struct shrinker qi_shrinker; }; static inline struct radix_tree_root * xfs_dquot_tree( struct xfs_quotainfo *qi, - int type) + xfs_dqtype_t type) { switch (type) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: return &qi->qi_uquota_tree; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return &qi->qi_gquota_tree; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return &qi->qi_pquota_tree; default: ASSERT(0); @@ -98,14 +86,14 @@ xfs_dquot_tree( } static inline struct xfs_inode * -xfs_quota_inode(xfs_mount_t *mp, uint dq_flags) +xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type) { - switch (dq_flags & XFS_DQ_ALLTYPES) { - case XFS_DQ_USER: + switch (type) { + case XFS_DQTYPE_USER: return mp->m_quotainfo->qi_uquotaip; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return mp->m_quotainfo->qi_gquotaip; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return mp->m_quotainfo->qi_pquotaip; default: ASSERT(0); @@ -154,29 +142,35 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); -extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, - uint, struct qc_dqblk *); -extern int xfs_qm_scall_getquota_next(struct xfs_mount *, - xfs_dqid_t *, uint, struct qc_dqblk *); -extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, - struct qc_dqblk *); +extern int xfs_qm_scall_getquota(struct xfs_mount *mp, + xfs_dqid_t id, + xfs_dqtype_t type, + struct qc_dqblk *dst); +extern int xfs_qm_scall_getquota_next(struct xfs_mount *mp, + xfs_dqid_t *id, + xfs_dqtype_t type, + struct qc_dqblk *dst); +extern int xfs_qm_scall_setqlim(struct xfs_mount *mp, + xfs_dqid_t id, + xfs_dqtype_t type, + struct qc_dqblk *newlim); extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); static inline struct xfs_def_quota * -xfs_get_defquota(struct xfs_dquot *dqp, struct xfs_quotainfo *qi) +xfs_get_defquota(struct xfs_quotainfo *qi, xfs_dqtype_t type) { - struct xfs_def_quota *defq; - - if (XFS_QM_ISUDQ(dqp)) - defq = &qi->qi_usr_default; - else if 
(XFS_QM_ISGDQ(dqp)) - defq = &qi->qi_grp_default; - else { - ASSERT(XFS_QM_ISPDQ(dqp)); - defq = &qi->qi_prj_default; + switch (type) { + case XFS_DQTYPE_USER: + return &qi->qi_usr_default; + case XFS_DQTYPE_GROUP: + return &qi->qi_grp_default; + case XFS_DQTYPE_PROJ: + return &qi->qi_prj_default; + default: + ASSERT(0); + return NULL; } - return defq; } #endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index fc2fa418919f..639398091ad6 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -23,24 +23,24 @@ xfs_fill_statvfs_from_dquot( { uint64_t limit; - limit = dqp->q_core.d_blk_softlimit ? - be64_to_cpu(dqp->q_core.d_blk_softlimit) : - be64_to_cpu(dqp->q_core.d_blk_hardlimit); + limit = dqp->q_blk.softlimit ? + dqp->q_blk.softlimit : + dqp->q_blk.hardlimit; if (limit && statp->f_blocks > limit) { statp->f_blocks = limit; statp->f_bfree = statp->f_bavail = - (statp->f_blocks > dqp->q_res_bcount) ? - (statp->f_blocks - dqp->q_res_bcount) : 0; + (statp->f_blocks > dqp->q_blk.reserved) ? + (statp->f_blocks - dqp->q_blk.reserved) : 0; } - limit = dqp->q_core.d_ino_softlimit ? - be64_to_cpu(dqp->q_core.d_ino_softlimit) : - be64_to_cpu(dqp->q_core.d_ino_hardlimit); + limit = dqp->q_ino.softlimit ? + dqp->q_ino.softlimit : + dqp->q_ino.hardlimit; if (limit && statp->f_files > limit) { statp->f_files = limit; statp->f_ffree = - (statp->f_files > dqp->q_res_icount) ? - (statp->f_files - dqp->q_res_icount) : 0; + (statp->f_files > dqp->q_ino.reserved) ? + (statp->f_files - dqp->q_ino.reserved) : 0; } } @@ -60,7 +60,7 @@ xfs_qm_statvfs( struct xfs_mount *mp = ip->i_mount; struct xfs_dquot *dqp; - if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQ_PROJ, false, &dqp)) { + if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQTYPE_PROJ, false, &dqp)) { xfs_fill_statvfs_from_dquot(statp, dqp); xfs_qm_dqput(dqp); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 1ea82764bf89..1c542b4a5220 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -29,8 +29,6 @@ xfs_qm_log_quotaoff( int error; struct xfs_qoff_logitem *qoffi; - *qoffstartp = NULL; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); if (error) goto out; @@ -62,7 +60,7 @@ out: STATIC int xfs_qm_log_quotaoff_end( struct xfs_mount *mp, - struct xfs_qoff_logitem *startqoff, + struct xfs_qoff_logitem **startqoff, uint flags) { struct xfs_trans *tp; @@ -73,9 +71,10 @@ xfs_qm_log_quotaoff_end( if (error) return error; - qoffi = xfs_trans_get_qoff_item(tp, startqoff, + qoffi = xfs_trans_get_qoff_item(tp, *startqoff, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); + *startqoff = NULL; /* * We have to make sure that the transaction is secure on disk before we @@ -103,7 +102,7 @@ xfs_qm_scall_quotaoff( uint dqtype; int error; uint inactivate_flags; - struct xfs_qoff_logitem *qoffstart; + struct xfs_qoff_logitem *qoffstart = NULL; /* * No file system can have quotas enabled on disk but not in core. @@ -228,7 +227,7 @@ xfs_qm_scall_quotaoff( * So, we have QUOTAOFF start and end logitems; the start * logitem won't get overwritten until the end logitem appears... */ - error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + error = xfs_qm_log_quotaoff_end(mp, &qoffstart, flags); if (error) { /* We're screwed now. Shutdown is the only option. 
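xfs_fill_statvfs_from_dquot() above keeps its shape — prefer the soft limit, fall back to the hard limit, clamp the reported totals — but now reads the incore q_blk/q_ino counters directly. A standalone sketch of the block clamp (the inode numbers are clamped identically):

    #include <stdint.h>

    struct dquot_res { uint64_t hardlimit, softlimit, reserved; };

    /* clamp the filesystem-wide block numbers down to this quota,
     * preferring the soft limit when one is set */
    static void fill_statvfs_blocks(uint64_t *f_blocks, uint64_t *f_bfree,
                                    const struct dquot_res *blk)
    {
        uint64_t limit = blk->softlimit ? blk->softlimit : blk->hardlimit;

        if (limit && *f_blocks > limit) {
            *f_blocks = limit;
            *f_bfree = (*f_blocks > blk->reserved) ?
                       (*f_blocks - blk->reserved) : 0;
        }
    }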
*/ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); @@ -261,6 +260,8 @@ xfs_qm_scall_quotaoff( } out_unlock: + if (error && qoffstart) + xfs_qm_qoff_logitem_relse(qoffstart); mutex_unlock(&q->qi_quotaofflock); return error; } @@ -301,7 +302,7 @@ xfs_qm_scall_trunc_qfile( goto out_unlock; } - ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_df.if_nextents == 0); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); error = xfs_trans_commit(tp); @@ -321,23 +322,23 @@ xfs_qm_scall_trunc_qfiles( int error = -EINVAL; if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || - (flags & ~XFS_DQ_ALLTYPES)) { + (flags & ~XFS_QMOPT_QUOTALL)) { xfs_debug(mp, "%s: flags=%x m_qflags=%x", __func__, flags, mp->m_qflags); return -EINVAL; } - if (flags & XFS_DQ_USER) { + if (flags & XFS_QMOPT_UQUOTA) { error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); if (error) return error; } - if (flags & XFS_DQ_GROUP) { + if (flags & XFS_QMOPT_GQUOTA) { error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); if (error) return error; } - if (flags & XFS_DQ_PROJ) + if (flags & XFS_QMOPT_PQUOTA) error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); return error; @@ -356,11 +357,11 @@ xfs_qm_scall_quotaon( int error; uint qf; - flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); /* - * Switching on quota accounting must be done at mount time. + * Switching on quota accounting must be done at mount time, + * only consider quota enforcement stuff here. */ - flags &= ~(XFS_ALL_QUOTA_ACCT); + flags &= XFS_ALL_QUOTA_ENFD; if (flags == 0) { xfs_debug(mp, "%s: zero flags, m_qflags=%x", @@ -436,20 +437,73 @@ xfs_qm_scall_quotaon( (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK) /* + * Adjust limits of this quota, and the defaults if passed in. Returns true + * if the new limits made sense and were applied, false otherwise. + */ +static inline bool +xfs_setqlim_limits( + struct xfs_mount *mp, + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim, + xfs_qcnt_t hard, + xfs_qcnt_t soft, + const char *tag) +{ + /* The hard limit can't be less than the soft limit. */ + if (hard != 0 && hard < soft) { + xfs_debug(mp, "%shard %lld < %ssoft %lld", tag, hard, tag, + soft); + return false; + } + + res->hardlimit = hard; + res->softlimit = soft; + if (qlim) { + qlim->hard = hard; + qlim->soft = soft; + } + + return true; +} + +static inline void +xfs_setqlim_warns( + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim, + int warns) +{ + res->warnings = warns; + if (qlim) + qlim->warn = warns; +} + +static inline void +xfs_setqlim_timer( + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim, + s64 timer) +{ + res->timer = timer; + if (qlim) + qlim->time = timer; +} + +/* * Adjust quota limits, and start/stop timers accordingly. 
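The new xfs_setqlim_limits()/xfs_setqlim_warns()/xfs_setqlim_timer() helpers above factor the per-resource update out of xfs_qm_scall_setqlim(): validate hard against soft, apply to the dquot's resource, and mirror into the defaults only when a qlim pointer was passed — which the caller does only for id 0. A sketch of the limits helper with simplified types:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct dquot_res    { uint64_t hardlimit, softlimit; };
    struct quota_limits { uint64_t hard, soft; };

    /* reject hard < soft; on success update the dquot's resource and,
     * when the caller passed the defaults (only for id 0), mirror them */
    static bool setqlim_limits(struct dquot_res *res,
                               struct quota_limits *qlim,
                               uint64_t hard, uint64_t soft,
                               const char *tag)
    {
        if (hard != 0 && hard < soft) {
            fprintf(stderr, "%shard %llu < %ssoft %llu\n",
                    tag, (unsigned long long)hard,
                    tag, (unsigned long long)soft);
            return false;
        }
        res->hardlimit = hard;
        res->softlimit = soft;
        if (qlim) {                 /* NULL whenever id != 0 */
            qlim->hard = hard;
            qlim->soft = soft;
        }
        return true;
    }

Passing a NULL qlim for non-zero IDs keeps one call site for both cases instead of a separate "if (id == 0)" block per resource.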
*/ int xfs_qm_scall_setqlim( struct xfs_mount *mp, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, struct qc_dqblk *newlim) { struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_disk_dquot *ddq; struct xfs_dquot *dqp; struct xfs_trans *tp; struct xfs_def_quota *defq; + struct xfs_dquot_res *res; + struct xfs_quota_limits *qlim; int error; xfs_qcnt_t hard, soft; @@ -478,7 +532,7 @@ xfs_qm_scall_setqlim( goto out_unlock; } - defq = xfs_get_defquota(dqp, q); + defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); xfs_dqunlock(dqp); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp); @@ -487,99 +541,74 @@ xfs_qm_scall_setqlim( xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); - ddq = &dqp->q_core; /* + * Update quota limits, warnings, and timers, and the defaults + * if we're touching id == 0. + * * Make sure that hardlimits are >= soft limits before changing. + * + * Update warnings counter(s) if requested. + * + * Timelimits for the super user set the relative time the other users + * can be over quota for this file system. If it is zero a default is + * used. Ditto for the default soft and hard limit values (already + * done, above), and for warnings. + * + * For other IDs, userspace can bump out the grace period if over + * the soft limit. */ + + /* Blocks on the data device. */ hard = (newlim->d_fieldmask & QC_SPC_HARD) ? (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) : - be64_to_cpu(ddq->d_blk_hardlimit); + dqp->q_blk.hardlimit; soft = (newlim->d_fieldmask & QC_SPC_SOFT) ? (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) : - be64_to_cpu(ddq->d_blk_softlimit); - if (hard == 0 || hard >= soft) { - ddq->d_blk_hardlimit = cpu_to_be64(hard); - ddq->d_blk_softlimit = cpu_to_be64(soft); + dqp->q_blk.softlimit; + res = &dqp->q_blk; + qlim = id == 0 ? &defq->blk : NULL; + + if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk")) xfs_dquot_set_prealloc_limits(dqp); - if (id == 0) { - defq->bhardlimit = hard; - defq->bsoftlimit = soft; - } - } else { - xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); - } + if (newlim->d_fieldmask & QC_SPC_WARNS) + xfs_setqlim_warns(res, qlim, newlim->d_spc_warns); + if (newlim->d_fieldmask & QC_SPC_TIMER) + xfs_setqlim_timer(res, qlim, newlim->d_spc_timer); + + /* Blocks on the realtime device. */ hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) : - be64_to_cpu(ddq->d_rtb_hardlimit); + dqp->q_rtb.hardlimit; soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ? (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) : - be64_to_cpu(ddq->d_rtb_softlimit); - if (hard == 0 || hard >= soft) { - ddq->d_rtb_hardlimit = cpu_to_be64(hard); - ddq->d_rtb_softlimit = cpu_to_be64(soft); - if (id == 0) { - defq->rtbhardlimit = hard; - defq->rtbsoftlimit = soft; - } - } else { - xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); - } + dqp->q_rtb.softlimit; + res = &dqp->q_rtb; + qlim = id == 0 ? &defq->rtb : NULL; + + xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb"); + if (newlim->d_fieldmask & QC_RT_SPC_WARNS) + xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns); + if (newlim->d_fieldmask & QC_RT_SPC_TIMER) + xfs_setqlim_timer(res, qlim, newlim->d_rt_spc_timer); + /* Inodes */ hard = (newlim->d_fieldmask & QC_INO_HARD) ? (xfs_qcnt_t) newlim->d_ino_hardlimit : - be64_to_cpu(ddq->d_ino_hardlimit); + dqp->q_ino.hardlimit; soft = (newlim->d_fieldmask & QC_INO_SOFT) ? 
(xfs_qcnt_t) newlim->d_ino_softlimit : - be64_to_cpu(ddq->d_ino_softlimit); - if (hard == 0 || hard >= soft) { - ddq->d_ino_hardlimit = cpu_to_be64(hard); - ddq->d_ino_softlimit = cpu_to_be64(soft); - if (id == 0) { - defq->ihardlimit = hard; - defq->isoftlimit = soft; - } - } else { - xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft); - } + dqp->q_ino.softlimit; + res = &dqp->q_ino; + qlim = id == 0 ? &defq->ino : NULL; - /* - * Update warnings counter(s) if requested - */ - if (newlim->d_fieldmask & QC_SPC_WARNS) - ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns); + xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino"); if (newlim->d_fieldmask & QC_INO_WARNS) - ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns); - if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns); + xfs_setqlim_warns(res, qlim, newlim->d_ino_warns); + if (newlim->d_fieldmask & QC_INO_TIMER) + xfs_setqlim_timer(res, qlim, newlim->d_ino_timer); - if (id == 0) { - /* - * Timelimits for the super user set the relative time - * the other users can be over quota for this file system. - * If it is zero a default is used. Ditto for the default - * soft and hard limit values (already done, above), and - * for warnings. - */ - if (newlim->d_fieldmask & QC_SPC_TIMER) { - q->qi_btimelimit = newlim->d_spc_timer; - ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer); - } - if (newlim->d_fieldmask & QC_INO_TIMER) { - q->qi_itimelimit = newlim->d_ino_timer; - ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer); - } - if (newlim->d_fieldmask & QC_RT_SPC_TIMER) { - q->qi_rtbtimelimit = newlim->d_rt_spc_timer; - ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer); - } - if (newlim->d_fieldmask & QC_SPC_WARNS) - q->qi_bwarnlimit = newlim->d_spc_warns; - if (newlim->d_fieldmask & QC_INO_WARNS) - q->qi_iwarnlimit = newlim->d_ino_warns; - if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - q->qi_rtbwarnlimit = newlim->d_rt_spc_warns; - } else { + if (id != 0) { /* * If the user is now over quota, start the timelimit. * The user will not be 'warned'. @@ -587,9 +616,9 @@ xfs_qm_scall_setqlim( * is on or off. We don't really want to bother with iterating * over all ondisk dquots and turning the timers on/off. 
*/ - xfs_qm_adjust_dqtimers(mp, ddq); + xfs_qm_adjust_dqtimers(dqp); } - dqp->dq_flags |= XFS_DQ_DIRTY; + dqp->q_flags |= XFS_DQFLAG_DIRTY; xfs_trans_log_dquot(tp, dqp); error = xfs_trans_commit(tp); @@ -605,58 +634,46 @@ out_unlock: static void xfs_qm_scall_getquota_fill_qc( struct xfs_mount *mp, - uint type, + xfs_dqtype_t type, const struct xfs_dquot *dqp, struct qc_dqblk *dst) { memset(dst, 0, sizeof(*dst)); - dst->d_spc_hardlimit = - XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); - dst->d_spc_softlimit = - XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); - dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); - dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); - dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount); - dst->d_ino_count = dqp->q_res_icount; - dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer); - dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer); - dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns); - dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns); - dst->d_rt_spc_hardlimit = - XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); - dst->d_rt_spc_softlimit = - XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); - dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount); - dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer); - dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns); + dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_blk.hardlimit); + dst->d_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_blk.softlimit); + dst->d_ino_hardlimit = dqp->q_ino.hardlimit; + dst->d_ino_softlimit = dqp->q_ino.softlimit; + dst->d_space = XFS_FSB_TO_B(mp, dqp->q_blk.reserved); + dst->d_ino_count = dqp->q_ino.reserved; + dst->d_spc_timer = dqp->q_blk.timer; + dst->d_ino_timer = dqp->q_ino.timer; + dst->d_ino_warns = dqp->q_ino.warnings; + dst->d_spc_warns = dqp->q_blk.warnings; + dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit); + dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit); + dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved); + dst->d_rt_spc_timer = dqp->q_rtb.timer; + dst->d_rt_spc_warns = dqp->q_rtb.warnings; /* * Internally, we don't reset all the timers when quota enforcement * gets turned off. No need to confuse the user level code, * so return zeroes in that case. 
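The getquota fill above now copies the incore q_blk/q_ino/q_rtb fields straight into qc_dqblk, and the hunk that follows collapses the open-coded three-way enforcement test into a single xfs_dquot_is_enforced() call before zeroing the reported timers. A sketch of such a predicate over toy mount flags:

    #include <stdbool.h>

    enum dqtype { DQTYPE_USER, DQTYPE_GROUP, DQTYPE_PROJ };

    struct mount { bool uq_enforced, gq_enforced, pq_enforced; };
    struct dquot { enum dqtype type; const struct mount *mp; };

    /* one predicate instead of three open-coded flag tests */
    static bool dquot_is_enforced(const struct dquot *dqp)
    {
        switch (dqp->type) {
        case DQTYPE_USER:  return dqp->mp->uq_enforced;
        case DQTYPE_GROUP: return dqp->mp->gq_enforced;
        case DQTYPE_PROJ:  return dqp->mp->pq_enforced;
        }
        return false;
    }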
*/ - if ((!XFS_IS_UQUOTA_ENFORCED(mp) && - dqp->q_core.d_flags == XFS_DQ_USER) || - (!XFS_IS_GQUOTA_ENFORCED(mp) && - dqp->q_core.d_flags == XFS_DQ_GROUP) || - (!XFS_IS_PQUOTA_ENFORCED(mp) && - dqp->q_core.d_flags == XFS_DQ_PROJ)) { + if (!xfs_dquot_is_enforced(dqp)) { dst->d_spc_timer = 0; dst->d_ino_timer = 0; dst->d_rt_spc_timer = 0; } #ifdef DEBUG - if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || - (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || - (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && - dqp->q_core.d_id != 0) { + if (xfs_dquot_is_enforced(dqp) && dqp->q_id != 0) { if ((dst->d_space > dst->d_spc_softlimit) && (dst->d_spc_softlimit > 0)) { ASSERT(dst->d_spc_timer != 0); } - if ((dst->d_ino_count > dst->d_ino_softlimit) && - (dst->d_ino_softlimit > 0)) { + if ((dst->d_ino_count > dqp->q_ino.softlimit) && + (dqp->q_ino.softlimit > 0)) { ASSERT(dst->d_ino_timer != 0); } } @@ -668,7 +685,7 @@ int xfs_qm_scall_getquota( struct xfs_mount *mp, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, struct qc_dqblk *dst) { struct xfs_dquot *dqp; @@ -706,7 +723,7 @@ int xfs_qm_scall_getquota_next( struct xfs_mount *mp, xfs_dqid_t *id, - uint type, + xfs_dqtype_t type, struct qc_dqblk *dst) { struct xfs_dquot *dqp; @@ -717,7 +734,7 @@ xfs_qm_scall_getquota_next( return error; /* Fill in the ID we actually read from disk */ - *id = be32_to_cpu(dqp->q_core.d_id); + *id = dqp->q_id; xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); @@ -728,9 +745,10 @@ xfs_qm_scall_getquota_next( STATIC int xfs_dqrele_inode( struct xfs_inode *ip, - int flags, void *args) { + uint *flags = args; + /* skip quota inodes */ if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || ip == ip->i_mount->m_quotainfo->qi_gquotaip || @@ -742,15 +760,15 @@ xfs_dqrele_inode( } xfs_ilock(ip, XFS_ILOCK_EXCL); - if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { + if ((*flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; } - if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { + if ((*flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } - if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) { + if ((*flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) { xfs_qm_dqrele(ip->i_pdquot); ip->i_pdquot = NULL; } @@ -767,10 +785,10 @@ xfs_dqrele_inode( */ void xfs_qm_dqrele_all_inodes( - struct xfs_mount *mp, - uint flags) + struct xfs_mount *mp, + uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL, - XFS_AGITER_INEW_WAIT); + xfs_inode_walk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, + &flags, XFS_ICI_NO_TAG); } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index efe42ae7a2f3..06b22e35fc90 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -13,6 +13,7 @@ */ struct xfs_trans; +struct xfs_buf; /* * This check is done typically without holding the inode lock; @@ -38,14 +39,14 @@ struct xfs_trans; static inline uint xfs_quota_chkd_flag( - uint dqtype) + xfs_dqtype_t type) { - switch (dqtype) { - case XFS_DQ_USER: + switch (type) { + case XFS_DQTYPE_USER: return XFS_UQUOTA_CHKD; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return XFS_GQUOTA_CHKD; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return XFS_PQUOTA_CHKD; default: return 0; @@ -86,7 +87,7 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint); -extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t, 
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t, prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **); extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, @@ -107,9 +108,11 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *); +void xfs_dquot_done(struct xfs_buf *); + #else static inline int -xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, +xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, prid_t prid, uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp, struct xfs_dquot **pdqp) { @@ -148,6 +151,12 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) + +static inline void xfs_dquot_done(struct xfs_buf *bp) +{ + return; +} + #endif /* CONFIG_XFS_QUOTA */ #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 38669e827206..d27c0e852c0b 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -21,10 +21,10 @@ xfs_qm_fill_state( struct qc_type_state *tstate, struct xfs_mount *mp, struct xfs_inode *ip, - xfs_ino_t ino) + xfs_ino_t ino, + struct xfs_def_quota *defq) { - struct xfs_quotainfo *q = mp->m_quotainfo; - bool tempqip = false; + bool tempqip = false; tstate->ino = ino; if (!ip && ino == NULLFSINO) @@ -36,13 +36,13 @@ xfs_qm_fill_state( } tstate->flags |= QCI_SYSFILE; tstate->blocks = ip->i_d.di_nblocks; - tstate->nextents = ip->i_d.di_nextents; - tstate->spc_timelimit = (u32)q->qi_btimelimit; - tstate->ino_timelimit = (u32)q->qi_itimelimit; - tstate->rt_spc_timelimit = (u32)q->qi_rtbtimelimit; - tstate->spc_warnlimit = q->qi_bwarnlimit; - tstate->ino_warnlimit = q->qi_iwarnlimit; - tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit; + tstate->nextents = ip->i_df.if_nextents; + tstate->spc_timelimit = (u32)defq->blk.time; + tstate->ino_timelimit = (u32)defq->ino.time; + tstate->rt_spc_timelimit = (u32)defq->rtb.time; + tstate->spc_warnlimit = defq->blk.warn; + tstate->ino_warnlimit = defq->ino.warn; + tstate->rt_spc_warnlimit = defq->rtb.warn; if (tempqip) xfs_irele(ip); } @@ -77,24 +77,24 @@ xfs_fs_get_quota_state( state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED; xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, q->qi_uquotaip, - mp->m_sb.sb_uquotino); + mp->m_sb.sb_uquotino, &q->qi_usr_default); xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, q->qi_gquotaip, - mp->m_sb.sb_gquotino); + mp->m_sb.sb_gquotino, &q->qi_grp_default); xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, q->qi_pquotaip, - mp->m_sb.sb_pquotino); + mp->m_sb.sb_pquotino, &q->qi_prj_default); return 0; } -STATIC int +STATIC xfs_dqtype_t xfs_quota_type(int type) { switch (type) { case USRQUOTA: - return XFS_DQ_USER; + return XFS_DQTYPE_USER; case GRPQUOTA: - return XFS_DQ_GROUP; + return XFS_DQTYPE_GROUP; default: - return XFS_DQ_PROJ; + return XFS_DQTYPE_PROJ; } } @@ -109,8 +109,8 @@ xfs_fs_set_info( int type, struct qc_info *info) { - struct xfs_mount *mp = XFS_M(sb); - struct qc_dqblk newlim; + struct xfs_mount *mp = XFS_M(sb); + struct qc_dqblk newlim; if (sb_rdonly(sb)) return -EROFS; @@ -205,11 +205,11 @@ xfs_fs_rm_xquota( return -EINVAL; if (uflags & FS_USER_QUOTA) - flags |= XFS_DQ_USER; + flags |= XFS_QMOPT_UQUOTA; if (uflags & FS_GROUP_QUOTA) - flags |= XFS_DQ_GROUP; + flags |= 
XFS_QMOPT_GQUOTA; if (uflags & FS_PROJ_QUOTA) - flags |= XFS_DQ_PROJ; + flags |= XFS_QMOPT_PQUOTA; return xfs_qm_scall_trunc_qfiles(mp, flags); } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 8eeed73928cd..ca93b6488377 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -18,16 +18,20 @@ #include "xfs_log.h" #include "xfs_refcount.h" #include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" kmem_zone_t *xfs_cui_zone; kmem_zone_t *xfs_cud_zone; +static const struct xfs_item_ops xfs_cui_item_ops; + static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_cui_log_item, cui_item); } -void +STATIC void xfs_cui_item_free( struct xfs_cui_log_item *cuip) { @@ -44,13 +48,13 @@ xfs_cui_item_free( * committed vs unpin operations in bulk insert operations. Hence the reference * count to ensure only the last caller frees the CUI. */ -void +STATIC void xfs_cui_release( struct xfs_cui_log_item *cuip) { ASSERT(atomic_read(&cuip->cui_refcount) > 0); if (atomic_dec_and_test(&cuip->cui_refcount)) { - xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); xfs_cui_item_free(cuip); } } @@ -123,17 +127,10 @@ xfs_cui_item_release( xfs_cui_release(CUI_ITEM(lip)); } -static const struct xfs_item_ops xfs_cui_item_ops = { - .iop_size = xfs_cui_item_size, - .iop_format = xfs_cui_item_format, - .iop_unpin = xfs_cui_item_unpin, - .iop_release = xfs_cui_item_release, -}; - /* * Allocate and initialize an cui item with the given number of extents. */ -struct xfs_cui_log_item * +STATIC struct xfs_cui_log_item * xfs_cui_init( struct xfs_mount *mp, uint nextents) @@ -146,7 +143,8 @@ xfs_cui_init( cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), 0); else - cuip = kmem_zone_zalloc(xfs_cui_zone, 0); + cuip = kmem_cache_zalloc(xfs_cui_zone, + GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); cuip->cui_format.cui_nextents = nextents; @@ -223,7 +221,7 @@ xfs_trans_get_cud( { struct xfs_cud_log_item *cudp; - cudp = kmem_zone_zalloc(xfs_cud_zone, 0); + cudp = kmem_cache_zalloc(xfs_cud_zone, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops); cudp->cud_cuip = cuip; @@ -284,27 +282,6 @@ xfs_refcount_update_diff_items( XFS_FSB_TO_AGNO(mp, rb->ri_startblock); } -/* Get an CUI. */ -STATIC void * -xfs_refcount_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_cui_log_item *cuip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - cuip = xfs_cui_init(tp->t_mountp, count); - ASSERT(cuip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &cuip->cui_item); - return cuip; -} - /* Set the phys extent flags for this reverse mapping. 
*/ static void xfs_trans_set_refcount_flags( @@ -328,16 +305,12 @@ xfs_trans_set_refcount_flags( STATIC void xfs_refcount_update_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_cui_log_item *cuip, + struct xfs_refcount_intent *refc) { - struct xfs_cui_log_item *cuip = intent; - struct xfs_refcount_intent *refc; uint next_extent; struct xfs_phys_extent *ext; - refc = container_of(item, struct xfs_refcount_intent, ri_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); @@ -354,23 +327,44 @@ xfs_refcount_update_log_item( xfs_trans_set_refcount_flags(ext, refc->ri_type); } +static struct xfs_log_item * +xfs_refcount_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); + struct xfs_refcount_intent *refc; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &cuip->cui_item); + if (sort) + list_sort(mp, items, xfs_refcount_update_diff_items); + list_for_each_entry(refc, items, ri_list) + xfs_refcount_update_log_item(tp, cuip, refc); + return &cuip->cui_item; +} + /* Get an CUD so we can process all the deferred refcount updates. */ -STATIC void * +static struct xfs_log_item * xfs_refcount_update_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_cud(tp, intent); + return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item; } /* Process a deferred refcount update. */ STATIC int xfs_refcount_update_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_refcount_intent *refc; xfs_fsblock_t new_fsb; @@ -378,12 +372,10 @@ xfs_refcount_update_finish_item( int error; refc = container_of(item, struct xfs_refcount_intent, ri_list); - error = xfs_trans_log_finish_refcount_update(tp, done_item, - refc->ri_type, - refc->ri_startblock, - refc->ri_blockcount, - &new_fsb, &new_aglen, - (struct xfs_btree_cur **)state); + error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), + refc->ri_type, refc->ri_startblock, refc->ri_blockcount, + &new_fsb, &new_aglen, state); + /* Did we run out of reservation? Requeue what we didn't finish. */ if (!error && new_aglen > 0) { ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || @@ -396,24 +388,12 @@ xfs_refcount_update_finish_item( return error; } -/* Clean up after processing deferred refcounts. */ -STATIC void -xfs_refcount_update_finish_cleanup( - struct xfs_trans *tp, - void *state, - int error) -{ - struct xfs_btree_cur *rcur = state; - - xfs_refcount_finish_one_cleanup(tp, rcur, error); -} - /* Abort all pending CUIs. */ STATIC void xfs_refcount_update_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_cui_release(intent); + xfs_cui_release(CUI_ITEM(intent)); } /* Cancel a deferred refcount update. 
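The defer-ops refactor above replaces the old void * intent cookies with struct xfs_log_item * throughout, folds sorting and per-item logging into create_intent(), and recovers the concrete CUI with CUI_ITEM(). That downcast is the classic container_of pattern; a self-contained userspace rendering (the kernel takes container_of() from its own headers):

    #include <stddef.h>

    struct log_item { int type; };

    struct cui_log_item {
        struct log_item item;       /* embedded generic log item */
        unsigned int    nextents;
    };

    /* userspace stand-in for the kernel's container_of() */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    /* type-safe recovery of the CUI from its embedded log item */
    static struct cui_log_item *CUI_ITEM(struct log_item *lip)
    {
        return container_of(lip, struct cui_log_item, item);
    }

Typed downcasts like this let the compiler catch a CUD passed where a CUI was expected, which the void * plumbing could not.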
*/ @@ -429,13 +409,11 @@ xfs_refcount_update_cancel_item( const struct xfs_defer_op_type xfs_refcount_update_defer_type = { .max_items = XFS_CUI_MAX_FAST_EXTENTS, - .diff_items = xfs_refcount_update_diff_items, .create_intent = xfs_refcount_update_create_intent, .abort_intent = xfs_refcount_update_abort_intent, - .log_item = xfs_refcount_update_log_item, .create_done = xfs_refcount_update_create_done, .finish_item = xfs_refcount_update_finish_item, - .finish_cleanup = xfs_refcount_update_finish_cleanup, + .finish_cleanup = xfs_refcount_finish_one_cleanup, .cancel_item = xfs_refcount_update_cancel_item, }; @@ -443,28 +421,27 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { * Process a refcount update intent item that was recovered from the log. * We need to update the refcountbt. */ -int -xfs_cui_recover( - struct xfs_trans *parent_tp, - struct xfs_cui_log_item *cuip) +STATIC int +xfs_cui_item_recover( + struct xfs_log_item *lip, + struct xfs_trans *parent_tp) { - int i; - int error = 0; - unsigned int refc_type; + struct xfs_bmbt_irec irec; + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); struct xfs_phys_extent *refc; - xfs_fsblock_t startblock_fsb; - bool op_ok; struct xfs_cud_log_item *cudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - enum xfs_refcount_intent_type type; + struct xfs_mount *mp = parent_tp->t_mountp; + xfs_fsblock_t startblock_fsb; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; - struct xfs_bmbt_irec irec; + unsigned int refc_type; + bool op_ok; bool requeue_only = false; - struct xfs_mount *mp = parent_tp->t_mountp; - - ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); + enum xfs_refcount_intent_type type; + int i; + int error = 0; /* * First check the validity of the extents described by the @@ -495,7 +472,6 @@ xfs_cui_recover( * This will pull the CUI from the AIL and * free the memory associated with it. */ - set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); xfs_cui_release(cuip); return -EFSCORRUPTED; } @@ -509,7 +485,7 @@ xfs_cui_recover( * transaction. Normally, any work that needs to be deferred * gets attached to the same defer_ops that scheduled the * refcount update. However, we're in log recovery here, so we - * we use the passed in defer_ops and to finish up any work that + * use the passed in defer_ops and to finish up any work that * doesn't fit. We need to reserve enough blocks to handle a * full btree split on either end of the refcount range. */ @@ -579,7 +555,6 @@ xfs_cui_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); xfs_defer_move(parent_tp, tp); error = xfs_trans_commit(tp); return error; @@ -590,3 +565,117 @@ abort_error: xfs_trans_cancel(tp); return error; } + +STATIC bool +xfs_cui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return CUI_ITEM(lip)->cui_format.cui_id == intent_id; +} + +static const struct xfs_item_ops xfs_cui_item_ops = { + .iop_size = xfs_cui_item_size, + .iop_format = xfs_cui_item_format, + .iop_unpin = xfs_cui_item_unpin, + .iop_release = xfs_cui_item_release, + .iop_recover = xfs_cui_item_recover, + .iop_match = xfs_cui_item_match, +}; + +/* + * Copy an CUI format buffer from the given buf, and into the destination + * CUI format structure. The CUI/CUD items were designed not to need any + * special alignment handling. 
+ */ +static int +xfs_cui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_cui_log_format *dst_cui_fmt) +{ + struct xfs_cui_log_format *src_cui_fmt; + uint len; + + src_cui_fmt = buf->i_addr; + len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); + + if (buf->i_len == len) { + memcpy(dst_cui_fmt, src_cui_fmt, len); + return 0; + } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent refcount update + * item from the cui format structure which was logged on disk. + * It allocates an in-core cui, copies the extents from the format + * structure into it, and adds the cui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_cui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_cui_log_item *cuip; + struct xfs_cui_log_format *cui_formatp; + + cui_formatp = item->ri_buf[0].i_addr; + + cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); + if (error) { + xfs_cui_item_free(cuip); + return error; + } + atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn); + xfs_cui_release(cuip); + return 0; +} + +const struct xlog_recover_item_ops xlog_cui_item_ops = { + .item_type = XFS_LI_CUI, + .commit_pass2 = xlog_recover_cui_commit_pass2, +}; + +/* + * This routine is called when an CUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding CUI if it + * was still in the log. To do this it searches the AIL for the CUI with an id + * equal to that in the CUD format structure. If we find it we drop the CUD + * reference, which removes the CUI from the AIL and frees it. + */ +STATIC int +xlog_recover_cud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_cud_log_format *cud_formatp; + + cud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_CUI, cud_formatp->cud_cui_id); + return 0; +} + +const struct xlog_recover_item_ops xlog_cud_item_ops = { + .item_type = XFS_LI_CUD, + .commit_pass2 = xlog_recover_cud_commit_pass2, +}; diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index e47530f30489..f4f2e836540b 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -33,11 +33,6 @@ struct kmem_zone; #define XFS_CUI_MAX_FAST_EXTENTS 16 /* - * Define CUI flag bits. Manipulated by set/clear/test_bit operators. - */ -#define XFS_CUI_RECOVERED 1 - -/* * This is the "refcount update intent" log item. It is used to log * the fact that some reverse mappings need to change. 
It is used in * conjunction with the "refcount update done" log item described @@ -51,7 +46,6 @@ struct xfs_cui_log_item { struct xfs_log_item cui_item; atomic_t cui_refcount; atomic_t cui_next_extent; - unsigned long cui_flags; /* misc flags */ struct xfs_cui_log_format cui_format; }; @@ -77,9 +71,4 @@ struct xfs_cud_log_item { extern struct kmem_zone *xfs_cui_zone; extern struct kmem_zone *xfs_cud_zone; -struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint); -void xfs_cui_item_free(struct xfs_cui_log_item *); -void xfs_cui_release(struct xfs_cui_log_item *); -int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip); - #endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index b0ce04ffd3cd..16098dc42add 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -179,7 +179,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) { + if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) { *shared = false; return 0; } @@ -655,7 +655,7 @@ xfs_reflink_end_cow_extent( * preallocations can leak into the range we are called upon, and we * need to skip them. */ - if (!xfs_bmap_is_real_extent(&got)) { + if (!xfs_bmap_is_written_extent(&got)) { *end_fsb = del.br_startoff; goto out_cancel; } @@ -721,7 +721,7 @@ xfs_reflink_end_cow( * repeatedly cycles the ILOCK to allocate one transaction per remapped * extent. * - * If we're being called by writeback then the the pages will still + * If we're being called by writeback then the pages will still * have PageWriteback set, which prevents races with reflink remapping * and truncate. Reflink remapping prevents races with writeback by * taking the iolock and mmaplock before flushing the pages and @@ -984,40 +984,28 @@ xfs_reflink_ag_has_free_space( } /* - * Unmap a range of blocks from a file, then map other blocks into the hole. - * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). - * The extent irec is mapped into dest at irec->br_startoff. + * Remap the given extent into the file. The dmap blockcount will be set to + * the number of blocks that were actually remapped. 
*/ STATIC int xfs_reflink_remap_extent( struct xfs_inode *ip, - struct xfs_bmbt_irec *irec, - xfs_fileoff_t destoff, + struct xfs_bmbt_irec *dmap, xfs_off_t new_isize) { + struct xfs_bmbt_irec smap; struct xfs_mount *mp = ip->i_mount; - bool real_extent = xfs_bmap_is_real_extent(irec); struct xfs_trans *tp; - unsigned int resblks; - struct xfs_bmbt_irec uirec; - xfs_filblks_t rlen; - xfs_filblks_t unmap_len; xfs_off_t newlen; + int64_t qres, qdelta; + unsigned int resblks; + bool smap_real; + bool dmap_written = xfs_bmap_is_written_extent(dmap); + int nimaps; int error; - unmap_len = irec->br_startoff + irec->br_blockcount - destoff; - trace_xfs_reflink_punch_range(ip, destoff, unmap_len); - - /* No reflinking if we're low on space */ - if (real_extent) { - error = xfs_reflink_ag_has_free_space(mp, - XFS_FSB_TO_AGNO(mp, irec->br_startblock)); - if (error) - goto out; - } - /* Start a rolling transaction to switch the mappings */ - resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) goto out; @@ -1025,86 +1013,147 @@ xfs_reflink_remap_extent( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - /* If we're not just clearing space, then do we have enough quota? */ - if (real_extent) { - error = xfs_trans_reserve_quota_nblks(tp, ip, - irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); + /* + * Read what's currently mapped in the destination file into smap. + * If smap isn't a hole, we will have to remove it before we can add + * dmap to the destination file. + */ + nimaps = 1; + error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount, + &smap, &nimaps, 0); + if (error) + goto out_cancel; + ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff); + smap_real = xfs_bmap_is_real_extent(&smap); + + /* + * We can only remap as many blocks as the smaller of the two extent + * maps, because we can only remap one extent at a time. + */ + dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount); + ASSERT(dmap->br_blockcount == smap.br_blockcount); + + trace_xfs_reflink_remap_extent_dest(ip, &smap); + + /* + * Two extents mapped to the same physical block must not have + * different states; that's filesystem corruption. Move on to the next + * extent if they're both holes or both the same physical extent. + */ + if (dmap->br_startblock == smap.br_startblock) { + if (dmap->br_state != smap.br_state) + error = -EFSCORRUPTED; + goto out_cancel; + } + + /* If both extents are unwritten, leave them alone. */ + if (dmap->br_state == XFS_EXT_UNWRITTEN && + smap.br_state == XFS_EXT_UNWRITTEN) + goto out_cancel; + + /* No reflinking if the AG of the dest mapping is low on space. */ + if (dmap_written) { + error = xfs_reflink_ag_has_free_space(mp, + XFS_FSB_TO_AGNO(mp, dmap->br_startblock)); if (error) goto out_cancel; } - trace_xfs_reflink_remap(ip, irec->br_startoff, - irec->br_blockcount, irec->br_startblock); - - /* Unmap the old blocks in the data fork. */ - rlen = unmap_len; - while (rlen) { - ASSERT(tp->t_firstblock == NULLFSBLOCK); - error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1); + /* + * Compute quota reservation if we think the quota block counter for + * this file could increase. + * + * Adding a written extent to the extent map can cause a bmbt split, + * and removing a mapped extent from the extent can cause a bmbt split. 
+ * The two operations cannot both cause a split since they operate on + * the same index in the bmap btree, so we only need a reservation for + * one bmbt split if either thing is happening. + * + * If we are mapping a written extent into the file, we need to have + * enough quota block count reservation to handle the blocks in that + * extent. We log only the delta to the quota block counts, so if the + * extent we're unmapping also has blocks allocated to it, we don't + * need a quota reservation for the extent itself. + * + * Note that if we're replacing a delalloc reservation with a written + * extent, we have to take the full quota reservation because removing + * the delalloc reservation gives the block count back to the quota + * count. This is suboptimal, but the VFS flushed the dest range + * before we started. That should have removed all the delalloc + * reservations, but we code defensively. + */ + qres = qdelta = 0; + if (smap_real || dmap_written) + qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + if (!smap_real && dmap_written) + qres += dmap->br_blockcount; + if (qres > 0) { + error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, + XFS_QMOPT_RES_REGBLKS); if (error) goto out_cancel; + } + if (smap_real) { /* - * Trim the extent to whatever got unmapped. - * Remember, bunmapi works backwards. + * If the extent we're unmapping is backed by storage (written + * or not), unmap the extent and drop its refcount. */ - uirec.br_startblock = irec->br_startblock + rlen; - uirec.br_startoff = irec->br_startoff + rlen; - uirec.br_blockcount = unmap_len - rlen; - unmap_len = rlen; - - /* If this isn't a real mapping, we're done. */ - if (!real_extent || uirec.br_blockcount == 0) - goto next_extent; + xfs_bmap_unmap_extent(tp, ip, &smap); + xfs_refcount_decrease_extent(tp, &smap); + qdelta -= smap.br_blockcount; + } else if (smap.br_startblock == DELAYSTARTBLOCK) { + xfs_filblks_t len = smap.br_blockcount; - trace_xfs_reflink_remap(ip, uirec.br_startoff, - uirec.br_blockcount, uirec.br_startblock); - - /* Update the refcount tree */ - xfs_refcount_increase_extent(tp, &uirec); - - /* Map the new blocks into the data fork. */ - xfs_bmap_map_extent(tp, ip, &uirec); + /* + * If the extent we're unmapping is a delalloc reservation, + * we can use the regular bunmapi function to release the + * incore state. Dropping the delalloc reservation takes care + * of the quota reservation for us. + */ + error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1); + if (error) + goto out_cancel; + ASSERT(len == 0); + } - /* Update quota accounting. */ - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, - uirec.br_blockcount); + /* + * If the extent we're sharing is backed by written storage, increase + * its refcount and map it into the file. + */ + if (dmap_written) { + xfs_refcount_increase_extent(tp, dmap); + xfs_bmap_map_extent(tp, ip, dmap); + qdelta += dmap->br_blockcount; + } - /* Update dest isize if needed. */ - newlen = XFS_FSB_TO_B(mp, - uirec.br_startoff + uirec.br_blockcount); - newlen = min_t(xfs_off_t, newlen, new_isize); - if (newlen > i_size_read(VFS_I(ip))) { - trace_xfs_reflink_update_inode_size(ip, newlen); - i_size_write(VFS_I(ip), newlen); - ip->i_d.di_size = newlen; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - } + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta); -next_extent: - /* Process all the deferred stuff. */ - error = xfs_defer_finish(&tp); - if (error) - goto out_cancel; + /* Update dest isize if needed. 
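A condensed restatement of the reservation arithmetic spelled out in the comment above may help; this standalone helper is illustrative only (the patch computes qres inline in xfs_reflink_remap_extent), but the names and the XFS_EXTENTADD_SPACE_RES macro are exactly the ones the hunk uses:

	static int64_t
	remap_quota_res_sketch(
		struct xfs_mount	*mp,
		bool			smap_real,	/* dest range already mapped? */
		bool			dmap_written,	/* source extent written? */
		xfs_filblks_t		blockcount)	/* blocks being remapped */
	{
		int64_t			qres = 0;

		/* One bmbt split covers either the unmap or the map, never both. */
		if (smap_real || dmap_written)
			qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);

		/* Only a written extent landing in a hole adds net quota blocks. */
		if (!smap_real && dmap_written)
			qres += blockcount;

		return qres;
	}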
*/ + newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount); + newlen = min_t(xfs_off_t, newlen, new_isize); + if (newlen > i_size_read(VFS_I(ip))) { + trace_xfs_reflink_update_inode_size(ip, newlen); + i_size_write(VFS_I(ip), newlen); + ip->i_d.di_size = newlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } + /* Commit everything and unlock. */ error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - goto out; - return 0; + goto out_unlock; out_cancel: xfs_trans_cancel(tp); +out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); out: - trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); + if (error) + trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); return error; } -/* - * Iteratively remap one file's extents (and holes) to another's. - */ +/* Remap a range of one file to the other. */ int xfs_reflink_remap_blocks( struct xfs_inode *src, @@ -1115,25 +1164,22 @@ xfs_reflink_remap_blocks( loff_t *remapped) { struct xfs_bmbt_irec imap; - xfs_fileoff_t srcoff; - xfs_fileoff_t destoff; + struct xfs_mount *mp = src->i_mount; + xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in); + xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out); xfs_filblks_t len; - xfs_filblks_t range_len; xfs_filblks_t remapped_len = 0; xfs_off_t new_isize = pos_out + remap_len; int nimaps; int error = 0; - destoff = XFS_B_TO_FSBT(src->i_mount, pos_out); - srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in); - len = XFS_B_TO_FSB(src->i_mount, remap_len); + len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len), + XFS_MAX_FILEOFF); - /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ - while (len) { - uint lock_mode; + trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff); - trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, - dest, destoff); + while (len > 0) { + unsigned int lock_mode; /* Read extent from the source file */ nimaps = 1; @@ -1142,18 +1188,25 @@ xfs_reflink_remap_blocks( xfs_iunlock(src, lock_mode); if (error) break; - ASSERT(nimaps == 1); - - trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK, - &imap); + /* + * The caller supposedly flushed all dirty pages in the source + * file range, which means that writeback should have allocated + * or deleted all delalloc reservations in that range. If we + * find one, that's a good sign that something is seriously + * wrong here. + */ + ASSERT(nimaps == 1 && imap.br_startoff == srcoff); + if (imap.br_startblock == DELAYSTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + error = -EFSCORRUPTED; + break; + } - /* Translate imap into the destination file. */ - range_len = imap.br_startoff + imap.br_blockcount - srcoff; - imap.br_startoff += destoff - srcoff; + trace_xfs_reflink_remap_extent_src(src, &imap); - /* Clear dest from destoff to the end of imap and map it in. */ - error = xfs_reflink_remap_extent(dest, &imap, destoff, - new_isize); + /* Remap into the destination file at the given offset. 
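Since the caller's loop is scattered across several hunks here, a skeleton of the reworked iteration may be easier to follow in one place. This is illustrative, not the verbatim function, but every call and variable in it appears in the diff:

	while (len > 0) {
		/* ... xfs_bmapi_read() fills imap with one source mapping ... */
		imap.br_startoff = destoff;	/* retarget at the destination */
		error = xfs_reflink_remap_extent(dest, &imap, new_isize);
		if (error)
			break;
		/* remap_extent trims imap to the shorter of src/dest mappings */
		srcoff += imap.br_blockcount;
		destoff += imap.br_blockcount;
		len -= imap.br_blockcount;
		remapped_len += imap.br_blockcount;
	}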
*/ + imap.br_startoff = destoff; + error = xfs_reflink_remap_extent(dest, &imap, new_isize); if (error) break; @@ -1163,10 +1216,10 @@ xfs_reflink_remap_blocks( } /* Advance drange/srange */ - srcoff += range_len; - destoff += range_len; - len -= range_len; - remapped_len += range_len; + srcoff += imap.br_blockcount; + destoff += imap.br_blockcount; + len -= imap.br_blockcount; + remapped_len += imap.br_blockcount; } if (error) @@ -1177,81 +1230,6 @@ xfs_reflink_remap_blocks( } /* - * Grab the exclusive iolock for a data copy from src to dest, making sure to - * abide vfs locking order (lowest pointer value goes first) and breaking the - * layout leases before proceeding. The loop is needed because we cannot call - * the blocking break_layout() with the iolocks held, and therefore have to - * back out both locks. - */ -static int -xfs_iolock_two_inodes_and_break_layout( - struct inode *src, - struct inode *dest) -{ - int error; - - if (src > dest) - swap(src, dest); - -retry: - /* Wait to break both inodes' layouts before we start locking. */ - error = break_layout(src, true); - if (error) - return error; - if (src != dest) { - error = break_layout(dest, true); - if (error) - return error; - } - - /* Lock one inode and make sure nobody got in and leased it. */ - inode_lock(src); - error = break_layout(src, false); - if (error) { - inode_unlock(src); - if (error == -EWOULDBLOCK) - goto retry; - return error; - } - - if (src == dest) - return 0; - - /* Lock the other inode and make sure nobody got in and leased it. */ - inode_lock_nested(dest, I_MUTEX_NONDIR2); - error = break_layout(dest, false); - if (error) { - inode_unlock(src); - inode_unlock(dest); - if (error == -EWOULDBLOCK) - goto retry; - return error; - } - - return 0; -} - -/* Unlock both inodes after they've been prepped for a range clone. */ -void -xfs_reflink_remap_unlock( - struct file *file_in, - struct file *file_out) -{ - struct inode *inode_in = file_inode(file_in); - struct xfs_inode *src = XFS_I(inode_in); - struct inode *inode_out = file_inode(file_out); - struct xfs_inode *dest = XFS_I(inode_out); - bool same_inode = (inode_in == inode_out); - - xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - if (!same_inode) - xfs_iunlock(src, XFS_MMAPLOCK_EXCL); - inode_unlock(inode_out); - if (!same_inode) - inode_unlock(inode_in); -} - -/* * If we're reflinking to a point past the destination file's EOF, we must * zero any speculative post-EOF preallocations that sit between the old EOF * and the destination file offset. @@ -1313,18 +1291,12 @@ xfs_reflink_remap_prep( struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); - bool same_inode = (inode_in == inode_out); - ssize_t ret; + int ret; /* Lock both files against IO */ - ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out); + ret = xfs_ilock2_io_mmap(src, dest); if (ret) return ret; - if (same_inode) - xfs_ilock(src, XFS_MMAPLOCK_EXCL); - else - xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest, - XFS_MMAPLOCK_EXCL); /* Check file eligibility and prepare for block sharing. 
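The helper deleted just below encodes a locking discipline worth keeping in view even after it moves into the shared xfs_ilock2_io_mmap(): order the two inodes by address, break layout leases with no locks held, then re-check under each lock and retry on -EWOULDBLOCK. A condensed form, assuming the same VFS primitives the removed code used:

	static int
	iolock_two_inodes_sketch(struct inode *src, struct inode *dest)
	{
		int	error;

		if (src > dest)
			swap(src, dest);	/* lowest address locks first */
	retry:
		/* Break both layouts while no locks are held; this may block. */
		error = break_layout(src, true);
		if (!error && src != dest)
			error = break_layout(dest, true);
		if (error)
			return error;

		/* Lock each inode and re-check that no lease sneaked in. */
		inode_lock(src);
		error = break_layout(src, false);
		if (error) {
			inode_unlock(src);
			if (error == -EWOULDBLOCK)
				goto retry;
			return error;
		}
		if (src == dest)
			return 0;

		inode_lock_nested(dest, I_MUTEX_NONDIR2);
		error = break_layout(dest, false);
		if (error) {
			inode_unlock(src);
			inode_unlock(dest);
			if (error == -EWOULDBLOCK)
				goto retry;
		}
		return error;
	}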
*/ ret = -EINVAL; @@ -1338,7 +1310,7 @@ xfs_reflink_remap_prep( ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags); - if (ret < 0 || *len == 0) + if (ret || *len == 0) goto out_unlock; /* Attach dquots to dest inode before changing block map */ @@ -1373,9 +1345,9 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; - return 1; + return 0; out_unlock: - xfs_reflink_remap_unlock(file_in, file_out); + xfs_iunlock2_io_mmap(src, dest); return ret; } diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 3e4fd46373ab..487b00434b96 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -56,7 +56,5 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, loff_t *remapped); extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, unsigned int remap_flags); -extern void xfs_reflink_remap_unlock(struct file *file_in, - struct file *file_out); #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 4911b68f95dd..dc5b0753cd51 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -18,16 +18,20 @@ #include "xfs_log.h" #include "xfs_rmap.h" #include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" kmem_zone_t *xfs_rui_zone; kmem_zone_t *xfs_rud_zone; +static const struct xfs_item_ops xfs_rui_item_ops; + static inline struct xfs_rui_log_item *RUI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_rui_log_item, rui_item); } -void +STATIC void xfs_rui_item_free( struct xfs_rui_log_item *ruip) { @@ -44,13 +48,13 @@ xfs_rui_item_free( * committed vs unpin operations in bulk insert operations. Hence the reference * count to ensure only the last caller frees the RUI. */ -void +STATIC void xfs_rui_release( struct xfs_rui_log_item *ruip) { ASSERT(atomic_read(&ruip->rui_refcount) > 0); if (atomic_dec_and_test(&ruip->rui_refcount)) { - xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); xfs_rui_item_free(ruip); } } @@ -122,17 +126,10 @@ xfs_rui_item_release( xfs_rui_release(RUI_ITEM(lip)); } -static const struct xfs_item_ops xfs_rui_item_ops = { - .iop_size = xfs_rui_item_size, - .iop_format = xfs_rui_item_format, - .iop_unpin = xfs_rui_item_unpin, - .iop_release = xfs_rui_item_release, -}; - /* * Allocate and initialize an rui item with the given number of extents. */ -struct xfs_rui_log_item * +STATIC struct xfs_rui_log_item * xfs_rui_init( struct xfs_mount *mp, uint nextents) @@ -144,7 +141,8 @@ xfs_rui_init( if (nextents > XFS_RUI_MAX_FAST_EXTENTS) ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); else - ruip = kmem_zone_zalloc(xfs_rui_zone, 0); + ruip = kmem_cache_zalloc(xfs_rui_zone, + GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); ruip->rui_format.rui_nextents = nextents; @@ -160,7 +158,7 @@ xfs_rui_init( * RUI format structure. The RUI/RUD items were designed not to need any * special alignment handling. 
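A note on the allocation conversions that recur through the rest of this diff: the removed kmem_zone_zalloc(zone, 0) wrapper translated a zero flags argument into a GFP_KERNEL allocation that retried internally until it succeeded, so the mechanical replacement simply makes the never-fail semantics explicit at the call site:

	/* before: XFS wrapper, loops internally on allocation failure */
	ruip = kmem_zone_zalloc(xfs_rui_zone, 0);

	/* after: plain slab API, the same guarantee spelled out */
	ruip = kmem_cache_zalloc(xfs_rui_zone, GFP_KERNEL | __GFP_NOFAIL);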
*/ -int +STATIC int xfs_rui_copy_format( struct xfs_log_iovec *buf, struct xfs_rui_log_format *dst_rui_fmt) @@ -246,7 +244,7 @@ xfs_trans_get_rud( { struct xfs_rud_log_item *rudp; - rudp = kmem_zone_zalloc(xfs_rud_zone, 0); + rudp = kmem_cache_zalloc(xfs_rud_zone, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops); rudp->rud_ruip = ruip; @@ -352,41 +350,16 @@ xfs_rmap_update_diff_items( XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock); } -/* Get an RUI. */ -STATIC void * -xfs_rmap_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_rui_log_item *ruip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - ruip = xfs_rui_init(tp->t_mountp, count); - ASSERT(ruip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &ruip->rui_item); - return ruip; -} - /* Log rmap updates in the intent item. */ STATIC void xfs_rmap_update_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_rui_log_item *ruip, + struct xfs_rmap_intent *rmap) { - struct xfs_rui_log_item *ruip = intent; - struct xfs_rmap_intent *rmap; uint next_extent; struct xfs_map_extent *map; - rmap = container_of(item, struct xfs_rmap_intent, ri_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); @@ -406,58 +379,64 @@ xfs_rmap_update_log_item( rmap->ri_bmap.br_state); } +static struct xfs_log_item * +xfs_rmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); + struct xfs_rmap_intent *rmap; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &ruip->rui_item); + if (sort) + list_sort(mp, items, xfs_rmap_update_diff_items); + list_for_each_entry(rmap, items, ri_list) + xfs_rmap_update_log_item(tp, ruip, rmap); + return &ruip->rui_item; +} + /* Get an RUD so we can process all the deferred rmap updates. */ -STATIC void * +static struct xfs_log_item * xfs_rmap_update_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_rud(tp, intent); + return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item; } /* Process a deferred rmap update. */ STATIC int xfs_rmap_update_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_rmap_intent *rmap; int error; rmap = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_trans_log_finish_rmap_update(tp, done_item, - rmap->ri_type, - rmap->ri_owner, rmap->ri_whichfork, - rmap->ri_bmap.br_startoff, - rmap->ri_bmap.br_startblock, - rmap->ri_bmap.br_blockcount, - rmap->ri_bmap.br_state, - (struct xfs_btree_cur **)state); + error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), + rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork, + rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock, + rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state, + state); kmem_free(rmap); return error; } -/* Clean up after processing deferred rmaps. */ -STATIC void -xfs_rmap_update_finish_cleanup( - struct xfs_trans *tp, - void *state, - int error) -{ - struct xfs_btree_cur *rcur = state; - - xfs_rmap_finish_one_cleanup(tp, rcur, error); -} - /* Abort all pending RUIs. 
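The signature changes in this hunk are the interesting part of the deferred-ops rework: the hooks stop trafficking in void pointers, and sorting plus per-item logging fold into ->create_intent, which is why .diff_items and .log_item disappear from the op table further down. Before and after, abbreviated to the declarations:

	/* before: untyped handle; the core sorted and logged items one by one */
	STATIC void *
	xfs_rmap_update_create_intent(struct xfs_trans *tp, unsigned int count);

	/* after: typed log item; the hook sorts and logs the whole list itself */
	static struct xfs_log_item *
	xfs_rmap_update_create_intent(struct xfs_trans *tp, struct list_head *items,
			unsigned int count, bool sort);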
*/ STATIC void xfs_rmap_update_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_rui_release(intent); + xfs_rui_release(RUI_ITEM(intent)); } /* Cancel a deferred rmap update. */ @@ -473,13 +452,11 @@ xfs_rmap_update_cancel_item( const struct xfs_defer_op_type xfs_rmap_update_defer_type = { .max_items = XFS_RUI_MAX_FAST_EXTENTS, - .diff_items = xfs_rmap_update_diff_items, .create_intent = xfs_rmap_update_create_intent, .abort_intent = xfs_rmap_update_abort_intent, - .log_item = xfs_rmap_update_log_item, .create_done = xfs_rmap_update_create_done, .finish_item = xfs_rmap_update_finish_item, - .finish_cleanup = xfs_rmap_update_finish_cleanup, + .finish_cleanup = xfs_rmap_finish_one_cleanup, .cancel_item = xfs_rmap_update_cancel_item, }; @@ -487,24 +464,24 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { * Process an rmap update intent item that was recovered from the log. * We need to update the rmapbt. */ -int -xfs_rui_recover( - struct xfs_mount *mp, - struct xfs_rui_log_item *ruip) +STATIC int +xfs_rui_item_recover( + struct xfs_log_item *lip, + struct xfs_trans *parent_tp) { - int i; - int error = 0; + struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_map_extent *rmap; - xfs_fsblock_t startblock_fsb; - bool op_ok; struct xfs_rud_log_item *rudp; - enum xfs_rmap_intent_type type; - int whichfork; - xfs_exntst_t state; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - - ASSERT(!test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags)); + struct xfs_mount *mp = parent_tp->t_mountp; + xfs_fsblock_t startblock_fsb; + enum xfs_rmap_intent_type type; + xfs_exntst_t state; + bool op_ok; + int i; + int whichfork; + int error = 0; /* * First check the validity of the extents described by the @@ -539,7 +516,6 @@ xfs_rui_recover( * This will pull the RUI from the AIL and * free the memory associated with it. */ - set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); xfs_rui_release(ruip); return -EFSCORRUPTED; } @@ -597,7 +573,6 @@ xfs_rui_recover( } xfs_rmap_finish_one_cleanup(tp, rcur, error); - set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); error = xfs_trans_commit(tp); return error; @@ -606,3 +581,90 @@ abort_error: xfs_trans_cancel(tp); return error; } + +STATIC bool +xfs_rui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return RUI_ITEM(lip)->rui_format.rui_id == intent_id; +} + +static const struct xfs_item_ops xfs_rui_item_ops = { + .iop_size = xfs_rui_item_size, + .iop_format = xfs_rui_item_format, + .iop_unpin = xfs_rui_item_unpin, + .iop_release = xfs_rui_item_release, + .iop_recover = xfs_rui_item_recover, + .iop_match = xfs_rui_item_match, +}; + +/* + * This routine is called to create an in-core extent rmap update + * item from the rui format structure which was logged on disk. + * It allocates an in-core rui, copies the extents from the format + * structure into it, and adds the rui to the AIL with the given + * LSN. 
+ */ +STATIC int +xlog_recover_rui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_rui_log_item *ruip; + struct xfs_rui_log_format *rui_formatp; + + rui_formatp = item->ri_buf[0].i_addr; + + ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); + error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format); + if (error) { + xfs_rui_item_free(ruip); + return error; + } + atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn); + xfs_rui_release(ruip); + return 0; +} + +const struct xlog_recover_item_ops xlog_rui_item_ops = { + .item_type = XFS_LI_RUI, + .commit_pass2 = xlog_recover_rui_commit_pass2, +}; + +/* + * This routine is called when an RUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding RUI if it + * was still in the log. To do this it searches the AIL for the RUI with an id + * equal to that in the RUD format structure. If we find it we drop the RUD + * reference, which removes the RUI from the AIL and frees it. + */ +STATIC int +xlog_recover_rud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_rud_log_format *rud_formatp; + + rud_formatp = item->ri_buf[0].i_addr; + ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format)); + + xlog_recover_release_intent(log, XFS_LI_RUI, rud_formatp->rud_rui_id); + return 0; +} + +const struct xlog_recover_item_ops xlog_rud_item_ops = { + .item_type = XFS_LI_RUD, + .commit_pass2 = xlog_recover_rud_commit_pass2, +}; diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 8708e4a5aa5c..31e6cdfff71f 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -36,11 +36,6 @@ struct kmem_zone; #define XFS_RUI_MAX_FAST_EXTENTS 16 /* - * Define RUI flag bits. Manipulated by set/clear/test_bit operators. - */ -#define XFS_RUI_RECOVERED 1 - -/* * This is the "rmap update intent" log item. It is used to log the fact that * some reverse mappings need to change. It is used in conjunction with the * "rmap update done" log item described below. 
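Taken together, the two pass2 handlers above define the recovery lifecycle for an intent/done pair; a commentary-only summary (not code from the patch):

	/*
	 * RUI found in the log:
	 *     xlog_recover_rui_commit_pass2()
	 *         xfs_trans_ail_insert(ailp, &ruip->rui_item, lsn);
	 *         xfs_rui_release(ruip);   // the AIL now owns the last ref
	 *
	 * matching RUD found later in the log:
	 *     xlog_recover_rud_commit_pass2()
	 *         xlog_recover_release_intent(log, XFS_LI_RUI,
	 *                                     rud_formatp->rud_rui_id);
	 *         // ->iop_match() locates the RUI in the AIL; dropping the
	 *         // ref removes and frees it -- the work already completed
	 *
	 * no RUD found by end of log:
	 *     ->iop_recover() (xfs_rui_item_recover) replays the rmap
	 *     updates in a fresh transaction once pass2 finishes.
	 */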
@@ -52,7 +47,6 @@ struct xfs_rui_log_item { struct xfs_log_item rui_item; atomic_t rui_refcount; atomic_t rui_next_extent; - unsigned long rui_flags; /* misc flags */ struct xfs_rui_log_format rui_format; }; @@ -77,11 +71,4 @@ struct xfs_rud_log_item { extern struct kmem_zone *xfs_rui_zone; extern struct kmem_zone *xfs_rud_zone; -struct xfs_rui_log_item *xfs_rui_init(struct xfs_mount *, uint); -int xfs_rui_copy_format(struct xfs_log_iovec *buf, - struct xfs_rui_log_format *dst_rui_fmt); -void xfs_rui_item_free(struct xfs_rui_log_item *); -void xfs_rui_release(struct xfs_rui_log_item *); -int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip); - #endif /* __XFS_RMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 113883c4f202..f70f1255220b 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -57,13 +57,13 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) /* Loop over all stats groups */ for (i = j = 0; i < ARRAY_SIZE(xstats); i++) { - len += snprintf(buf + len, PATH_MAX - len, "%s", + len += scnprintf(buf + len, PATH_MAX - len, "%s", xstats[i].desc); /* inner loop does each group */ for (; j < xstats[i].endpoint; j++) - len += snprintf(buf + len, PATH_MAX - len, " %u", + len += scnprintf(buf + len, PATH_MAX - len, " %u", counter_val(stats, j)); - len += snprintf(buf + len, PATH_MAX - len, "\n"); + len += scnprintf(buf + len, PATH_MAX - len, "\n"); } /* extra precision counters */ for_each_possible_cpu(i) { @@ -72,9 +72,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; } - len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", + len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); - len += snprintf(buf + len, PATH_MAX-len, "debug %u\n", + len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n", #if defined(DEBUG) 1); #else diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2094386af8ac..71ac6c1cdc36 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -47,6 +47,39 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif +enum xfs_dax_mode { + XFS_DAX_INODE = 0, + XFS_DAX_ALWAYS = 1, + XFS_DAX_NEVER = 2, +}; + +static void +xfs_mount_set_dax_mode( + struct xfs_mount *mp, + enum xfs_dax_mode mode) +{ + switch (mode) { + case XFS_DAX_INODE: + mp->m_flags &= ~(XFS_MOUNT_DAX_ALWAYS | XFS_MOUNT_DAX_NEVER); + break; + case XFS_DAX_ALWAYS: + mp->m_flags |= XFS_MOUNT_DAX_ALWAYS; + mp->m_flags &= ~XFS_MOUNT_DAX_NEVER; + break; + case XFS_DAX_NEVER: + mp->m_flags |= XFS_MOUNT_DAX_NEVER; + mp->m_flags &= ~XFS_MOUNT_DAX_ALWAYS; + break; + } +} + +static const struct constant_table dax_param_enums[] = { + {"inode", XFS_DAX_INODE }, + {"always", XFS_DAX_ALWAYS }, + {"never", XFS_DAX_NEVER }, + {} +}; + /* * Table driven mount option parser. 
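One general note before the mount-option table: the snprintf() to scnprintf() swap in xfs_stats_format() above is a correctness fix, not style. A worked illustration of the failure mode, using the buffer and sizes from that hunk:

	/*
	 * snprintf() returns the length that WOULD have been written, so on
	 * truncation an accumulation like this pushes len past PATH_MAX and
	 * the next size argument, PATH_MAX - len, goes negative -- and since
	 * that parameter is a size_t, it wraps to a huge value.  scnprintf()
	 * returns the bytes actually stored, keeping len bounded.
	 */
	len += scnprintf(buf + len, PATH_MAX - len, "%s", xstats[i].desc);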
*/ @@ -59,7 +92,7 @@ enum { Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -103,6 +136,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("discard", Opt_discard), fsparam_flag("nodiscard", Opt_nodiscard), fsparam_flag("dax", Opt_dax), + fsparam_enum("dax", Opt_dax_enum, dax_param_enums), {} }; @@ -129,7 +163,8 @@ xfs_fs_show_options( { XFS_MOUNT_GRPID, ",grpid" }, { XFS_MOUNT_DISCARD, ",discard" }, { XFS_MOUNT_LARGEIO, ",largeio" }, - { XFS_MOUNT_DAX, ",dax" }, + { XFS_MOUNT_DAX_ALWAYS, ",dax=always" }, + { XFS_MOUNT_DAX_NEVER, ",dax=never" }, { 0, NULL } }; struct xfs_mount *mp = XFS_M(root->d_sb); @@ -305,7 +340,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS); } STATIC void @@ -516,6 +551,20 @@ xfs_destroy_mount_workqueues( destroy_workqueue(mp->m_buf_workqueue); } +static void +xfs_flush_inodes_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(work, struct xfs_mount, + m_flush_inodes_work); + struct super_block *sb = mp->m_super; + + if (down_read_trylock(&sb->s_umount)) { + sync_inodes_sb(sb); + up_read(&sb->s_umount); + } +} + /* * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting @@ -526,12 +575,15 @@ void xfs_flush_inodes( struct xfs_mount *mp) { - struct super_block *sb = mp->m_super; + /* + * If flush_work() returns true then that means we waited for a flush + * which was already in progress. Don't bother running another scan. + */ + if (flush_work(&mp->m_flush_inodes_work)) + return; - if (down_read_trylock(&sb->s_umount)) { - sync_inodes_sb(sb); - up_read(&sb->s_umount); - } + queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work); + flush_work(&mp->m_flush_inodes_work); } /* Catch misguided souls that try to use this interface on XFS */ @@ -685,7 +737,7 @@ xfs_fs_drop_inode( return 0; } - return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); + return generic_drop_inode(inode); } static void @@ -755,7 +807,8 @@ xfs_fs_statfs( statp->f_blocks = sbp->sb_dblocks - lsize; spin_unlock(&mp->m_sb_lock); - statp->f_bfree = fdblocks - mp->m_alloc_set_aside; + /* make sure statp->f_bfree does not underflow */ + statp->f_bfree = max_t(int64_t, fdblocks - mp->m_alloc_set_aside, 0); statp->f_bavail = statp->f_bfree; fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); @@ -821,8 +874,10 @@ xfs_restore_resvblks(struct xfs_mount *mp) * there is no log replay required to write the inodes to disk - this is the * primary difference between a sync and a quiesce. * - * Note: xfs_log_quiesce() stops background log work - the callers must ensure - * it is started again when appropriate. + * We cancel log work early here to ensure all transactions the log worker may + * run have finished before we clean up and log the superblock and write an + * unmount record. The unfreeze process is responsible for restarting the log + * worker correctly. 
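Stepping back to the m_flush_inodes_work machinery added earlier in this hunk: the new xfs_flush_inodes() (reproduced from the diff, with annotation added) turns a direct sync_inodes_sb() call into a single-flight workqueue item, so concurrent ENOSPC hitters share one inode scan instead of each running their own:

	void
	xfs_flush_inodes(struct xfs_mount *mp)
	{
		/*
		 * flush_work() returning true means a flush was already queued
		 * or running and we waited for it; that scan covered our dirty
		 * data too, so don't bother starting another one.
		 */
		if (flush_work(&mp->m_flush_inodes_work))
			return;

		queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work);
		flush_work(&mp->m_flush_inodes_work);
	}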
*/ void xfs_quiesce_attr( @@ -830,28 +885,17 @@ xfs_quiesce_attr( { int error = 0; - /* wait for all modifications to complete */ - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); + cancel_delayed_work_sync(&mp->m_log->l_work); /* force the log to unpin objects from the now complete transactions */ xfs_log_force(mp, XFS_LOG_SYNC); - /* reclaim inodes to do any IO before the freeze completes */ - xfs_reclaim_inodes(mp, 0); - xfs_reclaim_inodes(mp, SYNC_WAIT); /* Push the superblock and write an unmount record */ error = xfs_log_sbcount(mp); if (error) xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " "Frozen image may not be consistent."); - /* - * Just warn here till VFS can correctly support - * read-only remount without racing. - */ - WARN_ON(atomic_read(&mp->m_active_trans) != 0); - xfs_log_quiesce(mp); } @@ -866,11 +910,21 @@ xfs_fs_freeze( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); + unsigned int flags; + int ret; + /* + * The filesystem is now frozen far enough that memory reclaim + * cannot safely operate on the filesystem. Hence we need to + * set a GFP_NOFS context here to avoid recursion deadlocks. + */ + flags = memalloc_nofs_save(); xfs_stop_block_reaping(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return xfs_sync_sb(mp, true); + ret = xfs_sync_sb(mp, true); + memalloc_nofs_restore(flags); + return ret; } STATIC int @@ -1244,7 +1298,10 @@ xfs_fc_parse_param( return 0; #ifdef CONFIG_FS_DAX case Opt_dax: - mp->m_flags |= XFS_MOUNT_DAX; + xfs_mount_set_dax_mode(mp, XFS_DAX_ALWAYS); + return 0; + case Opt_dax_enum: + xfs_mount_set_dax_mode(mp, result.uint_32); return 0; #endif default: @@ -1437,7 +1494,7 @@ xfs_fc_fill_super( if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) sb->s_flags |= SB_I_VERSION; - if (mp->m_flags & XFS_MOUNT_DAX) { + if (mp->m_flags & XFS_MOUNT_DAX_ALWAYS) { bool rtdev_is_dax = false, datadev_is_dax; xfs_warn(mp, @@ -1451,7 +1508,7 @@ xfs_fc_fill_super( if (!rtdev_is_dax && !datadev_is_dax) { xfs_alert(mp, "DAX unsupported by block device. Turning off DAX."); - mp->m_flags &= ~XFS_MOUNT_DAX; + xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); } if (xfs_sb_version_hasreflink(&mp->m_sb)) { xfs_alert(mp, @@ -1664,6 +1721,10 @@ xfs_fc_reconfigure( int flags = fc->sb_flags; int error; + /* version 5 superblocks always support version counters. 
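The xfs_fs_freeze() hunk above is an instance of a general scoped-allocation pattern worth calling out; memalloc_nofs_save()/restore() are the real kernel API, while the wrapper function here is purely illustrative:

	static int
	frozen_section_sketch(struct xfs_mount *mp)
	{
		unsigned int	flags;
		int		ret;

		/*
		 * Every allocation between save and restore implicitly loses
		 * __GFP_FS, so memory reclaim cannot recurse into the (now
		 * frozen) filesystem from inside this section.
		 */
		flags = memalloc_nofs_save();
		ret = xfs_sync_sb(mp, true);
		memalloc_nofs_restore(flags);
		return ret;
	}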
*/ + if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + fc->sb_flags |= SB_I_VERSION; + error = xfs_fc_validate_params(new_mp); if (error) return error; @@ -1737,7 +1798,7 @@ static int xfs_init_fs_context( INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); spin_lock_init(&mp->m_perag_lock); mutex_init(&mp->m_growlock); - atomic_set(&mp->m_active_trans, 0); + INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); @@ -1861,7 +1922,8 @@ xfs_init_zones(void) xfs_ili_zone = kmem_cache_create("xfs_ili", sizeof(struct xfs_inode_log_item), 0, - SLAB_MEM_SPREAD, NULL); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); if (!xfs_ili_zone) goto out_destroy_inode_zone; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index d762d42ed0ff..8e88a7ca387e 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -176,15 +176,12 @@ xfs_symlink( return -ENAMETOOLONG; ASSERT(pathlen > 0); - udqp = gdqp = NULL; prid = xfs_get_initial_prid(dp); /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, - xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -194,7 +191,7 @@ xfs_symlink( * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. */ - if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) + if (pathlen <= XFS_LITINO(mp)) fs_blocks = 0; else fs_blocks = xfs_symlink_blocks(mp, pathlen); @@ -246,8 +243,7 @@ xfs_symlink( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - if (resblks) - resblks -= XFS_IALLOC_SPACE_RES(mp); + resblks -= XFS_IALLOC_SPACE_RES(mp); /* * If the symlink will fit into the inode, write it inline. */ @@ -255,7 +251,7 @@ xfs_symlink( xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); ip->i_d.di_size = pathlen; - ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ip->i_df.if_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); } else { int offset; @@ -268,8 +264,7 @@ xfs_symlink( if (error) goto out_trans_cancel; - if (resblks) - resblks -= fs_blocks; + resblks -= fs_blocks; ip->i_d.di_size = pathlen; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -389,7 +384,7 @@ xfs_inactive_symlink_rmt( * either 1 or 2 extents and that we can * free them all in one bunmapi call. 
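Among the xfs_symlink() changes above, the simplified XFS_LITINO() deserves a gloss: the inode core size is now derived from the superblock version rather than tracked per inode (v5 filesystems always use v3 inodes), so the macro loses its second argument and the inline-versus-remote decision reads simply as:

	/* Does the target fit in the inode's literal area (the data fork)? */
	if (pathlen <= XFS_LITINO(mp))
		fs_blocks = 0;		/* store the path inline */
	else
		fs_blocks = xfs_symlink_blocks(mp, pathlen);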
*/ - ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + ASSERT(ip->i_df.if_nextents > 0 && ip->i_df.if_nextents <= 2); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 31b3bdbd2eba..021ef96d0542 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -13,7 +13,7 @@ STATIC int xfs_stats_clear_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { @@ -33,7 +33,7 @@ STATIC int xfs_panic_mask_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index e9f810fc6731..43585850f154 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -32,9 +32,11 @@ xfs_sysfs_init( struct xfs_kobj *parent_kobj, const char *name) { + struct kobject *parent; + + parent = parent_kobj ? &parent_kobj->kobject : NULL; init_completion(&kobj->complete); - return kobject_init_and_add(&kobj->kobject, ktype, - &parent_kobj->kobject, "%s", name); + return kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name); } static inline void diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index bc85b89f88ca..120398a37c2a 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -6,6 +6,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" @@ -27,6 +28,7 @@ #include "xfs_log_recover.h" #include "xfs_filestream.h" #include "xfs_fsmap.h" +#include "xfs_btree_staging.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e242988f57fb..abb1d859f226 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -35,6 +35,13 @@ struct xfs_icreate_log; struct xfs_owner_info; struct xfs_trans_res; struct xfs_inobt_rec_incore; +union xfs_btree_ptr; +struct xfs_dqtrx; + +#define XFS_ATTR_FILTER_FLAGS \ + { XFS_ATTR_ROOT, "ROOT" }, \ + { XFS_ATTR_SECURE, "SECURE" }, \ + { XFS_ATTR_INCOMPLETE, "INCOMPLETE" } DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -45,39 +52,39 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class, __field(u32, hashval) __field(u32, blkno) __field(u32, offset) - __field(void *, alist) + __field(void *, buffer) __field(int, bufsize) __field(int, count) __field(int, firstu) __field(int, dupcnt) - __field(int, flags) + __field(unsigned int, attr_filter) ), TP_fast_assign( __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; __entry->ino = ctx->dp->i_ino; - __entry->hashval = ctx->cursor->hashval; - __entry->blkno = ctx->cursor->blkno; - __entry->offset = ctx->cursor->offset; - __entry->alist = ctx->alist; + __entry->hashval = ctx->cursor.hashval; + __entry->blkno = ctx->cursor.blkno; + __entry->offset = ctx->cursor.offset; + __entry->buffer = ctx->buffer; __entry->bufsize = ctx->bufsize; __entry->count = ctx->count; __entry->firstu = ctx->firstu; - __entry->flags = ctx->flags; + __entry->attr_filter = ctx->attr_filter; ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " - "alist %p size %u count %u firstu %u flags %d %s", + "buffer %p size %u count %u firstu %u filter %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->hashval, __entry->blkno, __entry->offset, __entry->dupcnt, - __entry->alist, + __entry->buffer, __entry->bufsize, __entry->count, __entry->firstu, - 
__entry->flags, - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) + __print_flags(__entry->attr_filter, "|", + XFS_ATTR_FILTER_FLAGS) ) ) @@ -169,31 +176,31 @@ TRACE_EVENT(xfs_attr_list_node_descend, __field(u32, hashval) __field(u32, blkno) __field(u32, offset) - __field(void *, alist) + __field(void *, buffer) __field(int, bufsize) __field(int, count) __field(int, firstu) __field(int, dupcnt) - __field(int, flags) + __field(unsigned int, attr_filter) __field(u32, bt_hashval) __field(u32, bt_before) ), TP_fast_assign( __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; __entry->ino = ctx->dp->i_ino; - __entry->hashval = ctx->cursor->hashval; - __entry->blkno = ctx->cursor->blkno; - __entry->offset = ctx->cursor->offset; - __entry->alist = ctx->alist; + __entry->hashval = ctx->cursor.hashval; + __entry->blkno = ctx->cursor.blkno; + __entry->offset = ctx->cursor.offset; + __entry->buffer = ctx->buffer; __entry->bufsize = ctx->bufsize; __entry->count = ctx->count; __entry->firstu = ctx->firstu; - __entry->flags = ctx->flags; + __entry->attr_filter = ctx->attr_filter; __entry->bt_hashval = be32_to_cpu(btree->hashval); __entry->bt_before = be32_to_cpu(btree->before); ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " - "alist %p size %u count %u firstu %u flags %d %s " + "buffer %p size %u count %u firstu %u filter %s " "node hashval %u, node before %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -201,12 +208,12 @@ TRACE_EVENT(xfs_attr_list_node_descend, __entry->blkno, __entry->offset, __entry->dupcnt, - __entry->alist, + __entry->buffer, __entry->bufsize, __entry->count, __entry->firstu, - __entry->flags, - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __print_flags(__entry->attr_filter, "|", + XFS_ATTR_FILTER_FLAGS), __entry->bt_hashval, __entry->bt_before) ); @@ -858,44 +865,65 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, TP_STRUCT__entry( __field(dev_t, dev) __field(u32, id) + __field(xfs_dqtype_t, type) __field(unsigned, flags) __field(unsigned, nrefs) __field(unsigned long long, res_bcount) + __field(unsigned long long, res_rtbcount) + __field(unsigned long long, res_icount) + __field(unsigned long long, bcount) + __field(unsigned long long, rtbcount) __field(unsigned long long, icount) + __field(unsigned long long, blk_hardlimit) __field(unsigned long long, blk_softlimit) + __field(unsigned long long, rtb_hardlimit) + __field(unsigned long long, rtb_softlimit) __field(unsigned long long, ino_hardlimit) __field(unsigned long long, ino_softlimit) - ), \ + ), TP_fast_assign( __entry->dev = dqp->q_mount->m_super->s_dev; - __entry->id = be32_to_cpu(dqp->q_core.d_id); - __entry->flags = dqp->dq_flags; + __entry->id = dqp->q_id; + __entry->type = dqp->q_type; + __entry->flags = dqp->q_flags; __entry->nrefs = dqp->q_nrefs; - __entry->res_bcount = dqp->q_res_bcount; - __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); - __entry->icount = be64_to_cpu(dqp->q_core.d_icount); - __entry->blk_hardlimit = - be64_to_cpu(dqp->q_core.d_blk_hardlimit); - __entry->blk_softlimit = - be64_to_cpu(dqp->q_core.d_blk_softlimit); - __entry->ino_hardlimit = - be64_to_cpu(dqp->q_core.d_ino_hardlimit); - __entry->ino_softlimit = - be64_to_cpu(dqp->q_core.d_ino_softlimit); - ), - TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " + + __entry->res_bcount = dqp->q_blk.reserved; + __entry->res_rtbcount = dqp->q_rtb.reserved; + __entry->res_icount = dqp->q_ino.reserved; + + __entry->bcount = dqp->q_blk.count; + __entry->rtbcount = dqp->q_rtb.count; + __entry->icount = 
dqp->q_ino.count; + + __entry->blk_hardlimit = dqp->q_blk.hardlimit; + __entry->blk_softlimit = dqp->q_blk.softlimit; + __entry->rtb_hardlimit = dqp->q_rtb.hardlimit; + __entry->rtb_softlimit = dqp->q_rtb.softlimit; + __entry->ino_hardlimit = dqp->q_ino.hardlimit; + __entry->ino_softlimit = dqp->q_ino.softlimit; + ), + TP_printk("dev %d:%d id 0x%x type %s flags %s nrefs %u " + "res_bc 0x%llx res_rtbc 0x%llx res_ic 0x%llx " "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " + "rtbcnt 0x%llx rtbhardlimit 0x%llx rtbsoftlimit 0x%llx " "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->id, - __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), + __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), __entry->nrefs, __entry->res_bcount, + __entry->res_rtbcount, + __entry->res_icount, __entry->bcount, __entry->blk_hardlimit, __entry->blk_softlimit, + __entry->rtbcount, + __entry->rtb_hardlimit, + __entry->rtb_softlimit, __entry->icount, __entry->ino_hardlimit, __entry->ino_softlimit) @@ -926,6 +954,125 @@ DEFINE_DQUOT_EVENT(xfs_dqrele); DEFINE_DQUOT_EVENT(xfs_dqflush); DEFINE_DQUOT_EVENT(xfs_dqflush_force); DEFINE_DQUOT_EVENT(xfs_dqflush_done); +DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before); +DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after); + +#define XFS_QMOPT_FLAGS \ + { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ + { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ + { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ + { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ + { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ + { XFS_QMOPT_INHERIT, "INHERIT" }, \ + { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \ + { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ + { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ + { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ + { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ + { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ + { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ + { XFS_QMOPT_RES_INOS, "RES_INOS" } + +TRACE_EVENT(xfs_trans_mod_dquot, + TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp, + unsigned int field, int64_t delta), + TP_ARGS(tp, dqp, field, delta), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, type) + __field(unsigned int, flags) + __field(unsigned int, dqid) + __field(unsigned int, field) + __field(int64_t, delta) + ), + TP_fast_assign( + __entry->dev = tp->t_mountp->m_super->s_dev; + __entry->type = dqp->q_type; + __entry->flags = dqp->q_flags; + __entry->dqid = dqp->q_id; + __entry->field = field; + __entry->delta = delta; + ), + TP_printk("dev %d:%d dquot id 0x%x type %s flags %s field %s delta %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dqid, + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), + __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), + __print_flags(__entry->field, "|", XFS_QMOPT_FLAGS), + __entry->delta) +); + +DECLARE_EVENT_CLASS(xfs_dqtrx_class, + TP_PROTO(struct xfs_dqtrx *qtrx), + TP_ARGS(qtrx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, type) + __field(unsigned int, flags) + __field(u32, dqid) + + __field(uint64_t, blk_res) + __field(int64_t, bcount_delta) + __field(int64_t, delbcnt_delta) + + __field(uint64_t, rtblk_res) + __field(uint64_t, rtblk_res_used) + __field(int64_t, rtbcount_delta) + __field(int64_t, delrtb_delta) + + __field(uint64_t, ino_res) + __field(uint64_t, ino_res_used) + __field(int64_t, icount_delta) + ), + TP_fast_assign( + __entry->dev = qtrx->qt_dquot->q_mount->m_super->s_dev; + __entry->type = qtrx->qt_dquot->q_type; + 
__entry->flags = qtrx->qt_dquot->q_flags; + __entry->dqid = qtrx->qt_dquot->q_id; + + __entry->blk_res = qtrx->qt_blk_res; + __entry->bcount_delta = qtrx->qt_bcount_delta; + __entry->delbcnt_delta = qtrx->qt_delbcnt_delta; + + __entry->rtblk_res = qtrx->qt_rtblk_res; + __entry->rtblk_res_used = qtrx->qt_rtblk_res_used; + __entry->rtbcount_delta = qtrx->qt_rtbcount_delta; + __entry->delrtb_delta = qtrx->qt_delrtb_delta; + + __entry->ino_res = qtrx->qt_ino_res; + __entry->ino_res_used = qtrx->qt_ino_res_used; + __entry->icount_delta = qtrx->qt_icount_delta; + ), + TP_printk("dev %d:%d dquot id 0x%x type %s flags %s" + "blk_res %llu bcount_delta %lld delbcnt_delta %lld " + "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld " + "ino_res %llu ino_res_used %llu icount_delta %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dqid, + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), + __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), + + __entry->blk_res, + __entry->bcount_delta, + __entry->delbcnt_delta, + + __entry->rtblk_res, + __entry->rtblk_res_used, + __entry->rtbcount_delta, + __entry->delrtb_delta, + + __entry->ino_res, + __entry->ino_res_used, + __entry->icount_delta) +) + +#define DEFINE_DQTRX_EVENT(name) \ +DEFINE_EVENT(xfs_dqtrx_class, name, \ + TP_PROTO(struct xfs_dqtrx *qtrx), \ + TP_ARGS(qtrx)) +DEFINE_DQTRX_EVENT(xfs_trans_apply_dquot_deltas); +DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_before); +DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_after); DECLARE_EVENT_CLASS(xfs_loggrant_class, TP_PROTO(struct xlog *log, struct xlog_ticket *tic), @@ -995,8 +1142,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, DEFINE_EVENT(xfs_loggrant_class, name, \ TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \ TP_ARGS(log, tic)) -DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); -DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); @@ -1005,12 +1150,13 @@ DEFINE_LOGGRANT_EVENT(xfs_log_reserve); DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait); DECLARE_EVENT_CLASS(xfs_log_item_class, TP_PROTO(struct xfs_log_item *lip), @@ -1701,7 +1847,8 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __field(int, namelen) __field(int, valuelen) __field(xfs_dahash_t, hashval) - __field(int, flags) + __field(unsigned int, attr_filter) + __field(unsigned int, attr_flags) __field(int, op_flags) ), TP_fast_assign( @@ -1712,11 +1859,12 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen = args->namelen; __entry->valuelen = args->valuelen; __entry->hashval = args->hashval; - __entry->flags = args->flags; + __entry->attr_filter = args->attr_filter; + __entry->attr_flags = args->attr_flags; __entry->op_flags = args->op_flags; ), TP_printk("dev %d:%d ino 0x%llx name 
%.*s namelen %d valuelen %d " - "hashval 0x%x flags %s op_flags %s", + "hashval 0x%x filter %s flags %s op_flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->namelen, @@ -1724,7 +1872,11 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen, __entry->valuelen, __entry->hashval, - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __print_flags(__entry->attr_filter, "|", + XFS_ATTR_FILTER_FLAGS), + __print_flags(__entry->attr_flags, "|", + { XATTR_CREATE, "CREATE" }, + { XATTR_REPLACE, "REPLACE" }), __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) ) @@ -1886,8 +2038,8 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->which = which; __entry->ino = ip->i_ino; - __entry->format = ip->i_d.di_format; - __entry->nex = ip->i_d.di_nextents; + __entry->format = ip->i_df.if_format; + __entry->nex = ip->i_df.if_nextents; __entry->broot_size = ip->i_df.if_broot_bytes; __entry->fork_off = XFS_IFORK_BOFF(ip); ), @@ -3041,8 +3193,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); -DEFINE_IMAP_EVENT(xfs_reflink_remap_imap); -TRACE_EVENT(xfs_reflink_remap_blocks_loop, +TRACE_EVENT(xfs_reflink_remap_blocks, TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, xfs_filblks_t len, struct xfs_inode *dest, xfs_fileoff_t doffset), @@ -3073,59 +3224,14 @@ TRACE_EVENT(xfs_reflink_remap_blocks_loop, __entry->dest_ino, __entry->dest_lblk) ); -TRACE_EVENT(xfs_reflink_punch_range, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, - xfs_extlen_t len), - TP_ARGS(ip, lblk, len), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fileoff_t, lblk) - __field(xfs_extlen_t, len) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->lblk = lblk; - __entry->len = len; - ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->lblk, - __entry->len) -); -TRACE_EVENT(xfs_reflink_remap, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, - xfs_extlen_t len, xfs_fsblock_t new_pblk), - TP_ARGS(ip, lblk, len, new_pblk), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fileoff_t, lblk) - __field(xfs_extlen_t, len) - __field(xfs_fsblock_t, new_pblk) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->lblk = lblk; - __entry->len = len; - __entry->new_pblk = new_pblk; - ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->lblk, - __entry->len, - __entry->new_pblk) -); DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); +DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src); +DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest); /* dedupe tracepoints */ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); @@ -3571,7 +3677,6 @@ DEFINE_KMEM_EVENT(kmem_alloc); DEFINE_KMEM_EVENT(kmem_alloc_io); DEFINE_KMEM_EVENT(kmem_alloc_large); DEFINE_KMEM_EVENT(kmem_realloc); -DEFINE_KMEM_EVENT(kmem_zone_alloc); 
TRACE_EVENT(xfs_check_new_dalign, TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), @@ -3594,6 +3699,151 @@ TRACE_EVENT(xfs_check_new_dalign, __entry->calc_rootino) ) +TRACE_EVENT(xfs_btree_commit_afakeroot, + TP_PROTO(struct xfs_btree_cur *cur), + TP_ARGS(cur), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(unsigned int, levels) + __field(unsigned int, blocks) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->agno = cur->bc_ag.agno; + __entry->agbno = cur->bc_ag.afake->af_root; + __entry->levels = cur->bc_ag.afake->af_levels; + __entry->blocks = cur->bc_ag.afake->af_blocks; + ), + TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->agno, + __entry->levels, + __entry->blocks, + __entry->agbno) +) + +TRACE_EVENT(xfs_btree_commit_ifakeroot, + TP_PROTO(struct xfs_btree_cur *cur), + TP_ARGS(cur), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(unsigned int, levels) + __field(unsigned int, blocks) + __field(int, whichfork) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->agno = XFS_INO_TO_AGNO(cur->bc_mp, + cur->bc_ino.ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(cur->bc_mp, + cur->bc_ino.ip->i_ino); + __entry->levels = cur->bc_ino.ifake->if_levels; + __entry->blocks = cur->bc_ino.ifake->if_blocks; + __entry->whichfork = cur->bc_ino.whichfork; + ), + TP_printk("dev %d:%d btree %s ag %u agino %u whichfork %s levels %u blocks %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->agno, + __entry->agino, + __entry->whichfork == XFS_ATTR_FORK ? 
"attr" : "data", + __entry->levels, + __entry->blocks) +) + +TRACE_EVENT(xfs_btree_bload_level_geometry, + TP_PROTO(struct xfs_btree_cur *cur, unsigned int level, + uint64_t nr_this_level, unsigned int nr_per_block, + unsigned int desired_npb, uint64_t blocks, + uint64_t blocks_with_extra), + TP_ARGS(cur, level, nr_this_level, nr_per_block, desired_npb, blocks, + blocks_with_extra), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(unsigned int, level) + __field(unsigned int, nlevels) + __field(uint64_t, nr_this_level) + __field(unsigned int, nr_per_block) + __field(unsigned int, desired_npb) + __field(unsigned long long, blocks) + __field(unsigned long long, blocks_with_extra) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->nlevels = cur->bc_nlevels; + __entry->nr_this_level = nr_this_level; + __entry->nr_per_block = nr_per_block; + __entry->desired_npb = desired_npb; + __entry->blocks = blocks; + __entry->blocks_with_extra = blocks_with_extra; + ), + TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->level, + __entry->nlevels, + __entry->nr_this_level, + __entry->nr_per_block, + __entry->desired_npb, + __entry->blocks, + __entry->blocks_with_extra) +) + +TRACE_EVENT(xfs_btree_bload_block, + TP_PROTO(struct xfs_btree_cur *cur, unsigned int level, + uint64_t block_idx, uint64_t nr_blocks, + union xfs_btree_ptr *ptr, unsigned int nr_records), + TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(unsigned int, level) + __field(unsigned long long, block_idx) + __field(unsigned long long, nr_blocks) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(unsigned int, nr_records) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->block_idx = block_idx; + __entry->nr_blocks = nr_blocks; + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + xfs_fsblock_t fsb = be64_to_cpu(ptr->l); + + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb); + __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb); + } else { + __entry->agno = cur->bc_ag.agno; + __entry->agbno = be32_to_cpu(ptr->s); + } + __entry->nr_records = nr_records; + ), + TP_printk("dev %d:%d btree %s level %u block %llu/%llu fsb (%u/%u) recs %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->level, + __entry->block_idx, + __entry->nr_blocks, + __entry->agno, + __entry->agbno, + __entry->nr_records) +) + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 3b208f9a865c..ed72867b1a19 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -9,6 +9,7 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_log_priv.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_extent_busy.h" @@ -67,7 +68,6 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); trace_xfs_trans_free(tp, _RET_IP_); - atomic_dec(&tp->t_mountp->m_active_trans); if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); @@ -90,7 +90,7 @@ 
xfs_trans_dup( trace_xfs_trans_dup(tp, _RET_IP_); - ntp = kmem_zone_zalloc(xfs_trans_zone, 0); + ntp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL); /* * Initialize the new transaction structure. @@ -107,7 +107,8 @@ xfs_trans_dup( ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE) | - (tp->t_flags & XFS_TRANS_NO_WRITECOUNT); + (tp->t_flags & XFS_TRANS_NO_WRITECOUNT) | + (tp->t_flags & XFS_TRANS_RES_FDBLKS); /* We gave our writer reference to the new transaction */ tp->t_flags |= XFS_TRANS_NO_WRITECOUNT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); @@ -124,8 +125,6 @@ xfs_trans_dup( xfs_defer_move(ntp, tp); xfs_trans_dup_dqinfo(tp, ntp); - - atomic_inc(&tp->t_mountp->m_active_trans); return ntp; } @@ -150,8 +149,9 @@ xfs_trans_reserve( uint blocks, uint rtextents) { - int error = 0; - bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + struct xfs_mount *mp = tp->t_mountp; + int error = 0; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -162,7 +162,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (blocks > 0) { - error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); + error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return -ENOSPC; @@ -191,9 +191,9 @@ xfs_trans_reserve( if (tp->t_ticket != NULL) { ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); - error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); + error = xfs_log_regrant(mp, tp->t_ticket); } else { - error = xfs_log_reserve(tp->t_mountp, + error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount, &tp->t_ticket, XFS_TRANSACTION, @@ -213,7 +213,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (rtextents > 0) { - error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents)); + error = xfs_mod_frextents(mp, -((int64_t)rtextents)); if (error) { error = -ENOSPC; goto undo_log; @@ -229,7 +229,7 @@ xfs_trans_reserve( */ undo_log: if (resp->tr_logres > 0) { - xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false); + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; tp->t_log_res = 0; tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; @@ -237,7 +237,7 @@ undo_log: undo_blocks: if (blocks > 0) { - xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd); + xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -263,7 +263,7 @@ xfs_trans_alloc( * GFP_NOFS allocation context so that we avoid lockdep false positives * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ - tp = kmem_zone_zalloc(xfs_trans_zone, 0); + tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL); if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); @@ -273,7 +273,8 @@ xfs_trans_alloc( */ WARN_ON(resp->tr_logres > 0 && mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); - atomic_inc(&mp->m_active_trans); + ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || + xfs_sb_version_haslazysbcount(&mp->m_sb)); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; @@ -297,15 +298,19 @@ xfs_trans_alloc( /* * Create an empty transaction with no reservation. This is a defensive - * mechanism for routines that query metadata without actually modifying - * them -- if the metadata being queried is somehow cross-linked (think a - * btree block pointer that points higher in the tree), we risk deadlock. 
- * However, blocks grabbed as part of a transaction can be re-grabbed. - * The verifiers will notice the corrupt block and the operation will fail - * back to userspace without deadlocking. + * mechanism for routines that query metadata without actually modifying them -- + * if the metadata being queried is somehow cross-linked (think a btree block + * pointer that points higher in the tree), we risk deadlock. However, blocks + * grabbed as part of a transaction can be re-grabbed. The verifiers will + * notice the corrupt block and the operation will fail back to userspace + * without deadlocking. * - * Note the zero-length reservation; this transaction MUST be cancelled - * without any dirty data. + * Note the zero-length reservation; this transaction MUST be cancelled without + * any dirty data. + * + * Callers should obtain freeze protection to avoid a conflict with fs freezing + * where we can be grabbing buffers at the same time that freeze is trying to + * drain the buffer LRU list. */ int xfs_trans_alloc_empty( @@ -363,6 +368,20 @@ xfs_trans_mod_sb( tp->t_blk_res_used += (uint)-delta; if (tp->t_blk_res_used > tp->t_blk_res) xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } else if (delta > 0 && (tp->t_flags & XFS_TRANS_RES_FDBLKS)) { + int64_t blkres_delta; + + /* + * Return freed blocks directly to the reservation + * instead of the global pool, being careful not to + * overflow the trans counter. This is used to preserve + * reservation across chains of transaction rolls that + * repeatedly free and allocate blocks. + */ + blkres_delta = min_t(int64_t, delta, + UINT_MAX - tp->t_blk_res); + tp->t_blk_res += blkres_delta; + delta -= blkres_delta; } tp->t_fdblocks_delta += delta; if (xfs_sb_version_haslazysbcount(&mp->m_sb)) @@ -450,7 +469,7 @@ xfs_trans_apply_sb_deltas( int whole = 0; bp = xfs_trans_getsb(tp, tp->t_mountp); - sbp = XFS_BUF_TO_SBP(bp); + sbp = bp->b_addr; /* * Check that superblock mods match the mods made to AGF counters. @@ -527,57 +546,9 @@ xfs_trans_apply_sb_deltas( sizeof(sbp->sb_frextents) - 1); } -STATIC int -xfs_sb_mod8( - uint8_t *field, - int8_t delta) -{ - int8_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - -STATIC int -xfs_sb_mod32( - uint32_t *field, - int32_t delta) -{ - int32_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - -STATIC int -xfs_sb_mod64( - uint64_t *field, - int64_t delta) -{ - int64_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - /* - * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations - * and apply superblock counter changes to the in-core superblock. The + * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations and + * apply superblock counter changes to the in-core superblock. The * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT * applied to the in-core superblock. The idea is that that has already been * done. @@ -586,7 +557,12 @@ xfs_sb_mod64( * used block counts are not updated in the on disk superblock. In this case, * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we * still need to update the incore superblock with the changes. + * + * Deltas for the inode count are +/-64, hence we use a large batch size of 128 + * so we don't need to take the counter lock on every update. 
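+ * (percpu_counter_add_batch() only takes the counter spinlock once the
+ * accumulated per-cpu delta reaches the batch size.)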
*/ +#define XFS_ICOUNT_BATCH 128 + void xfs_trans_unreserve_and_mod_sb( struct xfs_trans *tp) @@ -622,20 +598,21 @@ xfs_trans_unreserve_and_mod_sb( /* apply the per-cpu counters */ if (blkdelta) { error = xfs_mod_fdblocks(mp, blkdelta, rsvd); - if (error) - goto out; + ASSERT(!error); } if (idelta) { - error = xfs_mod_icount(mp, idelta); - if (error) - goto out_undo_fdblocks; + percpu_counter_add_batch(&mp->m_icount, idelta, + XFS_ICOUNT_BATCH); + if (idelta < 0) + ASSERT(__percpu_counter_compare(&mp->m_icount, 0, + XFS_ICOUNT_BATCH) >= 0); } if (ifreedelta) { - error = xfs_mod_ifree(mp, ifreedelta); - if (error) - goto out_undo_icount; + percpu_counter_add(&mp->m_ifree, ifreedelta); + if (ifreedelta < 0) + ASSERT(percpu_counter_compare(&mp->m_ifree, 0) >= 0); } if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) @@ -643,95 +620,23 @@ xfs_trans_unreserve_and_mod_sb( /* apply remaining deltas */ spin_lock(&mp->m_sb_lock); - if (rtxdelta) { - error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta); - if (error) - goto out_undo_ifree; - } - - if (tp->t_dblocks_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta); - if (error) - goto out_undo_frextents; - } - if (tp->t_agcount_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta); - if (error) - goto out_undo_dblocks; - } - if (tp->t_imaxpct_delta != 0) { - error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta); - if (error) - goto out_undo_agcount; - } - if (tp->t_rextsize_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_rextsize, - tp->t_rextsize_delta); - if (error) - goto out_undo_imaxpct; - } - if (tp->t_rbmblocks_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, - tp->t_rbmblocks_delta); - if (error) - goto out_undo_rextsize; - } - if (tp->t_rblocks_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta); - if (error) - goto out_undo_rbmblocks; - } - if (tp->t_rextents_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_rextents, - tp->t_rextents_delta); - if (error) - goto out_undo_rblocks; - } - if (tp->t_rextslog_delta != 0) { - error = xfs_sb_mod8(&mp->m_sb.sb_rextslog, - tp->t_rextslog_delta); - if (error) - goto out_undo_rextents; - } + mp->m_sb.sb_frextents += rtxdelta; + mp->m_sb.sb_dblocks += tp->t_dblocks_delta; + mp->m_sb.sb_agcount += tp->t_agcount_delta; + mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; + mp->m_sb.sb_rextsize += tp->t_rextsize_delta; + mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta; + mp->m_sb.sb_rblocks += tp->t_rblocks_delta; + mp->m_sb.sb_rextents += tp->t_rextents_delta; + mp->m_sb.sb_rextslog += tp->t_rextslog_delta; spin_unlock(&mp->m_sb_lock); - return; -out_undo_rextents: - if (tp->t_rextents_delta) - xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta); -out_undo_rblocks: - if (tp->t_rblocks_delta) - xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta); -out_undo_rbmblocks: - if (tp->t_rbmblocks_delta) - xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta); -out_undo_rextsize: - if (tp->t_rextsize_delta) - xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta); -out_undo_imaxpct: - if (tp->t_rextsize_delta) - xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta); -out_undo_agcount: - if (tp->t_agcount_delta) - xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta); -out_undo_dblocks: - if (tp->t_dblocks_delta) - xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta); -out_undo_frextents: - if (rtxdelta) - xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta); -out_undo_ifree: - 
spin_unlock(&mp->m_sb_lock); - if (ifreedelta) - xfs_mod_ifree(mp, -ifreedelta); -out_undo_icount: - if (idelta) - xfs_mod_icount(mp, -idelta); -out_undo_fdblocks: - if (blkdelta) - xfs_mod_fdblocks(mp, -blkdelta, rsvd); -out: - ASSERT(error == 0); + /* + * Debug checks outside of the spinlock so they don't lock up the + * machine if they fail. + */ + ASSERT(mp->m_sb.sb_imax_pct >= 0); + ASSERT(mp->m_sb.sb_rextslog >= 0); return; } @@ -999,9 +904,10 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant); - if (commit_lsn == -1 && !error) - error = -EIO; + if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log)) + xfs_log_ticket_regrant(mp->m_log, tp->t_ticket); + else + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; } current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -1060,7 +966,7 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - xfs_log_done(mp, tp->t_ticket, NULL, false); + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 752c7fef9de7..b752501818d2 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -37,10 +37,6 @@ struct xfs_log_item { unsigned long li_flags; /* misc flags */ struct xfs_buf *li_buf; /* real buffer pointer */ struct list_head li_bio_list; /* buffer item list */ - void (*li_cb)(struct xfs_buf *, - struct xfs_log_item *); - /* buffer item iodone */ - /* callback func */ const struct xfs_item_ops *li_ops; /* function list */ /* delayed logging */ @@ -59,12 +55,14 @@ struct xfs_log_item { #define XFS_LI_ABORTED 1 #define XFS_LI_FAILED 2 #define XFS_LI_DIRTY 3 /* log item dirty in transaction */ +#define XFS_LI_RECOVERED 4 /* log intent item has been recovered */ #define XFS_LI_FLAGS \ { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \ { (1 << XFS_LI_ABORTED), "ABORTED" }, \ { (1 << XFS_LI_FAILED), "FAILED" }, \ - { (1 << XFS_LI_DIRTY), "DIRTY" } + { (1 << XFS_LI_DIRTY), "DIRTY" }, \ + { (1 << XFS_LI_RECOVERED), "RECOVERED" } struct xfs_item_ops { unsigned flags; @@ -76,7 +74,8 @@ struct xfs_item_ops { void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn); void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); - void (*iop_error)(struct xfs_log_item *, xfs_buf_t *); + int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp); + bool (*iop_match)(struct xfs_log_item *item, uint64_t id); }; /* diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 00cc5b8734be..dbb69b4bf3ed 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -32,6 +32,7 @@ STATIC void xfs_ail_check( struct xfs_ail *ailp, struct xfs_log_item *lip) + __must_hold(&ailp->ail_lock) { struct xfs_log_item *prev_lip; struct xfs_log_item *next_lip; @@ -108,17 +109,25 @@ xfs_ail_next( * We need the AIL lock in order to get a coherent read of the lsn of the last * item in the AIL. 
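 * Callers that already hold the AIL lock can use the __xfs_ail_min_lsn()
 * helper added below.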
*/ +static xfs_lsn_t +__xfs_ail_min_lsn( + struct xfs_ail *ailp) +{ + struct xfs_log_item *lip = xfs_ail_min(ailp); + + if (lip) + return lip->li_lsn; + return 0; +} + xfs_lsn_t xfs_ail_min_lsn( struct xfs_ail *ailp) { - xfs_lsn_t lsn = 0; - struct xfs_log_item *lip; + xfs_lsn_t lsn; spin_lock(&ailp->ail_lock); - lip = xfs_ail_min(ailp); - if (lip) - lsn = lip->li_lsn; + lsn = __xfs_ail_min_lsn(ailp); spin_unlock(&ailp->ail_lock); return lsn; @@ -336,6 +345,49 @@ xfs_ail_delete( xfs_trans_ail_cursor_clear(ailp, lip); } +/* + * Requeue a failed buffer for writeback. + * + * We clear the log item failed state here as well, but we have to be careful + * about reference counts because the only active reference counts on the buffer + * may be the failed log items. Hence if we clear the log item failed state + * before queuing the buffer for IO we can release all active references to + * the buffer and free it, leading to use after free problems in + * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which + * order we process them in - the buffer is locked, and we own the buffer list + * so nothing on them is going to change while we are performing this action. + * + * Hence we can safely queue the buffer for IO before we clear the failed log + * item state, therefore always having an active reference to the buffer and + * avoiding the transient zero-reference state that leads to use-after-free. + */ +static inline int +xfsaild_resubmit_item( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + struct xfs_buf *bp = lip->li_buf; + + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_delwri_queue(bp, buffer_list)) { + xfs_buf_unlock(bp); + return XFS_ITEM_FLUSHING; + } + + /* protected by ail_lock */ + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { + if (bp->b_flags & _XBF_INODES) + clear_bit(XFS_LI_FAILED, &lip->li_flags); + else + xfs_clear_li_failed(lip); + } + + xfs_buf_unlock(bp); + return XFS_ITEM_SUCCESS; +} + static inline uint xfsaild_push_item( struct xfs_ail *ailp, @@ -356,6 +408,8 @@ xfsaild_push_item( */ if (!lip->li_ops->iop_push) return XFS_ITEM_PINNED; + if (test_bit(XFS_LI_FAILED, &lip->li_flags)) + return xfsaild_resubmit_item(lip, &ailp->ail_buf_list); return lip->li_ops->iop_push(lip, &ailp->ail_buf_list); } @@ -394,16 +448,10 @@ xfsaild_push( target = ailp->ail_target; ailp->ail_target_prev = target; + /* we're done if the AIL is empty or our push has reached the end */ lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn); - if (!lip) { - /* - * If the AIL is empty or our push has reached the end we are - * done now. - */ - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + if (!lip) goto out_done; - } XFS_STATS_INC(mp, xs_push_ail); @@ -432,7 +480,7 @@ xfsaild_push( * inode buffer is locked because we already pushed the * updates to it as part of inode clustering. * - * We do not want to to stop flushing just because lots + * We do not want to stop flushing just because lots * of items are already being flushed, but we need to * re-try the flushing relatively soon if most of the * AIL is being flushed. @@ -467,7 +515,7 @@ xfsaild_push( /* * Are there too many items we can't do anything with? * - * If we we are skipping too many items because we can't flush + * If we are skipping too many items because we can't flush * them or they are already being flushed, we back off and * given them time to complete whatever operation is being * done. i.e. 
remove pressure from the AIL while we can't make @@ -485,6 +533,8 @@ xfsaild_push( break; lsn = lip->li_lsn; } + +out_done: xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->ail_lock); @@ -492,7 +542,6 @@ xfsaild_push( ailp->ail_log_flush++; if (!count || XFS_LSN_CMP(lsn, target) >= 0) { -out_done: /* * We reached the target or the AIL is empty, so wait a bit * longer for I/O to complete and remove pushed items from the @@ -529,8 +578,9 @@ xfsaild( { struct xfs_ail *ailp = data; long tout = 0; /* milliseconds */ + unsigned int noreclaim_flag; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); set_freezable(); while (1) { @@ -583,7 +633,8 @@ xfsaild( */ smp_rmb(); if (!xfs_ail_min(ailp) && - ailp->ail_target == ailp->ail_target_prev) { + ailp->ail_target == ailp->ail_target_prev && + list_empty(&ailp->ail_buf_list)) { spin_unlock(&ailp->ail_lock); freezable_schedule(); tout = 0; @@ -601,6 +652,7 @@ xfsaild( tout = xfsaild_push(ailp); } + memalloc_noreclaim_restore(noreclaim_flag); return 0; } @@ -678,6 +730,28 @@ xfs_ail_push_all_sync( finish_wait(&ailp->ail_empty, &wait); } +void +xfs_ail_update_finish( + struct xfs_ail *ailp, + xfs_lsn_t old_lsn) __releases(ailp->ail_lock) +{ + struct xfs_mount *mp = ailp->ail_mount; + + /* if the tail lsn hasn't changed, don't do updates or wakeups. */ + if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) { + spin_unlock(&ailp->ail_lock); + return; + } + + if (!XFS_FORCED_SHUTDOWN(mp)) + xlog_assign_tail_lsn_locked(mp); + + if (list_empty(&ailp->ail_head)) + wake_up_all(&ailp->ail_empty); + spin_unlock(&ailp->ail_lock); + xfs_log_space_wake(mp); +} + /* * xfs_trans_ail_update - bulk AIL insertion operation. * @@ -709,7 +783,7 @@ xfs_trans_ail_update_bulk( xfs_lsn_t lsn) __releases(ailp->ail_lock) { struct xfs_log_item *mlip; - int mlip_changed = 0; + xfs_lsn_t tail_lsn = 0; int i; LIST_HEAD(tmp); @@ -724,9 +798,10 @@ xfs_trans_ail_update_bulk( continue; trace_xfs_ail_move(lip, lip->li_lsn, lsn); + if (mlip == lip && !tail_lsn) + tail_lsn = lip->li_lsn; + xfs_ail_delete(ailp, lip); - if (mlip == lip) - mlip_changed = 1; } else { trace_xfs_ail_insert(lip, 0, lsn); } @@ -737,66 +812,58 @@ xfs_trans_ail_update_bulk( if (!list_empty(&tmp)) xfs_ail_splice(ailp, cur, &tmp, lsn); - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) - xlog_assign_tail_lsn_locked(ailp->ail_mount); - spin_unlock(&ailp->ail_lock); + xfs_ail_update_finish(ailp, tail_lsn); +} - xfs_log_space_wake(ailp->ail_mount); - } else { - spin_unlock(&ailp->ail_lock); - } +/* Insert a log item into the AIL. */ +void +xfs_trans_ail_insert( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + spin_lock(&ailp->ail_lock); + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } -bool +/* + * Delete one log item from the AIL. + * + * If this item was at the tail of the AIL, return the LSN of the log item so + * that we can use it to check if the LSN of the tail of the log has moved + * when finishing up the AIL delete process in xfs_ail_update_finish(). 
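+ * A zero return means the item was not at the tail, so the caller can skip
+ * the log tail update in xfs_ail_update_finish().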
+ */ +xfs_lsn_t xfs_ail_delete_one( struct xfs_ail *ailp, struct xfs_log_item *lip) { struct xfs_log_item *mlip = xfs_ail_min(ailp); + xfs_lsn_t lsn = lip->li_lsn; trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); - xfs_clear_li_failed(lip); clear_bit(XFS_LI_IN_AIL, &lip->li_flags); lip->li_lsn = 0; - return mlip == lip; + if (mlip == lip) + return lsn; + return 0; } -/** - * Remove a log items from the AIL - * - * @xfs_trans_ail_delete_bulk takes an array of log items that all need to - * removed from the AIL. The caller is already holding the AIL lock, and done - * all the checks necessary to ensure the items passed in via @log_items are - * ready for deletion. This includes checking that the items are in the AIL. - * - * For each log item to be removed, unlink it from the AIL, clear the IN_AIL - * flag from the item and reset the item's lsn to 0. If we remove the first - * item in the AIL, update the log tail to match the new minimum LSN in the - * AIL. - * - * This function will not drop the AIL lock until all items are removed from - * the AIL to minimise the amount of lock traffic on the AIL. This does not - * greatly increase the AIL hold time, but does significantly reduce the amount - * of traffic on the lock, especially during IO completion. - * - * This function must be called with the AIL lock held. The lock is dropped - * before returning. - */ void xfs_trans_ail_delete( - struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->ail_lock) + int shutdown_type) { + struct xfs_ail *ailp = lip->li_ailp; struct xfs_mount *mp = ailp->ail_mount; - bool mlip_changed; + xfs_lsn_t tail_lsn; + spin_lock(&ailp->ail_lock); if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); - if (!XFS_FORCED_SHUTDOWN(mp)) { + if (shutdown_type && !XFS_FORCED_SHUTDOWN(mp)) { xfs_alert_tag(mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); @@ -805,17 +872,10 @@ xfs_trans_ail_delete( return; } - mlip_changed = xfs_ail_delete_one(ailp, lip); - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(mp)) - xlog_assign_tail_lsn_locked(mp); - if (list_empty(&ailp->ail_head)) - wake_up_all(&ailp->ail_empty); - } - - spin_unlock(&ailp->ail_lock); - if (mlip_changed) - xfs_log_space_wake(ailp->ail_mount); + /* xfs_ail_update_finish() drops the AIL lock */ + xfs_clear_li_failed(lip); + tail_lsn = xfs_ail_delete_one(ailp, lip); + xfs_ail_update_finish(ailp, tail_lsn); } int diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 08174ffa2118..11cd666cd99a 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -465,24 +465,16 @@ xfs_trans_dirty_buf( ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); - ASSERT(bp->b_iodone == NULL || - bp->b_iodone == xfs_buf_iodone_callbacks); /* * Mark the buffer as needing to be written out eventually, * and set its iodone function to remove the buffer's buf log * item from the AIL and free it when the buffer is flushed - * to disk. See xfs_buf_attach_iodone() for more details - * on li_cb and xfs_buf_iodone_callbacks(). - * If we end up aborting this transaction, we trap this buffer - * inside the b_bdstrat callback so that this won't get written to - * disk. + * to disk. 
*/ bp->b_flags |= XBF_DONE; ASSERT(atomic_read(&bip->bli_refcount) > 0); - bp->b_iodone = xfs_buf_iodone_callbacks; - bip->bli_item.li_cb = xfs_buf_iodone; /* * If we invalidated the buffer within this transaction, then @@ -626,6 +618,7 @@ xfs_trans_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_BUF; + bp->b_flags |= _XBF_INODES; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } @@ -650,7 +643,7 @@ xfs_trans_stale_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_STALE_INODE; - bip->bli_item.li_cb = xfs_buf_iodone; + bp->b_flags |= _XBF_INODES; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } @@ -675,6 +668,7 @@ xfs_trans_inode_alloc_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; + bp->b_flags |= _XBF_INODES; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } @@ -785,5 +779,6 @@ xfs_trans_dquot_buf( break; } + bp->b_flags |= _XBF_DQUOTS; xfs_trans_buf_set_type(tp, bp, type); } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index d1b9869bc5fa..c6ba7ef18e06 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -15,6 +15,7 @@ #include "xfs_trans_priv.h" #include "xfs_quota.h" #include "xfs_qm.h" +#include "xfs_trace.h" STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); @@ -155,14 +156,19 @@ xfs_trans_get_dqtrx( int i; struct xfs_dqtrx *qa; - if (XFS_QM_ISUDQ(dqp)) + switch (xfs_dquot_type(dqp)) { + case XFS_DQTYPE_USER: qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR]; - else if (XFS_QM_ISGDQ(dqp)) + break; + case XFS_DQTYPE_GROUP: qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP]; - else if (XFS_QM_ISPDQ(dqp)) + break; + case XFS_DQTYPE_PROJ: qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ]; - else + break; + default: return NULL; + } for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { if (qa[i].qt_dquot == NULL || @@ -203,6 +209,11 @@ xfs_trans_mod_dquot( if (qtrx->qt_dquot == NULL) qtrx->qt_dquot = dqp; + if (delta) { + trace_xfs_trans_mod_dquot_before(qtrx); + trace_xfs_trans_mod_dquot(tp, dqp, field, delta); + } + switch (field) { /* @@ -266,6 +277,10 @@ xfs_trans_mod_dquot( default: ASSERT(0); } + + if (delta) + trace_xfs_trans_mod_dquot_after(qtrx); + tp->t_flags |= XFS_TRANS_DQ_DIRTY; } @@ -293,6 +308,37 @@ xfs_trans_dqlockedjoin( } } +/* Apply dqtrx changes to the quota reservation counters. */ +static inline void +xfs_apply_quota_reservation_deltas( + struct xfs_dquot_res *res, + uint64_t reserved, + int64_t res_used, + int64_t count_delta) +{ + if (reserved != 0) { + /* + * Subtle math here: If reserved > res_used (the normal case), + * we're simply subtracting the unused transaction quota + * reservation from the dquot reservation. + * + * If, however, res_used > reserved, then we have allocated + * more quota blocks than were reserved for the transaction. + * We must add that excess to the dquot reservation since it + * tracks (usage + resv) and by definition we didn't reserve + * that excess. + */ + res->reserved -= abs(reserved - res_used); + } else if (count_delta != 0) { + /* + * These blks were never reserved, either inside a transaction + * or outside one (in a delayed allocation). Also, this isn't + * always a negative number since we sometimes deliberately + * skip quota reservations. 
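+ * Either way the delta lands directly in ->reserved, which by definition
+ * tracks usage plus reservations.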
+ */ + res->reserved += count_delta; + } +} /* * Called by xfs_trans_commit() and similar in spirit to @@ -309,7 +355,6 @@ xfs_trans_apply_dquot_deltas( int i, j; struct xfs_dquot *dqp; struct xfs_dqtrx *qtrx, *qa; - struct xfs_disk_dquot *d; int64_t totalbdelta; int64_t totalrtbdelta; @@ -328,6 +373,8 @@ xfs_trans_apply_dquot_deltas( xfs_trans_dqlockedjoin(tp, qa); for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + uint64_t blk_res_used; + qtrx = &qa[i]; /* * The array of dquots is filled @@ -341,7 +388,6 @@ xfs_trans_apply_dquot_deltas( /* * adjust the actual number of blocks used */ - d = &dqp->q_core; /* * The issue here is - sometimes we don't make a blkquota @@ -360,38 +406,46 @@ xfs_trans_apply_dquot_deltas( qtrx->qt_delbcnt_delta; totalrtbdelta = qtrx->qt_rtbcount_delta + qtrx->qt_delrtb_delta; + + if (totalbdelta != 0 || totalrtbdelta != 0 || + qtrx->qt_icount_delta != 0) { + trace_xfs_trans_apply_dquot_deltas_before(dqp); + trace_xfs_trans_apply_dquot_deltas(qtrx); + } + #ifdef DEBUG if (totalbdelta < 0) - ASSERT(be64_to_cpu(d->d_bcount) >= - -totalbdelta); + ASSERT(dqp->q_blk.count >= -totalbdelta); if (totalrtbdelta < 0) - ASSERT(be64_to_cpu(d->d_rtbcount) >= - -totalrtbdelta); + ASSERT(dqp->q_rtb.count >= -totalrtbdelta); if (qtrx->qt_icount_delta < 0) - ASSERT(be64_to_cpu(d->d_icount) >= - -qtrx->qt_icount_delta); + ASSERT(dqp->q_ino.count >= -qtrx->qt_icount_delta); #endif if (totalbdelta) - be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta); + dqp->q_blk.count += totalbdelta; if (qtrx->qt_icount_delta) - be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta); + dqp->q_ino.count += qtrx->qt_icount_delta; if (totalrtbdelta) - be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta); + dqp->q_rtb.count += totalrtbdelta; + + if (totalbdelta != 0 || totalrtbdelta != 0 || + qtrx->qt_icount_delta != 0) + trace_xfs_trans_apply_dquot_deltas_after(dqp); /* * Get any default limits in use. * Start/reset the timer(s) if needed. */ - if (d->d_id) { - xfs_qm_adjust_dqlimits(tp->t_mountp, dqp); - xfs_qm_adjust_dqtimers(tp->t_mountp, d); + if (dqp->q_id) { + xfs_qm_adjust_dqlimits(dqp); + xfs_qm_adjust_dqtimers(dqp); } - dqp->dq_flags |= XFS_DQ_DIRTY; + dqp->q_flags |= XFS_DQFLAG_DIRTY; /* * add this to the list of items to get logged */ @@ -401,78 +455,31 @@ xfs_trans_apply_dquot_deltas( * In case of delayed allocations, there's no * reservation that a transaction structure knows of. */ - if (qtrx->qt_blk_res != 0) { - uint64_t blk_res_used = 0; - - if (qtrx->qt_bcount_delta > 0) - blk_res_used = qtrx->qt_bcount_delta; - - if (qtrx->qt_blk_res != blk_res_used) { - if (qtrx->qt_blk_res > blk_res_used) - dqp->q_res_bcount -= (xfs_qcnt_t) - (qtrx->qt_blk_res - - blk_res_used); - else - dqp->q_res_bcount -= (xfs_qcnt_t) - (blk_res_used - - qtrx->qt_blk_res); - } - } else { - /* - * These blks were never reserved, either inside - * a transaction or outside one (in a delayed - * allocation). Also, this isn't always a - * negative number since we sometimes - * deliberately skip quota reservations. - */ - if (qtrx->qt_bcount_delta) { - dqp->q_res_bcount += - (xfs_qcnt_t)qtrx->qt_bcount_delta; - } - } + blk_res_used = max_t(int64_t, 0, qtrx->qt_bcount_delta); + xfs_apply_quota_reservation_deltas(&dqp->q_blk, + qtrx->qt_blk_res, blk_res_used, + qtrx->qt_bcount_delta); + /* * Adjust the RT reservation. 
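 * It goes through the same xfs_apply_quota_reservation_deltas() helper as
 * the block reservation above.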
*/ - if (qtrx->qt_rtblk_res != 0) { - if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) { - if (qtrx->qt_rtblk_res > - qtrx->qt_rtblk_res_used) - dqp->q_res_rtbcount -= (xfs_qcnt_t) - (qtrx->qt_rtblk_res - - qtrx->qt_rtblk_res_used); - else - dqp->q_res_rtbcount -= (xfs_qcnt_t) - (qtrx->qt_rtblk_res_used - - qtrx->qt_rtblk_res); - } - } else { - if (qtrx->qt_rtbcount_delta) - dqp->q_res_rtbcount += - (xfs_qcnt_t)qtrx->qt_rtbcount_delta; - } + xfs_apply_quota_reservation_deltas(&dqp->q_rtb, + qtrx->qt_rtblk_res, + qtrx->qt_rtblk_res_used, + qtrx->qt_rtbcount_delta); /* * Adjust the inode reservation. */ - if (qtrx->qt_ino_res != 0) { - ASSERT(qtrx->qt_ino_res >= - qtrx->qt_ino_res_used); - if (qtrx->qt_ino_res > qtrx->qt_ino_res_used) - dqp->q_res_icount -= (xfs_qcnt_t) - (qtrx->qt_ino_res - - qtrx->qt_ino_res_used); - } else { - if (qtrx->qt_icount_delta) - dqp->q_res_icount += - (xfs_qcnt_t)qtrx->qt_icount_delta; - } - - ASSERT(dqp->q_res_bcount >= - be64_to_cpu(dqp->q_core.d_bcount)); - ASSERT(dqp->q_res_icount >= - be64_to_cpu(dqp->q_core.d_icount)); - ASSERT(dqp->q_res_rtbcount >= - be64_to_cpu(dqp->q_core.d_rtbcount)); + ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used); + xfs_apply_quota_reservation_deltas(&dqp->q_ino, + qtrx->qt_ino_res, + qtrx->qt_ino_res_used, + qtrx->qt_icount_delta); + + ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count); + ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count); + ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count); } } } @@ -516,7 +523,7 @@ xfs_trans_unreserve_and_mod_dquots( if (qtrx->qt_blk_res) { xfs_dqlock(dqp); locked = true; - dqp->q_res_bcount -= + dqp->q_blk.reserved -= (xfs_qcnt_t)qtrx->qt_blk_res; } if (qtrx->qt_ino_res) { @@ -524,7 +531,7 @@ xfs_trans_unreserve_and_mod_dquots( xfs_dqlock(dqp); locked = true; } - dqp->q_res_icount -= + dqp->q_ino.reserved -= (xfs_qcnt_t)qtrx->qt_ino_res; } @@ -533,7 +540,7 @@ xfs_trans_unreserve_and_mod_dquots( xfs_dqlock(dqp); locked = true; } - dqp->q_res_rtbcount -= + dqp->q_rtb.reserved -= (xfs_qcnt_t)qtrx->qt_rtblk_res; } if (locked) @@ -549,21 +556,80 @@ xfs_quota_warn( struct xfs_dquot *dqp, int type) { - enum quota_type qtype; + enum quota_type qtype; - if (dqp->dq_flags & XFS_DQ_PROJ) + switch (xfs_dquot_type(dqp)) { + case XFS_DQTYPE_PROJ: qtype = PRJQUOTA; - else if (dqp->dq_flags & XFS_DQ_USER) + break; + case XFS_DQTYPE_USER: qtype = USRQUOTA; - else + break; + case XFS_DQTYPE_GROUP: qtype = GRPQUOTA; + break; + default: + return; + } - quota_send_warning(make_kqid(&init_user_ns, qtype, - be32_to_cpu(dqp->q_core.d_id)), + quota_send_warning(make_kqid(&init_user_ns, qtype, dqp->q_id), mp->m_super->s_dev, type); } /* + * Decide if we can make an additional reservation against a quota resource. + * Returns an inode QUOTA_NL_ warning code and whether or not it's fatal. + * + * Note that we assume that the numeric difference between the inode and block + * warning codes will always be 3 since it's userspace ABI now, and will never + * decrease the quota reservation, so the *BELOW messages are irrelevant. 
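+ * Callers therefore emit the block variant of a warning by adding 3 to the
+ * returned code (QUOTA_NL_IHARDWARN + 3 == QUOTA_NL_BHARDWARN), which the
+ * BUILD_BUG_ONs below verify.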
+ */ +static inline int +xfs_dqresv_check( + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim, + int64_t delta, + bool *fatal) +{ + xfs_qcnt_t hardlimit = res->hardlimit; + xfs_qcnt_t softlimit = res->softlimit; + xfs_qcnt_t total_count = res->reserved + delta; + + BUILD_BUG_ON(QUOTA_NL_BHARDWARN != QUOTA_NL_IHARDWARN + 3); + BUILD_BUG_ON(QUOTA_NL_BSOFTLONGWARN != QUOTA_NL_ISOFTLONGWARN + 3); + BUILD_BUG_ON(QUOTA_NL_BSOFTWARN != QUOTA_NL_ISOFTWARN + 3); + + *fatal = false; + if (delta <= 0) + return QUOTA_NL_NOWARN; + + if (!hardlimit) + hardlimit = qlim->hard; + if (!softlimit) + softlimit = qlim->soft; + + if (hardlimit && total_count > hardlimit) { + *fatal = true; + return QUOTA_NL_IHARDWARN; + } + + if (softlimit && total_count > softlimit) { + time64_t now = ktime_get_real_seconds(); + + if ((res->timer != 0 && now > res->timer) || + (res->warnings != 0 && res->warnings >= qlim->warn)) { + *fatal = true; + return QUOTA_NL_ISOFTLONGWARN; + } + + res->warnings++; + return QUOTA_NL_ISOFTWARN; + } + + return QUOTA_NL_NOWARN; +} + +/* * This reserves disk blocks and inodes against a dquot. * Flags indicate if the dquot is to be locked here and also * if the blk reservation is for RT or regular blocks. @@ -578,110 +644,58 @@ xfs_trans_dqresv( long ninos, uint flags) { - xfs_qcnt_t hardlimit; - xfs_qcnt_t softlimit; - time64_t timer; - xfs_qwarncnt_t warns; - xfs_qwarncnt_t warnlimit; - xfs_qcnt_t total_count; - xfs_qcnt_t *resbcountp; struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_def_quota *defq; - + struct xfs_dquot_res *blkres; + struct xfs_quota_limits *qlim; xfs_dqlock(dqp); - defq = xfs_get_defquota(dqp, q); + defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); if (flags & XFS_TRANS_DQ_RES_BLKS) { - hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); - if (!hardlimit) - hardlimit = defq->bhardlimit; - softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit); - if (!softlimit) - softlimit = defq->bsoftlimit; - timer = be32_to_cpu(dqp->q_core.d_btimer); - warns = be16_to_cpu(dqp->q_core.d_bwarns); - warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit; - resbcountp = &dqp->q_res_bcount; + blkres = &dqp->q_blk; + qlim = &defq->blk; } else { - ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); - hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit); - if (!hardlimit) - hardlimit = defq->rtbhardlimit; - softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit); - if (!softlimit) - softlimit = defq->rtbsoftlimit; - timer = be32_to_cpu(dqp->q_core.d_rtbtimer); - warns = be16_to_cpu(dqp->q_core.d_rtbwarns); - warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit; - resbcountp = &dqp->q_res_rtbcount; + blkres = &dqp->q_rtb; + qlim = &defq->rtb; } - if ((flags & XFS_QMOPT_FORCE_RES) == 0 && - dqp->q_core.d_id && - ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || - (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) || - (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) { - if (nblks > 0) { + if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_id && + xfs_dquot_is_enforced(dqp)) { + int quota_nl; + bool fatal; + + /* + * dquot is locked already. See if we'd go over the hardlimit + * or exceed the timelimit if we'd reserve resources. + */ + quota_nl = xfs_dqresv_check(blkres, qlim, nblks, &fatal); + if (quota_nl != QUOTA_NL_NOWARN) { /* - * dquot is locked already. See if we'd go over the - * hardlimit or exceed the timelimit if we allocate - * nblks. + * Quota block warning codes are 3 more than the inode + * codes, which we check above. 
*/ - total_count = *resbcountp + nblks; - if (hardlimit && total_count > hardlimit) { - xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); + xfs_quota_warn(mp, dqp, quota_nl + 3); + if (fatal) goto error_return; - } - if (softlimit && total_count > softlimit) { - if ((timer != 0 && - ktime_get_real_seconds() > timer) || - (warns != 0 && warns >= warnlimit)) { - xfs_quota_warn(mp, dqp, - QUOTA_NL_BSOFTLONGWARN); - goto error_return; - } - - xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN); - } } - if (ninos > 0) { - total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos; - timer = be32_to_cpu(dqp->q_core.d_itimer); - warns = be16_to_cpu(dqp->q_core.d_iwarns); - warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; - hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); - if (!hardlimit) - hardlimit = defq->ihardlimit; - softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); - if (!softlimit) - softlimit = defq->isoftlimit; - - if (hardlimit && total_count > hardlimit) { - xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); + + quota_nl = xfs_dqresv_check(&dqp->q_ino, &defq->ino, ninos, + &fatal); + if (quota_nl != QUOTA_NL_NOWARN) { + xfs_quota_warn(mp, dqp, quota_nl); + if (fatal) goto error_return; - } - if (softlimit && total_count > softlimit) { - if ((timer != 0 && - ktime_get_real_seconds() > timer) || - (warns != 0 && warns >= warnlimit)) { - xfs_quota_warn(mp, dqp, - QUOTA_NL_ISOFTLONGWARN); - goto error_return; - } - xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN); - } } } /* * Change the reservation, but not the actual usage. - * Note that q_res_bcount = q_core.d_bcount + resv + * Note that q_blk.reserved = q_blk.count + resv */ - (*resbcountp) += (xfs_qcnt_t)nblks; - if (ninos != 0) - dqp->q_res_icount += (xfs_qcnt_t)ninos; + blkres->reserved += (xfs_qcnt_t)nblks; + dqp->q_ino.reserved += (xfs_qcnt_t)ninos; /* * note the reservation amt in the trans struct too, @@ -702,16 +716,16 @@ xfs_trans_dqresv( XFS_TRANS_DQ_RES_INOS, ninos); } - ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount)); - ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); - ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); + ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count); + ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count); + ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count); xfs_dqunlock(dqp); return 0; error_return: xfs_dqunlock(dqp); - if (flags & XFS_QMOPT_ENOSPC) + if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ) return -ENOSPC; return -EDQUOT; } @@ -751,8 +765,7 @@ xfs_trans_reserve_quota_bydquots( ASSERT(flags & XFS_QMOPT_RESBLK_MASK); if (udqp) { - error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, - (flags & ~XFS_QMOPT_ENOSPC)); + error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags); if (error) return error; } @@ -803,16 +816,12 @@ xfs_trans_reserve_quota_nblks( if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; - if (XFS_IS_PQUOTA_ON(mp)) - flags |= XFS_QMOPT_ENOSPC; ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == - XFS_TRANS_DQ_RES_RTBLKS || - (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == - XFS_TRANS_DQ_RES_BLKS); + ASSERT((flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_RTBLKS || + (flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_BLKS); /* * Reserve nblks against these dquots, with trans as the mediator. 
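The xfs_dqresv_check() refactor above collapses the previously duplicated block and inode checks into one hard/soft-limit pattern. As a minimal standalone sketch of that pattern, with simplified stand-in types and names (struct resv, resv_check, enum warn are illustrative, not the kernel's) and omitting the warning-count escalation the kernel also applies:

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

/* Simplified stand-in for struct xfs_dquot_res plus its default limits. */
struct resv {
	uint64_t reserved;	/* usage + outstanding reservations */
	uint64_t hardlimit;	/* 0 means "fall back to the default" */
	uint64_t softlimit;
	time_t   timer;		/* soft-limit grace deadline, 0 if unset */
};

enum warn { WARN_NONE, WARN_SOFT, WARN_SOFT_EXPIRED, WARN_HARD };

/* Check one resource; mirrors the shape of xfs_dqresv_check(). */
static enum warn resv_check(const struct resv *r, const struct resv *dflt,
			    int64_t delta, bool *fatal)
{
	uint64_t hard = r->hardlimit ? r->hardlimit : dflt->hardlimit;
	uint64_t soft = r->softlimit ? r->softlimit : dflt->softlimit;
	uint64_t total = r->reserved + delta;

	*fatal = false;
	if (delta <= 0)
		return WARN_NONE;	/* frees never fail a quota check */

	if (hard && total > hard) {
		*fatal = true;		/* would cross the hard limit */
		return WARN_HARD;
	}
	if (soft && total > soft) {
		if (r->timer && time(NULL) > r->timer) {
			*fatal = true;	/* grace period has expired */
			return WARN_SOFT_EXPIRED;
		}
		return WARN_SOFT;	/* allowed, but warn */
	}
	return WARN_NONE;
}

The delta <= 0 early return matches the kernel's rule that releasing a reservation can never fail the check.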
@@ -865,7 +874,8 @@ STATIC void xfs_trans_alloc_dqinfo( xfs_trans_t *tp) { - tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0); + tp->t_dqinfo = kmem_cache_zalloc(xfs_qm_dqtrxzone, + GFP_KERNEL | __GFP_NOFAIL); } void diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 2e073c1c4614..3004aeac9110 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -91,24 +91,13 @@ xfs_trans_ail_update( xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } -bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); -void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->ail_lock); +void xfs_trans_ail_insert(struct xfs_ail *ailp, struct xfs_log_item *lip, + xfs_lsn_t lsn); -static inline void -xfs_trans_ail_remove( - struct xfs_log_item *lip, - int shutdown_type) -{ - struct xfs_ail *ailp = lip->li_ailp; - - spin_lock(&ailp->ail_lock); - /* xfs_trans_ail_delete() drops the AIL lock */ - if (test_bit(XFS_LI_IN_AIL, &lip->li_flags)) - xfs_trans_ail_delete(ailp, lip, shutdown_type); - else - spin_unlock(&ailp->ail_lock); -} +xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn) + __releases(ailp->ail_lock); +void xfs_trans_ail_delete(struct xfs_log_item *lip, int shutdown_type); void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_ail_push_all(struct xfs_ail *); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index b0fedb543f97..bca48b308c02 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -12,53 +12,29 @@ #include "xfs_inode.h" #include "xfs_attr.h" #include "xfs_acl.h" +#include "xfs_da_btree.h" #include <linux/posix_acl_xattr.h> -#include <linux/xattr.h> static int xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *value, size_t size) { - int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(inode); - int error, asize = size; - size_t namelen = strlen(name); - - /* Convert Linux syscall to XFS internal ATTR flags */ - if (!size) { - xflags |= ATTR_KERNOVAL; - value = NULL; - } + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = handler->flags, + .name = name, + .namelen = strlen(name), + .value = value, + .valuelen = size, + }; + int error; - error = xfs_attr_get(ip, name, namelen, (unsigned char **)&value, - &asize, xflags); + error = xfs_attr_get(&args); if (error) return error; - return asize; -} - -void -xfs_forget_acl( - struct inode *inode, - const char *name, - int xflags) -{ - /* - * Invalidate any cached ACLs if the user has bypassed the ACL - * interface. We don't validate the content whatsoever so it is caller - * responsibility to provide data in valid format and ensure i_mode is - * consistent. 
- */ - if (xflags & ATTR_ROOT) { -#ifdef CONFIG_XFS_POSIX_ACL - if (!strcmp(name, SGI_ACL_FILE)) - forget_cached_acl(inode, ACL_TYPE_ACCESS); - else if (!strcmp(name, SGI_ACL_DEFAULT)) - forget_cached_acl(inode, ACL_TYPE_DEFAULT); -#endif - } + return args.valuelen; } static int @@ -66,25 +42,20 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { - int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = handler->flags, + .attr_flags = flags, + .name = name, + .namelen = strlen(name), + .value = (void *)value, + .valuelen = size, + }; int error; - size_t namelen = strlen(name); - - /* Convert Linux syscall to XFS internal ATTR flags */ - if (flags & XATTR_CREATE) - xflags |= ATTR_CREATE; - if (flags & XATTR_REPLACE) - xflags |= ATTR_REPLACE; - - if (value) - error = xfs_attr_set(ip, name, namelen, (void *)value, size, - xflags); - else - error = xfs_attr_remove(ip, name, namelen, xflags); - if (!error) - xfs_forget_acl(inode, name, xflags); + error = xfs_attr_set(&args); + if (!error && (handler->flags & XFS_ATTR_ROOT)) + xfs_forget_acl(inode, name); return error; } @@ -97,14 +68,14 @@ static const struct xattr_handler xfs_xattr_user_handler = { static const struct xattr_handler xfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .flags = ATTR_ROOT, + .flags = XFS_ATTR_ROOT, .get = xfs_xattr_get, .set = xfs_xattr_set, }; static const struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .flags = ATTR_SECURE, + .flags = XFS_ATTR_SECURE, .get = xfs_xattr_get, .set = xfs_xattr_set, }; @@ -134,7 +105,7 @@ __xfs_xattr_put_listent( if (context->count < 0 || context->seen_enough) return; - if (!context->alist) + if (!context->buffer) goto compute_size; arraytop = context->count + prefix_len + namelen + 1; @@ -143,7 +114,7 @@ context->seen_enough = 1; return; } - offset = (char *)context->alist + context->count; + offset = context->buffer + context->count; strncpy(offset, prefix, prefix_len); offset += prefix_len; strncpy(offset, (char *)name, namelen); /* real name */ @@ -218,7 +189,6 @@ xfs_vn_listxattr( size_t size) { struct xfs_attr_list_context context; - struct attrlist_cursor_kern cursor = { 0 }; struct inode *inode = d_inode(dentry); int error; @@ -227,14 +197,13 @@ xfs_vn_listxattr( */ memset(&context, 0, sizeof(context)); context.dp = XFS_I(inode); - context.cursor = &cursor; context.resynch = 1; - context.alist = size ? data : NULL; + context.buffer = size ? data : NULL; context.bufsize = size; context.firstu = context.bufsize; context.put_listent = xfs_xattr_put_listent; - error = xfs_attr_list_int(&context); + error = xfs_attr_list(&context); if (error) return error; if (context.count < 0)
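The listxattr conversion above (alist becoming a plain buffer) keeps the long-standing two-pass contract: a NULL buffer asks only for the required size, while a real buffer is filled until an entry would overflow, at which point a negative count is latched. A standalone sketch of that fill pattern, using illustrative names (listent_ctx, put_listent) rather than the kernel's:

#include <string.h>
#include <sys/types.h>

/* Simplified stand-in for struct xfs_attr_list_context. */
struct listent_ctx {
	char	*buffer;	/* NULL => size-computation pass */
	size_t	 bufsize;
	ssize_t	 count;		/* bytes used so far, < 0 once overflowed */
};

/* Append one "prefix" + "name" + NUL entry, as the put_listent hook does. */
static void put_listent(struct listent_ctx *ctx, const char *prefix,
			const char *name)
{
	size_t prefix_len = strlen(prefix);
	size_t namelen = strlen(name);
	size_t need = prefix_len + namelen + 1;	/* include trailing NUL */

	if (ctx->count < 0)
		return;			/* overflow already latched */

	if (!ctx->buffer) {		/* first pass: just measure */
		ctx->count += need;
		return;
	}
	if ((size_t)ctx->count + need > ctx->bufsize) {
		ctx->count = -1;	/* caller turns this into an error */
		return;
	}
	memcpy(ctx->buffer + ctx->count, prefix, prefix_len);
	memcpy(ctx->buffer + ctx->count + prefix_len, name, namelen);
	ctx->buffer[ctx->count + need - 1] = '\0';
	ctx->count += need;
}

Latching the negative count, rather than returning an error from the callback, lets the attribute walk keep calling the hook without re-checking the buffer at every step.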