From f88ae46b09e93ef07ac9efaf85df62adb5ba58e6 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 27 Jan 2017 23:16:37 -0800 Subject: xfs: glean crc status from mp not flags in xfs_btree_init_block_int xfs_btree_init_block_int() can determine whether crcs are in effect without the passed-in XFS_BTREE_CRC_BLOCKS flag; the mp argument allows us to determine this from the superblock. Remove the flag from callers, and use xfs_sb_version_hascrc(&mp->m_sb) internally instead. This removes one difference between the if & else cases in the callers. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 4 ++-- fs/xfs/libxfs/xfs_bmap_btree.c | 2 +- fs/xfs/libxfs/xfs_btree.c | 6 ++++-- fs/xfs/xfs_fsops.c | 14 ++++++-------- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index bfc00de5c6f1..1d4b8d5edaaf 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -743,7 +743,7 @@ xfs_bmap_extents_to_btree( if (xfs_sb_version_hascrc(&mp->m_sb)) xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino, - XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + XFS_BTREE_LONG_PTRS); else xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, XFS_BMAP_MAGIC, 1, 1, ip->i_ino, @@ -820,7 +820,7 @@ try_another_ag: if (xfs_sb_version_hascrc(&mp->m_sb)) xfs_btree_init_block_int(mp, ablock, abp->b_bn, XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + XFS_BTREE_LONG_PTRS); else xfs_btree_init_block_int(mp, ablock, abp->b_bn, XFS_BMAP_MAGIC, 0, 0, ip->i_ino, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index d9be241fc86f..a80bf8080b1c 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -74,7 +74,7 @@ xfs_bmdr_to_bmbt( if (xfs_sb_version_hascrc(&mp->m_sb)) xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + XFS_BTREE_LONG_PTRS); else xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, XFS_BMAP_MAGIC, 0, 0, ip->i_ino, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 21e6a6ab6b9a..c91823c202b6 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1090,6 +1090,8 @@ xfs_btree_init_block_int( __u64 owner, unsigned int flags) { + int crc = xfs_sb_version_hascrc(&mp->m_sb); + buf->bb_magic = cpu_to_be32(magic); buf->bb_level = cpu_to_be16(level); buf->bb_numrecs = cpu_to_be16(numrecs); @@ -1097,7 +1099,7 @@ xfs_btree_init_block_int( if (flags & XFS_BTREE_LONG_PTRS) { buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); - if (flags & XFS_BTREE_CRC_BLOCKS) { + if (crc) { buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); buf->bb_u.l.bb_owner = cpu_to_be64(owner); uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid); @@ -1110,7 +1112,7 @@ xfs_btree_init_block_int( buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - if (flags & XFS_BTREE_CRC_BLOCKS) { + if (crc) { buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); buf->bb_u.s.bb_owner = cpu_to_be32(__owner); uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 242e8091296d..21e3cdbaebbc 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -354,7 +354,7 @@ xfs_growfs_data_private( if (xfs_sb_version_hascrc(&mp->m_sb)) xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1, - agno, XFS_BTREE_CRC_BLOCKS); + agno, 0); else xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, agno, 0); @@ -383,7 +383,7 @@ xfs_growfs_data_private( if (xfs_sb_version_hascrc(&mp->m_sb)) xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1, - agno, XFS_BTREE_CRC_BLOCKS); + agno, 0); else xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, agno, 0); @@ -414,7 +414,7 @@ xfs_growfs_data_private( } xfs_btree_init_block(mp, bp, XFS_RMAP_CRC_MAGIC, 0, 0, - agno, XFS_BTREE_CRC_BLOCKS); + agno, 0); block = XFS_BUF_TO_BLOCK(bp); @@ -490,7 +490,7 @@ xfs_growfs_data_private( if (xfs_sb_version_hascrc(&mp->m_sb)) xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0, - agno, XFS_BTREE_CRC_BLOCKS); + agno, 0); else xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, agno, 0); @@ -515,8 +515,7 @@ xfs_growfs_data_private( if (xfs_sb_version_hascrc(&mp->m_sb)) xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC, - 0, 0, agno, - XFS_BTREE_CRC_BLOCKS); + 0, 0, agno, 0); else xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0, 0, agno, 0); @@ -541,8 +540,7 @@ xfs_growfs_data_private( } xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC, - 0, 0, agno, - XFS_BTREE_CRC_BLOCKS); + 0, 0, agno, 0); error = xfs_bwrite(bp); xfs_buf_relse(bp); -- cgit v1.2.3 From af7d20fd83d9e2b3111a847e4220bf943e2d531c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 27 Jan 2017 23:16:38 -0800 Subject: xfs: make xfs_btree_magic more generic Right now the xfs_btree_magic() define takes only a cursor; change this to take crc and btnum args to make it more generically useful, and move to a function. This will allow xfs_btree_init_block_int callers which don't have a cursor to make use of the xfs_magics array, which will happen in the next patch. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 34 ++++++++++++++++++++++++++-------- fs/xfs/libxfs/xfs_btree.h | 2 ++ 2 files changed, 28 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c91823c202b6..18afab315445 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -50,8 +50,18 @@ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC, XFS_REFC_CRC_MAGIC } }; -#define xfs_btree_magic(cur) \ - xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] + +__uint32_t +xfs_btree_magic( + int crc, + xfs_btnum_t btnum) +{ + __uint32_t magic = xfs_magics[crc][btnum]; + + /* Ensure we asked for crc for crc-only magics. */ + ASSERT(magic != 0); + return magic; +} STATIC int /* error (0 or EFSCORRUPTED) */ xfs_btree_check_lblock( @@ -62,10 +72,13 @@ xfs_btree_check_lblock( { int lblock_ok = 1; /* block passes checks */ struct xfs_mount *mp; /* file system mount point */ + xfs_btnum_t btnum = cur->bc_btnum; + int crc; mp = cur->bc_mp; + crc = xfs_sb_version_hascrc(&mp->m_sb); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (crc) { lblock_ok = lblock_ok && uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid) && @@ -74,7 +87,7 @@ xfs_btree_check_lblock( } lblock_ok = lblock_ok && - be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && @@ -110,13 +123,16 @@ xfs_btree_check_sblock( struct xfs_agf *agf; /* ag. freespace structure */ xfs_agblock_t agflen; /* native ag. freespace length */ int sblock_ok = 1; /* block passes checks */ + xfs_btnum_t btnum = cur->bc_btnum; + int crc; mp = cur->bc_mp; + crc = xfs_sb_version_hascrc(&mp->m_sb); agbp = cur->bc_private.a.agbp; agf = XFS_BUF_TO_AGF(agbp); agflen = be32_to_cpu(agf->agf_length); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (crc) { sblock_ok = sblock_ok && uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid) && @@ -125,7 +141,7 @@ xfs_btree_check_sblock( } sblock_ok = sblock_ok && - be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && @@ -1142,7 +1158,9 @@ xfs_btree_init_block_cur( int level, int numrecs) { - __u64 owner; + __u64 owner; + int crc = xfs_sb_version_hascrc(&cur->bc_mp->m_sb); + xfs_btnum_t btnum = cur->bc_btnum; /* * we can pull the owner from the cursor right now as the different @@ -1156,7 +1174,7 @@ xfs_btree_init_block_cur( owner = cur->bc_private.a.agno; xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, - xfs_btree_magic(cur), level, numrecs, + xfs_btree_magic(crc, btnum), level, numrecs, owner, cur->bc_flags); } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index b69b947c4c1b..95ea6ed0c14b 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -76,6 +76,8 @@ union xfs_btree_rec { #define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) #define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) +__uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); + /* * For logging record fields. */ -- cgit v1.2.3 From b6f41e448277ff080fea734b93121e6cd7513f0c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 27 Jan 2017 23:16:39 -0800 Subject: xfs: remove boilerplate around xfs_btree_init_block Now that xfs_btree_init_block_int is able to determine crc status from the passed-in mp, we can determine the proper magic as well if we are given a btree number, rather than an explicit magic value. Change xfs_btree_init_block[_int] callers to pass in the btree number, and let xfs_btree_init_block_int use the xfs_magics array via the xfs_btree_magic macro to determine which magic value is needed. This makes all of the if (crc) / else stanzas identical, and the if/else can be removed, leading to a single, common init_block call. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 19 ++++--------------- fs/xfs/libxfs/xfs_bmap_btree.c | 10 ++-------- fs/xfs/libxfs/xfs_btree.c | 11 +++++------ fs/xfs/libxfs/xfs_btree.h | 4 ++-- fs/xfs/xfs_fsops.c | 31 ++++++------------------------- 5 files changed, 19 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 1d4b8d5edaaf..d3da53e6a927 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -740,15 +740,9 @@ xfs_bmap_extents_to_btree( * Fill in the root. */ block = ifp->if_broot; - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, - XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino, + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BTNUM_BMAP, 1, 1, ip->i_ino, XFS_BTREE_LONG_PTRS); - else - xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, - XFS_BMAP_MAGIC, 1, 1, ip->i_ino, - XFS_BTREE_LONG_PTRS); - /* * Need a cursor. Can't allocate until bb_level is filled in. */ @@ -817,13 +811,8 @@ try_another_ag: */ abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block_int(mp, ablock, abp->b_bn, - XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS); - else - xfs_btree_init_block_int(mp, ablock, abp->b_bn, - XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BTNUM_BMAP, 0, 0, ip->i_ino, XFS_BTREE_LONG_PTRS); arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index a80bf8080b1c..f93072b58a58 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -71,15 +71,9 @@ xfs_bmdr_to_bmbt( xfs_bmbt_key_t *tkp; __be64 *tpp; - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, - XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS); - else - xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, - XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BTNUM_BMAP, 0, 0, ip->i_ino, XFS_BTREE_LONG_PTRS); - rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 18afab315445..421efa0ef778 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1100,13 +1100,14 @@ xfs_btree_init_block_int( struct xfs_mount *mp, struct xfs_btree_block *buf, xfs_daddr_t blkno, - __u32 magic, + xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, unsigned int flags) { int crc = xfs_sb_version_hascrc(&mp->m_sb); + __u32 magic = xfs_btree_magic(crc, btnum); buf->bb_magic = cpu_to_be32(magic); buf->bb_level = cpu_to_be16(level); @@ -1141,14 +1142,14 @@ void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_buf *bp, - __u32 magic, + xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, unsigned int flags) { xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, - magic, level, numrecs, owner, flags); + btnum, level, numrecs, owner, flags); } STATIC void @@ -1159,8 +1160,6 @@ xfs_btree_init_block_cur( int numrecs) { __u64 owner; - int crc = xfs_sb_version_hascrc(&cur->bc_mp->m_sb); - xfs_btnum_t btnum = cur->bc_btnum; /* * we can pull the owner from the cursor right now as the different @@ -1174,7 +1173,7 @@ xfs_btree_init_block_cur( owner = cur->bc_private.a.agno; xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, - xfs_btree_magic(crc, btnum), level, numrecs, + cur->bc_btnum, level, numrecs, owner, cur->bc_flags); } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 95ea6ed0c14b..cdd4f05a5976 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -380,7 +380,7 @@ void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_buf *bp, - __u32 magic, + xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, @@ -391,7 +391,7 @@ xfs_btree_init_block_int( struct xfs_mount *mp, struct xfs_btree_block *buf, xfs_daddr_t blkno, - __u32 magic, + xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 21e3cdbaebbc..6ccaae9eb0ee 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -352,12 +352,7 @@ xfs_growfs_data_private( goto error0; } - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1, - agno, 0); - else - xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, - agno, 0); + xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); @@ -381,12 +376,7 @@ xfs_growfs_data_private( goto error0; } - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1, - agno, 0); - else - xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, - agno, 0); + xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); @@ -413,7 +403,7 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_RMAP_CRC_MAGIC, 0, 0, + xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0, agno, 0); block = XFS_BUF_TO_BLOCK(bp); @@ -488,12 +478,7 @@ xfs_growfs_data_private( goto error0; } - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0, - agno, 0); - else - xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, - agno, 0); + xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0); error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -513,12 +498,8 @@ xfs_growfs_data_private( goto error0; } - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC, + xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO, 0, 0, agno, 0); - else - xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0, - 0, agno, 0); error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -539,7 +520,7 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC, + xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC, 0, 0, agno, 0); error = xfs_bwrite(bp); -- cgit v1.2.3 From 8ff6daa17b6a64e59bbabaa116b9bd854fa4da1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Jan 2017 23:20:26 -0800 Subject: iomap: constify struct iomap_ops Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/dax.c | 6 +++--- fs/ext2/ext2.h | 2 +- fs/ext2/inode.c | 4 ++-- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 2 +- fs/internal.h | 2 +- fs/iomap.c | 18 +++++++++--------- fs/xfs/xfs_iomap.c | 4 ++-- fs/xfs/xfs_iomap.h | 4 ++-- include/linux/dax.h | 8 ++++---- include/linux/iomap.h | 14 +++++++------- 11 files changed, 33 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 3af2da5e64ce..78b9651576c6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1074,7 +1074,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, */ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, - struct iomap_ops *ops) + const struct iomap_ops *ops) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; @@ -1118,7 +1118,7 @@ static int dax_fault_return(int error) * necessary locking for the page fault to proceed successfully. */ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops) + const struct iomap_ops *ops) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; @@ -1317,7 +1317,7 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, } int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, struct iomap_ops *ops) + pmd_t *pmd, unsigned int flags, const struct iomap_ops *ops) { struct address_space *mapping = vma->vm_file->f_mapping; unsigned long pmd_addr = address & PMD_MASK; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 37e2be784ac7..5e64de9c5093 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -814,7 +814,7 @@ extern const struct file_operations ext2_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; extern const struct address_space_operations ext2_nobh_aops; -extern struct iomap_ops ext2_iomap_ops; +extern const struct iomap_ops ext2_iomap_ops; /* namei.c */ extern const struct inode_operations ext2_dir_inode_operations; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index f073bfca694b..128cce540645 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -842,13 +842,13 @@ ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, return 0; } -struct iomap_ops ext2_iomap_ops = { +const struct iomap_ops ext2_iomap_ops = { .iomap_begin = ext2_iomap_begin, .iomap_end = ext2_iomap_end, }; #else /* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */ -struct iomap_ops ext2_iomap_ops; +const struct iomap_ops ext2_iomap_ops; #endif /* CONFIG_FS_DAX */ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2163c1e69f2a..ce70403c4707 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3253,7 +3253,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } } -extern struct iomap_ops ext4_iomap_ops; +extern const struct iomap_ops ext4_iomap_ops; #endif /* __KERNEL__ */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 88d57af1b516..96c2e12cc5d6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3420,7 +3420,7 @@ orphan_del: return ret; } -struct iomap_ops ext4_iomap_ops = { +const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, .iomap_end = ext4_iomap_end, }; diff --git a/fs/internal.h b/fs/internal.h index b63cf3af2dc2..11c6d89dce9c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -182,7 +182,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, void *data, struct iomap *iomap); loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, - unsigned flags, struct iomap_ops *ops, void *data, + unsigned flags, const struct iomap_ops *ops, void *data, iomap_actor_t actor); /* direct-io.c: */ diff --git a/fs/iomap.c b/fs/iomap.c index 354a123f170e..7f08ca03d95d 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -41,7 +41,7 @@ */ loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, - struct iomap_ops *ops, void *data, iomap_actor_t actor) + const struct iomap_ops *ops, void *data, iomap_actor_t actor) { struct iomap iomap = { 0 }; loff_t written = 0, ret; @@ -232,7 +232,7 @@ again: ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, - struct iomap_ops *ops) + const struct iomap_ops *ops) { struct inode *inode = iocb->ki_filp->f_mapping->host; loff_t pos = iocb->ki_pos, ret = 0, written = 0; @@ -315,7 +315,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, - struct iomap_ops *ops) + const struct iomap_ops *ops) { loff_t ret; @@ -395,7 +395,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - struct iomap_ops *ops) + const struct iomap_ops *ops) { loff_t ret; @@ -415,7 +415,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - struct iomap_ops *ops) + const struct iomap_ops *ops) { unsigned blocksize = (1 << inode->i_blkbits); unsigned off = pos & (blocksize - 1); @@ -443,7 +443,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, } int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops) + const struct iomap_ops *ops) { struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); @@ -542,7 +542,7 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, - loff_t start, loff_t len, struct iomap_ops *ops) + loff_t start, loff_t len, const struct iomap_ops *ops) { struct fiemap_ctx ctx; loff_t ret; @@ -836,8 +836,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, } ssize_t -iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops, - iomap_dio_end_io_t end_io) +iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + const struct iomap_ops *ops, iomap_dio_end_io_t end_io) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1aa3abd67b36..25ed98324b27 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1144,7 +1144,7 @@ xfs_file_iomap_end( return 0; } -struct iomap_ops xfs_iomap_ops = { +const struct iomap_ops xfs_iomap_ops = { .iomap_begin = xfs_file_iomap_begin, .iomap_end = xfs_file_iomap_end, }; @@ -1190,6 +1190,6 @@ out_unlock: return error; } -struct iomap_ops xfs_xattr_iomap_ops = { +const struct iomap_ops xfs_xattr_iomap_ops = { .iomap_begin = xfs_xattr_iomap_begin, }; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 6d45cf01fcff..705224b66b6a 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -33,7 +33,7 @@ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); -extern struct iomap_ops xfs_iomap_ops; -extern struct iomap_ops xfs_xattr_iomap_ops; +extern const struct iomap_ops xfs_iomap_ops; +extern const struct iomap_ops xfs_xattr_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/include/linux/dax.h b/include/linux/dax.h index 24ad71173995..2983e52efd07 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -37,9 +37,9 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags) } ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, - struct iomap_ops *ops); + const struct iomap_ops *ops); int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops); + const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, @@ -72,7 +72,7 @@ static inline unsigned int dax_radix_order(void *entry) return 0; } int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, struct iomap_ops *ops); + pmd_t *pmd, unsigned int flags, const struct iomap_ops *ops); #else static inline unsigned int dax_radix_order(void *entry) { @@ -80,7 +80,7 @@ static inline unsigned int dax_radix_order(void *entry) } static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, - struct iomap_ops *ops) + const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index a4c94b86401e..891459caa278 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -72,17 +72,17 @@ struct iomap_ops { }; ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, - struct iomap_ops *ops); + const struct iomap_ops *ops); int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, - struct iomap_ops *ops); + const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, - bool *did_zero, struct iomap_ops *ops); + bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - struct iomap_ops *ops); + const struct iomap_ops *ops); int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops); + const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - loff_t start, loff_t len, struct iomap_ops *ops); + loff_t start, loff_t len, const struct iomap_ops *ops); /* * Flags for direct I/O ->end_io: @@ -92,6 +92,6 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret, unsigned flags); ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, - struct iomap_ops *ops, iomap_dio_end_io_t end_io); + const struct iomap_ops *ops, iomap_dio_end_io_t end_io); #endif /* LINUX_IOMAP_H */ -- cgit v1.2.3 From 64f61ab6040c9f04ba181cca7580212f23b89f74 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 27 Jan 2017 23:21:08 -0800 Subject: xfs: remove unused struct declarations After scratching my head looking for "xfs_busy_extent" I realized it's not used; it's xfs_extent_busy, and the declaration for the other name is bogus. Remove that and a few others as well. (struct xfs_log_callback is used, but the 2nd declaration is unnecessary). Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_log_recover.h | 1 - fs/xfs/xfs_log.h | 1 - fs/xfs/xfs_trace.h | 1 - fs/xfs/xfs_trans.h | 1 - 4 files changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index d9f65e2d5cc8..29a01ec89dd0 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -42,7 +42,6 @@ typedef struct xlog_recover_item { xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ } xlog_recover_item_t; -struct xlog_tid; typedef struct xlog_recover { struct hlist_node r_list; xlog_tid_t r_log_tid; /* log's transaction id */ diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index b5e71072fde5..cc5a9f1574e7 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -124,7 +124,6 @@ struct xlog_ticket; struct xfs_log_item; struct xfs_item_ops; struct xfs_trans; -struct xfs_log_callback; xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 69c5bcd9a51b..643222784c3b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2245,7 +2245,6 @@ DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range); /* deferred ops */ struct xfs_defer_pending; -struct xfs_defer_intake; struct xfs_defer_ops; DECLARE_EVENT_CLASS(xfs_defer_class, diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 61b7fbdd3ebd..1646f659b60f 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -32,7 +32,6 @@ struct xfs_mount; struct xfs_trans; struct xfs_trans_res; struct xfs_dquot_acct; -struct xfs_busy_extent; struct xfs_rud_log_item; struct xfs_rui_log_item; struct xfs_btree_cur; -- cgit v1.2.3 From a36b926180cda375ac2ec89e1748b47137cfc51c Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 27 Jan 2017 23:22:55 -0800 Subject: xfs: pull up iolock from xfs_free_eofblocks() xfs_free_eofblocks() requires the IOLOCK_EXCL lock, but is called from different contexts where the lock may or may not be held. The need_iolock parameter exists for this reason, to indicate whether xfs_free_eofblocks() must acquire the iolock itself before it can proceed. This is ugly and confusing. Simplify the semantics of xfs_free_eofblocks() to require the caller to acquire the iolock appropriately and kill the need_iolock parameter. While here, the mp param can be removed as well as the xfs_mount is accessible from the xfs_inode structure. This patch does not change behavior. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 41 ++++++++++++++++------------------------ fs/xfs/xfs_bmap_util.h | 3 +-- fs/xfs/xfs_icache.c | 24 +++++++++++++++--------- fs/xfs/xfs_inode.c | 51 +++++++++++++++++++++++++++----------------------- 4 files changed, 60 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index c1417919ab0a..9319ee9759d4 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -917,17 +917,18 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) */ int xfs_free_eofblocks( - xfs_mount_t *mp, - xfs_inode_t *ip, - bool need_iolock) + struct xfs_inode *ip) { - xfs_trans_t *tp; - int error; - xfs_fileoff_t end_fsb; - xfs_fileoff_t last_fsb; - xfs_filblks_t map_len; - int nimaps; - xfs_bmbt_irec_t imap; + struct xfs_trans *tp; + int error; + xfs_fileoff_t end_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t map_len; + int nimaps; + struct xfs_bmbt_irec imap; + struct xfs_mount *mp = ip->i_mount; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); /* * Figure out if there are any blocks beyond the end @@ -944,6 +945,10 @@ xfs_free_eofblocks( error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); xfs_iunlock(ip, XFS_ILOCK_SHARED); + /* + * If there are blocks after the end of file, truncate the file to its + * current size to free them up. + */ if (!error && (nimaps != 0) && (imap.br_startblock != HOLESTARTBLOCK || ip->i_delayed_blks)) { @@ -954,22 +959,10 @@ xfs_free_eofblocks( if (error) return error; - /* - * There are blocks after the end of file. - * Free them up now by truncating the file to - * its current size. - */ - if (need_iolock) { - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) - return -EAGAIN; - } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; } @@ -997,8 +990,6 @@ xfs_free_eofblocks( } xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); } return error; } @@ -1415,7 +1406,7 @@ xfs_shift_file_space( * into the accessible region of the file. */ if (xfs_can_free_eofblocks(ip, true)) { - error = xfs_free_eofblocks(mp, ip, false); + error = xfs_free_eofblocks(ip); if (error) return error; } diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 68a621a8e0c0..f1005393785c 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -63,8 +63,7 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); -int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, - bool need_iolock); +int xfs_free_eofblocks(struct xfs_inode *ip); int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, struct xfs_swapext *sx); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 70ca4f608321..c6b698f0fed9 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1322,7 +1322,7 @@ xfs_inode_free_eofblocks( int flags, void *args) { - int ret; + int ret = 0; struct xfs_eofblocks *eofb = args; bool need_iolock = true; int match; @@ -1358,19 +1358,25 @@ xfs_inode_free_eofblocks( return 0; /* - * A scan owner implies we already hold the iolock. Skip it in - * xfs_free_eofblocks() to avoid deadlock. This also eliminates - * the possibility of EAGAIN being returned. + * A scan owner implies we already hold the iolock. Skip it here + * to avoid deadlock. */ if (eofb->eof_scan_owner == ip->i_ino) need_iolock = false; } - ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock); - - /* don't revisit the inode if we're not waiting */ - if (ret == -EAGAIN && !(flags & SYNC_WAIT)) - ret = 0; + /* + * If the caller is waiting, return -EAGAIN to keep the background + * scanner moving and revisit the inode in a subsequent pass. + */ + if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (flags & SYNC_WAIT) + ret = -EAGAIN; + return ret; + } + ret = xfs_free_eofblocks(ip); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index de32f0fe47c8..edfa6a55b064 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1691,33 +1691,35 @@ xfs_release( if (xfs_can_free_eofblocks(ip, false)) { + /* + * Check if the inode is being opened, written and closed + * frequently and we have delayed allocation blocks outstanding + * (e.g. streaming writes from the NFS server), truncating the + * blocks past EOF will cause fragmentation to occur. + * + * In this case don't do the truncation, but we have to be + * careful how we detect this case. Blocks beyond EOF show up as + * i_delayed_blks even when the inode is clean, so we need to + * truncate them away first before checking for a dirty release. + * Hence on the first dirty close we will still remove the + * speculative allocation, but after that we will leave it in + * place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; /* * If we can't get the iolock just skip truncating the blocks * past EOF because we could deadlock with the mmap_sem - * otherwise. We'll get another chance to drop them once the + * otherwise. We'll get another chance to drop them once the * last reference to the inode is dropped, so we'll never leak * blocks permanently. - * - * Further, check if the inode is being opened, written and - * closed frequently and we have delayed allocation blocks - * outstanding (e.g. streaming writes from the NFS server), - * truncating the blocks past EOF will cause fragmentation to - * occur. - * - * In this case don't do the truncation, either, but we have to - * be careful how we detect this case. Blocks beyond EOF show - * up as i_delayed_blks even when the inode is clean, so we - * need to truncate them away first before checking for a dirty - * release. Hence on the first dirty close we will still remove - * the speculative allocation, but after that we will leave it - * in place. */ - if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) - return 0; - - error = xfs_free_eofblocks(mp, ip, true); - if (error && error != -EAGAIN) - return error; + if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + error = xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + if (error) + return error; + } /* delalloc blocks after truncation means it really is dirty */ if (ip->i_delayed_blks) @@ -1904,8 +1906,11 @@ xfs_inactive( * cache. Post-eof blocks must be freed, lest we end up with * broken free space accounting. */ - if (xfs_can_free_eofblocks(ip, true)) - xfs_free_eofblocks(mp, ip, false); + if (xfs_can_free_eofblocks(ip, true)) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } return; } -- cgit v1.2.3 From c3155097ad89a956579bc305856a1f2878494e52 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 27 Jan 2017 23:22:56 -0800 Subject: xfs: sync eofblocks scans under iolock are livelock prone The xfs_eofblocks.eof_scan_owner field is an internal field to facilitate invoking eofb scans from the kernel while under the iolock. This is necessary because the eofb scan acquires the iolock of each inode. Synchronous scans are invoked on certain buffered write failures while under iolock. In such cases, the scan owner indicates that the context for the scan already owns the particular iolock and prevents a double lock deadlock. eofblocks scans while under iolock are still livelock prone in the event of multiple parallel scans, however. If multiple buffered writes to different inodes fail and invoke eofblocks scans at the same time, each scan avoids a deadlock with its own inode by virtue of the eof_scan_owner field, but will never be able to acquire the iolock of the inode from the parallel scan. Because the low free space scans are invoked with SYNC_WAIT, the scan will not return until it has processed every tagged inode and thus both scans will spin indefinitely on the iolock being held across the opposite scan. This problem can be reproduced reliably by generic/224 on systems with higher cpu counts (x16). To avoid this problem, simplify the semantics of eofblocks scans to never invoke a scan while under iolock. This means that the buffered write context must drop the iolock before the scan. It must reacquire the lock before the write retry and also repeat the initial write checks, as the original state might no longer be valid once the iolock was dropped. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 13 +++++++++---- fs/xfs/xfs_icache.c | 45 +++++++-------------------------------------- fs/xfs/xfs_icache.h | 2 -- 3 files changed, 16 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index bbb9eb6811b2..0a29739f785e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -614,8 +614,10 @@ xfs_file_buffered_aio_write( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; int enospc = 0; - int iolock = XFS_IOLOCK_EXCL; + int iolock; +write_retry: + iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); @@ -625,7 +627,6 @@ xfs_file_buffered_aio_write( /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); -write_retry: trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) @@ -641,18 +642,21 @@ write_retry: * running at the same time. */ if (ret == -EDQUOT && !enospc) { + xfs_iunlock(ip, iolock); enospc = xfs_inode_free_quota_eofblocks(ip); if (enospc) goto write_retry; enospc = xfs_inode_free_quota_cowblocks(ip); if (enospc) goto write_retry; + iolock = 0; } else if (ret == -ENOSPC && !enospc) { struct xfs_eofblocks eofb = {0}; enospc = 1; xfs_flush_inodes(ip->i_mount); - eofb.eof_scan_owner = ip->i_ino; /* for locking */ + + xfs_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; xfs_icache_free_eofblocks(ip->i_mount, &eofb); goto write_retry; @@ -660,7 +664,8 @@ write_retry: current->backing_dev_info = NULL; out: - xfs_iunlock(ip, iolock); + if (iolock) + xfs_iunlock(ip, iolock); return ret; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c6b698f0fed9..7234b9748c36 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1324,11 +1324,8 @@ xfs_inode_free_eofblocks( { int ret = 0; struct xfs_eofblocks *eofb = args; - bool need_iolock = true; int match; - ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ trace_xfs_inode_free_eofblocks_invalid(ip); @@ -1356,27 +1353,19 @@ xfs_inode_free_eofblocks( if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && XFS_ISIZE(ip) < eofb->eof_min_file_size) return 0; - - /* - * A scan owner implies we already hold the iolock. Skip it here - * to avoid deadlock. - */ - if (eofb->eof_scan_owner == ip->i_ino) - need_iolock = false; } /* * If the caller is waiting, return -EAGAIN to keep the background * scanner moving and revisit the inode in a subsequent pass. */ - if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { if (flags & SYNC_WAIT) ret = -EAGAIN; return ret; } ret = xfs_free_eofblocks(ip); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -1423,15 +1412,10 @@ __xfs_inode_free_quota_eofblocks( struct xfs_eofblocks eofb = {0}; struct xfs_dquot *dq; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - /* - * Set the scan owner to avoid a potential livelock. Otherwise, the scan - * can repeatedly trylock on the inode we're currently processing. We - * run a sync scan to increase effectiveness and use the union filter to + * Run a sync scan to increase effectiveness and use the union filter to * cover all applicable quotas in a single scan. */ - eofb.eof_scan_owner = ip->i_ino; eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { @@ -1583,12 +1567,9 @@ xfs_inode_free_cowblocks( { int ret; struct xfs_eofblocks *eofb = args; - bool need_iolock = true; int match; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - /* * Just clear the tag if we have an empty cow fork or none at all. It's * possible the inode was fully unshared since it was originally tagged. @@ -1621,28 +1602,16 @@ xfs_inode_free_cowblocks( if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && XFS_ISIZE(ip) < eofb->eof_min_file_size) return 0; - - /* - * A scan owner implies we already hold the iolock. Skip it in - * xfs_free_eofblocks() to avoid deadlock. This also eliminates - * the possibility of EAGAIN being returned. - */ - if (eofb->eof_scan_owner == ip->i_ino) - need_iolock = false; } /* Free the CoW blocks */ - if (need_iolock) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - } + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); - if (need_iolock) { - xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - } + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index a1e02f4708ab..8a7c849b4dea 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -27,7 +27,6 @@ struct xfs_eofblocks { kgid_t eof_gid; prid_t eof_prid; __u64 eof_min_file_size; - xfs_ino_t eof_scan_owner; }; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ @@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user( dst->eof_flags = src->eof_flags; dst->eof_prid = src->eof_prid; dst->eof_min_file_size = src->eof_min_file_size; - dst->eof_scan_owner = NULLFSINO; dst->eof_uid = INVALID_UID; if (src->eof_flags & XFS_EOF_FLAGS_UID) { -- cgit v1.2.3 From e4229d6b0bc9280f29624faf170cf76a9f1ca60e Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 27 Jan 2017 23:22:57 -0800 Subject: xfs: fix eofblocks race with file extending async dio writes It's possible for post-eof blocks to end up being used for direct I/O writes. dio write performs an upfront unwritten extent allocation, sends the dio and then updates the inode size (if necessary) on write completion. If a file release occurs while a file extending dio write is in flight, it is possible to mistake the post-eof blocks for speculative preallocation and incorrectly truncate them from the inode. This means that the resulting dio write completion can discover a hole and allocate new blocks rather than perform unwritten extent conversion. This requires a strange mix of I/O and is thus not likely to reproduce in real world workloads. It is intermittently reproduced by generic/299. The error manifests as an assert failure due to transaction overrun because the aforementioned write completion transaction has only reserved enough blocks for btree operations: XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, \ file: fs/xfs//xfs_trans.c, line: 309 The root cause is that xfs_free_eofblocks() uses i_size to truncate post-eof blocks from the inode, but async, file extending direct writes do not update i_size until write completion, long after inode locks are dropped. Therefore, xfs_free_eofblocks() effectively truncates the inode to the incorrect size. Update xfs_free_eofblocks() to serialize against dio similar to how extending writes are serialized against i_size updates before post-eof block zeroing. Specifically, wait on dio while under the iolock. This ensures that dio write completions have updated i_size before post-eof blocks are processed. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 9319ee9759d4..eb890ed1ed5c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -959,6 +959,9 @@ xfs_free_eofblocks( if (error) return error; + /* wait on dio to ensure i_size has settled */ + inode_dio_wait(VFS_I(ip)); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { -- cgit v1.2.3 From 1dbba0863468f509f3fccb498b34f1b1e24356d9 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 27 Jan 2017 23:24:28 -0800 Subject: xfs: remove unused full argument from bmap The "full" argument was used only by the fiemap formatter, which is now gone with the iomap updates. Remove the unused arg. Signed-off-by: Eric Sandeen Reviewed-by: Alex Elder Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 6 ++---- fs/xfs/xfs_bmap_util.h | 2 +- fs/xfs/xfs_ioctl.c | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index eb890ed1ed5c..7c3bfafffba8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -787,11 +787,9 @@ xfs_getbmap( xfs_iunlock(ip, XFS_IOLOCK_SHARED); for (i = 0; i < cur_ext; i++) { - int full = 0; /* user array is full */ - /* format results & advance arg */ - error = formatter(&arg, &out[i], &full); - if (error || full) + error = formatter(&arg, &out[i]); + if (error) break; } diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index f1005393785c..135d8267e284 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -35,7 +35,7 @@ int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); /* bmap to userspace formatter - copy to user & advance pointer */ -typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); +typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *); int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, xfs_bmap_format_t formatter, void *arg); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c67cfb451fd3..cf1363dbf32b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1524,7 +1524,7 @@ out_drop_write: } STATIC int -xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) +xfs_getbmap_format(void **ap, struct getbmapx *bmv) { struct getbmap __user *base = (struct getbmap __user *)*ap; @@ -1567,7 +1567,7 @@ xfs_ioc_getbmap( } STATIC int -xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) +xfs_getbmapx_format(void **ap, struct getbmapx *bmv) { struct getbmapx __user *base = (struct getbmapx __user *)*ap; -- cgit v1.2.3 From 4b5bd5bf3fb182dc504b1b64e0331300f156e756 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 2 Feb 2017 15:13:57 -0800 Subject: xfs: fix toctou race when locking an inode to access the data map We use di_format and if_flags to decide whether we're grabbing the ilock in btree mode (btree extents not loaded) or shared mode (anything else), but the state of those fields can be changed by other threads that are also trying to load the btree extents -- IFEXTENTS gets set before the _bmap_read_extents call and cleared if it fails. We don't actually need to have IFEXTENTS set until after the bmbt records are successfully loaded and validated, which will fix the race between multiple threads trying to read the same directory. The next patch strengthens directory bmbt validation by refusing to open the directory if reading the bmbt to start directory readahead fails. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_fork.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 222e103356c6..421341f93bea 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -497,15 +497,14 @@ xfs_iread_extents( * We know that the size is valid (it's checked in iformat_btree) */ ifp->if_bytes = ifp->if_real_bytes = 0; - ifp->if_flags |= XFS_IFEXTENTS; xfs_iext_add(ifp, 0, nextents); error = xfs_bmap_read_extents(tp, ip, whichfork); if (error) { xfs_iext_destroy(ifp); - ifp->if_flags &= ~XFS_IFEXTENTS; return error; } xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); + ifp->if_flags |= XFS_IFEXTENTS; return 0; } /* -- cgit v1.2.3 From 7a652bbe366464267190c2792a32ce4fff5595ef Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 2 Feb 2017 15:13:58 -0800 Subject: xfs: fail _dir_open when readahead fails When we open a directory, we try to readahead block 0 of the directory on the assumption that we're going to need it soon. If the bmbt is corrupt, the directory will never be usable and the readahead fails immediately, so we might as well prevent the directory from being opened at all. This prevents a subsequent read or modify operation from hitting it and taking the fs offline. NOTE: We're only checking for early failures in the block mapping, not the readahead directory block itself. Signed-off-by: Darrick J. Wong Reviewed-by: Eric Sandeen Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_da_btree.c | 6 ++---- fs/xfs/libxfs/xfs_da_btree.h | 2 +- fs/xfs/xfs_file.c | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index f2dc1a950c85..1bdf2888295b 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2633,7 +2633,7 @@ out_free: /* * Readahead the dir/attr block. */ -xfs_daddr_t +int xfs_da_reada_buf( struct xfs_inode *dp, xfs_dablk_t bno, @@ -2664,7 +2664,5 @@ out_free: if (mapp != &map) kmem_free(mapp); - if (error) - return -1; - return mappedbno; + return error; } diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 98c75cbe6ac2..4e29cb6a3627 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -201,7 +201,7 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork, const struct xfs_buf_ops *ops); -xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, +int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, int whichfork, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 0a29739f785e..032c8a74824a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -913,9 +913,9 @@ xfs_dir_open( */ mode = xfs_ilock_data_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_dir3_data_readahead(ip, 0, -1); + error = xfs_dir3_data_readahead(ip, 0, -1); xfs_iunlock(ip, mode); - return 0; + return error; } STATIC int -- cgit v1.2.3 From d5a91baeb6033c3392121e4d5c011cdc08dfa9f7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 2 Feb 2017 15:13:58 -0800 Subject: xfs: filter out obviously bad btree pointers Don't let anybody load an obviously bad btree pointer. Since the values come from disk, we must return an error, not just ASSERT. Signed-off-by: Darrick J. Wong Reviewed-by: Eric Sandeen --- fs/xfs/libxfs/xfs_bmap.c | 5 +---- fs/xfs/libxfs/xfs_btree.c | 3 ++- fs/xfs/libxfs/xfs_btree.h | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index d3da53e6a927..2e91eb66d32f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1267,7 +1267,6 @@ xfs_bmap_read_extents( /* REFERENCED */ xfs_extnum_t room; /* number of entries there's room for */ - bno = NULLFSBLOCK; mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : @@ -1280,9 +1279,7 @@ xfs_bmap_read_extents( ASSERT(level > 0); pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); - ASSERT(bno != NULLFSBLOCK); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* * Go down the tree until leaf level is reached, following the first * pointer (leftmost) at each level. diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 421efa0ef778..c3decedc9455 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -826,7 +826,8 @@ xfs_btree_read_bufl( xfs_daddr_t d; /* real disk block address */ int error; - ASSERT(fsbno != NULLFSBLOCK); + if (!XFS_FSB_SANITY_CHECK(mp, fsbno)) + return -EFSCORRUPTED; d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, mp->m_bsize, lock, &bp, ops); diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index cdd4f05a5976..4bb62580a7fd 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -458,7 +458,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) #define XFS_FSB_SANITY_CHECK(mp,fsb) \ - (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ + (fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) /* -- cgit v1.2.3 From b3bf607d58520ea8c0666aeb4be60dbb724cd3a2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 2 Feb 2017 15:13:59 -0800 Subject: xfs: check for obviously bad level values in the bmbt root We can't handle a bmbt that's taller than BTREE_MAXLEVELS, and there's no such thing as a zero-level bmbt (for that we have extents format), so if we see this, send back an error code. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_fork.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 421341f93bea..25c1e078aef6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -26,6 +26,7 @@ #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" +#include "xfs_btree.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" @@ -429,11 +430,13 @@ xfs_iformat_btree( /* REFERENCED */ int nrecs; int size; + int level; ifp = XFS_IFORK_PTR(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); size = XFS_BMAP_BROOT_SPACE(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); /* * blow out if -- fork has less extents than can fit in @@ -446,7 +449,8 @@ xfs_iformat_btree( XFS_IFORK_MAXEXT(ip, whichfork) || XFS_BMDR_SPACE_CALC(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || - XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) || + level == 0 || level > XFS_BTREE_MAXLEVELS) { xfs_warn(mp, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, -- cgit v1.2.3 From de14c5f541e78c59006bee56f6c5c2ef1ca07272 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 2 Feb 2017 15:14:00 -0800 Subject: xfs: verify free block header fields Perform basic sanity checking of the directory free block header fields so that we avoid hanging the system on invalid data. (Granted that just means that now we shutdown on directory write, but that seems better than hanging...) Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_dir2_node.c | 51 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 75a557432d0f..bbd1238852b3 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -155,6 +155,42 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = { .verify_write = xfs_dir3_free_write_verify, }; +/* Everything ok in the free block header? */ +static bool +xfs_dir3_free_header_check( + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + unsigned int firstdb; + int maxbests; + + maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo); + firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) - + xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) * + maxbests; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + if (be32_to_cpu(hdr3->firstdb) != firstdb) + return false; + if (be32_to_cpu(hdr3->nvalid) > maxbests) + return false; + if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) + return false; + } else { + struct xfs_dir2_free_hdr *hdr = bp->b_addr; + + if (be32_to_cpu(hdr->firstdb) != firstdb) + return false; + if (be32_to_cpu(hdr->nvalid) > maxbests) + return false; + if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused)) + return false; + } + return true; +} static int __xfs_dir3_free_read( @@ -168,11 +204,22 @@ __xfs_dir3_free_read( err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, XFS_DATA_FORK, &xfs_dir3_free_buf_ops); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. */ + if (!xfs_dir3_free_header_check(dp, fbno, *bpp)) { + xfs_buf_ioerror(*bpp, -EFSCORRUPTED); + xfs_verifier_error(*bpp); + xfs_trans_brelse(tp, *bpp); + return -EFSCORRUPTED; + } /* try read returns without an error or *bpp if it lands in a hole */ - if (!err && tp && *bpp) + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); - return err; + + return 0; } int -- cgit v1.2.3 From 05a630d76bd3f39baf0eecfa305bed2820796dee Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 2 Feb 2017 15:14:01 -0800 Subject: xfs: allow unwritten extents in the CoW fork In the data fork, we only allow extents to perform the following state transitions: delay -> real <-> unwritten There's no way to move directly from a delalloc reservation to an /unwritten/ allocated extent. However, for the CoW fork we want to be able to do the following to each extent: delalloc -> unwritten -> written -> remapped to data fork This will help us to avoid a race in the speculative CoW preallocation code between a first thread that is allocating a CoW extent and a second thread that is remapping part of a file after a write. In order to do this, however, we need two things: first, we have to be able to transition from da to unwritten, and second the function that converts between real and unwritten has to be made aware of the cow fork. Do both of those things. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 80 ++++++++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2e91eb66d32f..dcffbb09444e 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1850,6 +1850,7 @@ xfs_bmap_add_extent_delay_real( */ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); + xfs_bmbt_set_state(ep, new->br_state); trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); (*nextents)++; @@ -2188,6 +2189,7 @@ STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ + int whichfork, xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ @@ -2207,12 +2209,14 @@ xfs_bmap_add_extent_unwritten_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - struct xfs_mount *mp = tp->t_mountp; + struct xfs_mount *mp = ip->i_mount; *logflagsp = 0; cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ASSERT(*idx >= 0); ASSERT(*idx <= xfs_iext_count(ifp)); @@ -2271,7 +2275,7 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < xfs_iext_count(&ip->i_df) - 1) { + if (*idx < xfs_iext_count(ifp) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) @@ -2311,7 +2315,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 2, state); - ip->i_d.di_nextents -= 2; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2354,7 +2359,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 1, state); - ip->i_d.di_nextents--; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2389,7 +2395,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_bmbt_set_state(ep, newext); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 1, state); - ip->i_d.di_nextents--; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2501,7 +2508,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2579,7 +2587,8 @@ xfs_bmap_add_extent_unwritten_real( ++*idx; xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2627,7 +2636,8 @@ xfs_bmap_add_extent_unwritten_real( ++*idx; xfs_iext_insert(ip, *idx, 2, &r[0], state); - ip->i_d.di_nextents += 2; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2681,17 +2691,17 @@ xfs_bmap_add_extent_unwritten_real( } /* update reverse mappings */ - error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new); + error = xfs_rmap_convert_extent(mp, dfops, ip, whichfork, new); if (error) goto done; /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur, - 0, &tmp_logflags, XFS_DATA_FORK); + 0, &tmp_logflags, whichfork); *logflagsp |= tmp_logflags; if (error) goto done; @@ -2703,7 +2713,7 @@ xfs_bmap_add_extent_unwritten_real( *curp = cur; } - xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); + xfs_bmap_check_leaf_extents(*curp, ip, whichfork); done: *logflagsp |= rval; return error; @@ -4354,10 +4364,16 @@ xfs_bmapi_allocate( bma->got.br_state = XFS_EXT_NORM; /* - * A wasdelay extent has been initialized, so shouldn't be flagged - * as unwritten. + * In the data fork, a wasdelay extent has been initialized, so + * shouldn't be flagged as unwritten. + * + * For the cow fork, however, we convert delalloc reservations + * (extents allocated for speculative preallocation) to + * allocated unwritten extents, and only convert the unwritten + * extents to real extents when we're about to write the data. */ - if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && + if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && + (bma->flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; @@ -4408,8 +4424,6 @@ xfs_bmapi_convert_unwritten( (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) return 0; - ASSERT(whichfork != XFS_COW_FORK); - /* * Modify (by adding) the state flag, if writing. */ @@ -4434,8 +4448,8 @@ xfs_bmapi_convert_unwritten( return error; } - error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, - &bma->cur, mval, bma->firstblock, bma->dfops, + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork, + &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops, &tmp_logflags); /* * Log the inode core unconditionally in the unwritten extent conversion @@ -4444,8 +4458,12 @@ xfs_bmapi_convert_unwritten( * in the transaction for the sake of fsync(), even if nothing has * changed, because fsync() will not force the log for this transaction * unless it sees the inode pinned. + * + * Note: If we're only converting cow fork extents, there aren't + * any on-disk updates to make, so we don't need to log anything. */ - bma->logflags |= tmp_logflags | XFS_ILOG_CORE; + if (whichfork != XFS_COW_FORK) + bma->logflags |= tmp_logflags | XFS_ILOG_CORE; if (error) return error; @@ -4519,15 +4537,15 @@ xfs_bmapi_write( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(!(flags & XFS_BMAPI_IGSTATE)); - ASSERT(tp != NULL); + ASSERT(tp != NULL || + (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) == + (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); - ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK); - ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK); /* zeroing is for currently only for data extents, not metadata */ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != @@ -5542,8 +5560,8 @@ __xfs_bunmapi( } del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, - &lastx, &cur, &del, firstblock, dfops, - &logflags); + whichfork, &lastx, &cur, &del, + firstblock, dfops, &logflags); if (error) goto error0; goto nodelete; @@ -5596,8 +5614,9 @@ __xfs_bunmapi( prev.br_state = XFS_EXT_UNWRITTEN; lastx--; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, &lastx, &cur, &prev, - firstblock, dfops, &logflags); + ip, whichfork, &lastx, &cur, + &prev, firstblock, dfops, + &logflags); if (error) goto error0; goto nodelete; @@ -5605,8 +5624,9 @@ __xfs_bunmapi( ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, &lastx, &cur, &del, - firstblock, dfops, &logflags); + ip, whichfork, &lastx, &cur, + &del, firstblock, dfops, + &logflags); if (error) goto error0; goto nodelete; -- cgit v1.2.3 From 5eda43000064a69a39fb7869cc63c9571535ad29 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 2 Feb 2017 15:14:02 -0800 Subject: xfs: mark speculative prealloc CoW fork extents unwritten Christoph Hellwig pointed out that there's a potentially nasty race when performing simultaneous nearby directio cow writes: "Thread 1 writes a range from B to c " B --------- C p "a little later thread 2 writes from A to B " A --------- B p [editor's note: the 'p' denote cowextsize boundaries, which I added to make this more clear] "but the code preallocates beyond B into the range where thread "1 has just written, but ->end_io hasn't been called yet. "But once ->end_io is called thread 2 has already allocated "up to the extent size hint into the write range of thread 1, "so the end_io handler will splice the unintialized blocks from "that preallocation back into the file right after B." We can avoid this race by ensuring that thread 1 cannot accidentally remap the blocks that thread 2 allocated (as part of speculative preallocation) as part of t2's write preparation in t1's end_io handler. The way we make this happen is by taking advantage of the unwritten extent flag as an intermediate step. Recall that when we begin the process of writing data to shared blocks, we create a delayed allocation extent in the CoW fork: D: --RRRRRRSSSRRRRRRRR--- C: ------DDDDDDD--------- When a thread prepares to CoW some dirty data out to disk, it will now convert the delalloc reservation into an /unwritten/ allocated extent in the cow fork. The da conversion code tries to opportunistically allocate as much of a (speculatively prealloc'd) extent as possible, so we may end up allocating a larger extent than we're actually writing out: D: --RRRRRRSSSRRRRRRRR--- U: ------UUUUUUU--------- Next, we convert only the part of the extent that we're actively planning to write to normal (i.e. not unwritten) status: D: --RRRRRRSSSRRRRRRRR--- U: ------UURRUUU--------- If the write succeeds, the end_cow function will now scan the relevant range of the CoW fork for real extents and remap only the real extents into the data fork: D: --RRRRRRRRSRRRRRRRR--- U: ------UU--UUU--------- This ensures that we never obliterate valid data fork extents with unwritten blocks from the CoW fork. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_aops.c | 6 +++ fs/xfs/xfs_iomap.c | 2 +- fs/xfs/xfs_reflink.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++---- fs/xfs/xfs_reflink.h | 2 + fs/xfs/xfs_trace.h | 8 +++- 5 files changed, 123 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 631e7c0e0a29..1ff9df7a3ce8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -481,6 +481,12 @@ xfs_submit_ioend( struct xfs_ioend *ioend, int status) { + /* Convert CoW extents to regular */ + if (!status && ioend->io_type == XFS_IO_COW) { + status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), + ioend->io_offset, ioend->io_size); + } + /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && ioend->io_type != XFS_IO_UNWRITTEN && diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 25ed98324b27..84fb8788431b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -685,7 +685,7 @@ xfs_iomap_write_allocate( int nres; if (whichfork == XFS_COW_FORK) - flags |= XFS_BMAPI_COWFORK; + flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; /* * Make sure that the dquots are there. diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 07593a362cd0..8c8c4f4676da 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -82,11 +82,22 @@ * mappings are a reservation against the free space in the filesystem; * adjacent mappings can also be combined into fewer larger mappings. * + * As an optimization, the CoW extent size hint (cowextsz) creates + * outsized aligned delalloc reservations in the hope of landing out of + * order nearby CoW writes in a single extent on disk, thereby reducing + * fragmentation and improving future performance. + * + * D: --RRRRRRSSSRRRRRRRR--- (data fork) + * C: ------DDDDDDD--------- (CoW fork) + * * When dirty pages are being written out (typically in writepage), the - * delalloc reservations are converted into real mappings by allocating - * blocks and replacing the delalloc mapping with real ones. A delalloc - * mapping can be replaced by several real ones if the free space is - * fragmented. + * delalloc reservations are converted into unwritten mappings by + * allocating blocks and replacing the delalloc mapping with real ones. + * A delalloc mapping can be replaced by several unwritten ones if the + * free space is fragmented. + * + * D: --RRRRRRSSSRRRRRRRR--- + * C: ------UUUUUUU--------- * * We want to adapt the delalloc mechanism for copy-on-write, since the * write paths are similar. The first two steps (creating the reservation @@ -101,13 +112,29 @@ * Block-aligned directio writes will use the same mechanism as buffered * writes. * + * Just prior to submitting the actual disk write requests, we convert + * the extents representing the range of the file actually being written + * (as opposed to extra pieces created for the cowextsize hint) to real + * extents. This will become important in the next step: + * + * D: --RRRRRRSSSRRRRRRRR--- + * C: ------UUrrUUU--------- + * * CoW remapping must be done after the data block write completes, * because we don't want to destroy the old data fork map until we're sure * the new block has been written. Since the new mappings are kept in a * separate fork, we can simply iterate these mappings to find the ones * that cover the file blocks that we just CoW'd. For each extent, simply * unmap the corresponding range in the data fork, map the new range into - * the data fork, and remove the extent from the CoW fork. + * the data fork, and remove the extent from the CoW fork. Because of + * the presence of the cowextsize hint, however, we must be careful + * only to remap the blocks that we've actually written out -- we must + * never remap delalloc reservations nor CoW staging blocks that have + * yet to be written. This corresponds exactly to the real extents in + * the CoW fork: + * + * D: --RRRRRRrrSRRRRRRRR--- + * C: ------UU--UUU--------- * * Since the remapping operation can be applied to an arbitrary file * range, we record the need for the remap step as a flag in the ioend @@ -296,6 +323,65 @@ xfs_reflink_reserve_cow( return 0; } +/* Convert part of an unwritten CoW extent to a real one. */ +STATIC int +xfs_reflink_convert_cow_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb, + struct xfs_defer_ops *dfops) +{ + struct xfs_bmbt_irec irec = *imap; + xfs_fsblock_t first_block; + int nimaps = 1; + + if (imap->br_state == XFS_EXT_NORM) + return 0; + + xfs_trim_extent(&irec, offset_fsb, count_fsb); + trace_xfs_reflink_convert_cow(ip, &irec); + if (irec.br_blockcount == 0) + return 0; + return xfs_bmapi_write(NULL, ip, irec.br_startoff, irec.br_blockcount, + XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block, + 0, &irec, &nimaps, dfops); +} + +/* Convert all of the unwritten CoW extents in a file's range to real ones. */ +int +xfs_reflink_convert_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_bmbt_irec got; + struct xfs_defer_ops dfops; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + xfs_extnum_t idx; + bool found; + int error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* Convert all the extents to real from unwritten. */ + for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); + found && got.br_startoff < end_fsb; + found = xfs_iext_get_extent(ifp, ++idx, &got)) { + error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb, + end_fsb - offset_fsb, &dfops); + if (error) + break; + } + + /* Finish up. */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + /* Allocate all CoW reservations covering a range of blocks in a file. */ static int __xfs_reflink_allocate_cow( @@ -328,6 +414,7 @@ __xfs_reflink_allocate_cow( goto out_unlock; ASSERT(nimaps == 1); + /* Make sure there's a CoW reservation for it. */ error = xfs_reflink_reserve_cow(ip, &imap, &shared); if (error) goto out_trans_cancel; @@ -337,14 +424,16 @@ __xfs_reflink_allocate_cow( goto out_trans_cancel; } + /* Allocate the entire reservation as unwritten blocks. */ xfs_trans_ijoin(tp, ip, 0); error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, - XFS_BMAPI_COWFORK, &first_block, + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block, XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), &imap, &nimaps, &dfops); if (error) goto out_trans_cancel; + /* Finish up. */ error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_trans_cancel; @@ -389,11 +478,12 @@ xfs_reflink_allocate_cow_range( if (error) { trace_xfs_reflink_allocate_cow_range_error(ip, error, _RET_IP_); - break; + return error; } } - return error; + /* Convert the CoW extents to regular. */ + return xfs_reflink_convert_cow(ip, offset, count); } /* @@ -641,6 +731,16 @@ xfs_reflink_end_cow( ASSERT(!isnullstartblock(got.br_startblock)); + /* + * Don't remap unwritten extents; these are + * speculatively preallocated CoW extents that have been + * allocated but have not yet been involved in a write. + */ + if (got.br_state == XFS_EXT_UNWRITTEN) { + idx--; + goto next_extent; + } + /* Unmap the old blocks in the data fork. */ xfs_defer_init(&dfops, &firstfsb); rlen = del.br_blockcount; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index aa6a4d64bd35..1583c4727cb1 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -30,6 +30,8 @@ extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared); extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); +extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, struct xfs_bmbt_irec *imap); extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 643222784c3b..9e9bb9538bb6 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3088,6 +3088,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class, __field(xfs_fileoff_t, lblk) __field(xfs_extlen_t, len) __field(xfs_fsblock_t, pblk) + __field(int, state) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; @@ -3095,13 +3096,15 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class, __entry->lblk = irec->br_startoff; __entry->len = irec->br_blockcount; __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu", + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, __entry->len, - __entry->pblk) + __entry->pblk, + __entry->state) ); #define DEFINE_INODE_IREC_EVENT(name) \ DEFINE_EVENT(xfs_inode_irec_class, name, \ @@ -3241,6 +3244,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); +DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); DEFINE_RW_EVENT(xfs_reflink_reserve_cow); DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); -- cgit v1.2.3 From 4dd2eb633598cb6a5a0be2fd9a2be0819f5eeb5f Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 3 Feb 2017 14:39:07 -0800 Subject: xfs: reset b_first_retry_time when clear the retry status of xfs_buf_t After successful IO or permanent error, b_first_retry_time also needs to be cleared, else the invalid first retry time will be used by the next retry check. Signed-off-by: Hou Tao Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 2975cb2319f4..0306168af332 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1162,6 +1162,7 @@ xfs_buf_iodone_callbacks( */ bp->b_last_error = 0; bp->b_retries = 0; + bp->b_first_retry_time = 0; xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; -- cgit v1.2.3 From 54a4ef8af4e0dc5c983d17fcb9cf5fd25666d94e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Feb 2017 13:00:54 -0800 Subject: xfs: reject all unaligned direct writes to reflinked files We currently fall back from direct to buffered writes if we detect a remaining shared extent in the iomap_begin callback. But by the time iomap_begin is called for the potentially unaligned end block we might have already written most of the data to disk, which we'd now write again using buffered I/O. To avoid this reject all writes to reflinked files before starting I/O so that we are guaranteed to only write the data once. The alternative would be to unshare the unaligned start and/or end block before doing the I/O. I think that's doable, and will actually be required to support reflinks on DAX file system. But it will take a little more time and I'd rather get rid of the double write ASAP. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 9 +++++++++ fs/xfs/xfs_iomap.c | 12 +----------- fs/xfs/xfs_trace.h | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 032c8a74824a..2a695a8f4fe7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -527,6 +527,15 @@ xfs_file_dio_aio_write( if ((iocb->ki_pos & mp->m_blockmask) || ((iocb->ki_pos + count) & mp->m_blockmask)) { unaligned_io = 1; + + /* + * We can't properly handle unaligned direct I/O to reflink + * files yet, as we can't unshare a partial block. + */ + if (xfs_is_reflink_inode(ip)) { + trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); + return -EREMCHG; + } iolock = XFS_IOLOCK_EXCL; } else { iolock = XFS_IOLOCK_SHARED; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 84fb8788431b..52d9d1f61d4a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1026,17 +1026,7 @@ xfs_file_iomap_begin( if (error) goto out_unlock; - /* - * We're here because we're trying to do a directio write to a - * region that isn't aligned to a filesystem block. If the - * extent is shared, fall back to buffered mode to handle the - * RMW. - */ - if (!(flags & IOMAP_REPORT) && shared) { - trace_xfs_reflink_bounce_dio_write(ip, &imap); - error = -EREMCHG; - goto out_unlock; - } + ASSERT((flags & IOMAP_REPORT) || !shared); } if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 9e9bb9538bb6..b654893130e2 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3249,7 +3249,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); DEFINE_RW_EVENT(xfs_reflink_reserve_cow); DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); -DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping); DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); -- cgit v1.2.3 From f13eb2055ae46ded52961f27753c245dc5b8967d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Feb 2017 10:42:26 -0800 Subject: xfs: introduce xfs_aligned_fsb_count Factor a helper to calculate the extent-size aligned block out of the iomap code, so that it can be reused by the upcoming reflink dio code. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 11 ++--------- fs/xfs/xfs_iomap.h | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 52d9d1f61d4a..06bfc82dd00b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -162,7 +162,7 @@ xfs_iomap_write_direct( xfs_fileoff_t last_fsb; xfs_filblks_t count_fsb, resaligned; xfs_fsblock_t firstfsb; - xfs_extlen_t extsz, temp; + xfs_extlen_t extsz; int nimaps; int quota_flag; int rt; @@ -203,14 +203,7 @@ xfs_iomap_write_direct( } count_fsb = last_fsb - offset_fsb; ASSERT(count_fsb > 0); - - resaligned = count_fsb; - if (unlikely(extsz)) { - if ((temp = do_mod(offset_fsb, extsz))) - resaligned += temp; - if ((temp = do_mod(resaligned, extsz))) - resaligned += extsz - temp; - } + resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, extsz); if (unlikely(rt)) { resrtextents = qblocks = resaligned; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 705224b66b6a..00db3ecea084 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -33,6 +33,26 @@ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); +static inline xfs_filblks_t +xfs_aligned_fsb_count( + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb, + xfs_extlen_t extsz) +{ + if (extsz) { + xfs_extlen_t align; + + align = do_mod(offset_fsb, extsz); + if (align) + count_fsb += align; + align = do_mod(count_fsb, extsz); + if (align) + count_fsb += extsz - align; + } + + return count_fsb; +} + extern const struct iomap_ops xfs_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; -- cgit v1.2.3 From dcf9585a7511147c7ffd580be8580dd39bc52fb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Feb 2017 10:46:01 -0800 Subject: xfs: return the converted extent in __xfs_reflink_convert_cow We'll need it for the direct I/O code. Also rename the function to xfs_reflink_convert_cow_extent to describe it a bit better. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_reflink.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 8c8c4f4676da..219bc96bfc71 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -332,20 +332,19 @@ xfs_reflink_convert_cow_extent( xfs_filblks_t count_fsb, struct xfs_defer_ops *dfops) { - struct xfs_bmbt_irec irec = *imap; xfs_fsblock_t first_block; int nimaps = 1; if (imap->br_state == XFS_EXT_NORM) return 0; - xfs_trim_extent(&irec, offset_fsb, count_fsb); - trace_xfs_reflink_convert_cow(ip, &irec); - if (irec.br_blockcount == 0) + xfs_trim_extent(imap, offset_fsb, count_fsb); + trace_xfs_reflink_convert_cow(ip, imap); + if (imap->br_blockcount == 0) return 0; - return xfs_bmapi_write(NULL, ip, irec.br_startoff, irec.br_blockcount, + return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount, XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block, - 0, &irec, &nimaps, dfops); + 0, imap, &nimaps, dfops); } /* Convert all of the unwritten CoW extents in a file's range to real ones. */ -- cgit v1.2.3 From a14234c72bf41ac96bc8c98e96e2c84b6d4bd4f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Feb 2017 10:50:49 -0800 Subject: xfs: go straight to real allocations for direct I/O COW writes When we allocate COW fork blocks for direct I/O writes we currently first create a delayed allocation, and then convert it to a real allocation once we've got the delayed one. As there is no good reason for that this patch instead makes use call xfs_bmapi_write from the COW allocation path. The only interesting bits are a few tweaks the low-level allocator to allow for this, most notably the need to remove the call to xfs_bmap_extsize_align for the cowextsize in xfs_bmap_btalloc - for the existing convert case it's a no-op, but for the direct allocation case it would blow up our block reservation way beyond what we reserved for the transaction. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 3 +- fs/xfs/xfs_reflink.c | 94 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 68 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index dcffbb09444e..1cee514de19e 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -2895,13 +2895,14 @@ xfs_bmap_add_extent_hole_real( ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); - ASSERT(whichfork != XFS_COW_FORK); XFS_STATS_INC(mp, xs_add_exlist); state = 0; if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; /* * Check and set flags if this segment has a left neighbor. diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 219bc96bfc71..9bba084c1436 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -389,62 +389,100 @@ __xfs_reflink_allocate_cow( xfs_fileoff_t end_fsb) { struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec imap; + struct xfs_bmbt_irec imap, got; struct xfs_defer_ops dfops; struct xfs_trans *tp; xfs_fsblock_t first_block; - int nimaps = 1, error; - bool shared; + int nimaps, error, lockmode; + bool shared, trimmed; + xfs_filblks_t resaligned; + xfs_extlen_t resblks; + xfs_extnum_t idx; - xfs_defer_init(&dfops, &first_block); + resaligned = xfs_aligned_fsb_count(*offset_fsb, end_fsb - *offset_fsb, + xfs_get_cowextsz_hint(ip)); + resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, - XFS_TRANS_RESERVE, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockmode); - /* Read extent from the source file. */ - nimaps = 1; - error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, - &imap, &nimaps, 0); - if (error) - goto out_unlock; - ASSERT(nimaps == 1); + /* + * Even if the extent is not shared we might have a preallocation for + * it in the COW fork. If so use it. + */ + if (xfs_iext_lookup_extent(ip, ip->i_cowfp, *offset_fsb, &idx, &got) && + got.br_startoff <= *offset_fsb) { + /* If we have a real allocation in the COW fork we're done. */ + if (!isnullstartblock(got.br_startblock)) { + xfs_trim_extent(&got, *offset_fsb, + end_fsb - *offset_fsb); + *offset_fsb = got.br_startoff + got.br_blockcount; + goto out_trans_cancel; + } + } else { + nimaps = 1; + error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, + &imap, &nimaps, 0); + if (error) + goto out_trans_cancel; + ASSERT(nimaps == 1); + + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, + &trimmed); + if (error) + goto out_trans_cancel; + + if (!shared) { + *offset_fsb = imap.br_startoff + imap.br_blockcount; + goto out_trans_cancel; + } - /* Make sure there's a CoW reservation for it. */ - error = xfs_reflink_reserve_cow(ip, &imap, &shared); + *offset_fsb = imap.br_startoff; + end_fsb = imap.br_startoff + imap.br_blockcount; + } + + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, + XFS_QMOPT_RES_REGBLKS); if (error) goto out_trans_cancel; - if (!shared) { - *offset_fsb = imap.br_startoff + imap.br_blockcount; - goto out_trans_cancel; - } + xfs_trans_ijoin(tp, ip, 0); + + xfs_defer_init(&dfops, &first_block); + nimaps = 1; /* Allocate the entire reservation as unwritten blocks. */ - xfs_trans_ijoin(tp, ip, 0); - error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, + error = xfs_bmapi_write(tp, ip, *offset_fsb, end_fsb - *offset_fsb, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block, - XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), - &imap, &nimaps, &dfops); + resblks, &imap, &nimaps, &dfops); if (error) - goto out_trans_cancel; + goto out_bmap_cancel; /* Finish up. */ error = xfs_defer_finish(&tp, &dfops, NULL); if (error) - goto out_trans_cancel; + goto out_bmap_cancel; error = xfs_trans_commit(tp); + if (error) + goto out_unlock; *offset_fsb = imap.br_startoff + imap.br_blockcount; + out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return error; -out_trans_cancel: + +out_bmap_cancel: xfs_defer_cancel(&dfops); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, + XFS_QMOPT_RES_REGBLKS); +out_trans_cancel: xfs_trans_cancel(tp); goto out_unlock; } -- cgit v1.2.3 From 3c68d44a2b49a0ac9165faa9c191e1e618c8a8d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Feb 2017 10:51:03 -0800 Subject: xfs: allocate direct I/O COW blocks in iomap_begin Instead of preallocating all the required COW blocks in the high-level write code do it inside the iomap code, like we do for all other I/O. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 8 --- fs/xfs/xfs_iomap.c | 31 +++++------ fs/xfs/xfs_reflink.c | 146 +++++++++++++++++++-------------------------------- fs/xfs/xfs_reflink.h | 4 +- fs/xfs/xfs_trace.h | 2 - 5 files changed, 68 insertions(+), 123 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2a695a8f4fe7..086440e79b86 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -561,14 +561,6 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); - - /* If this is a block-aligned directio CoW, remap immediately. */ - if (xfs_is_reflink_inode(ip) && !unaligned_io) { - ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count); - if (ret) - goto out; - } - ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); out: xfs_iunlock(ip, iolock); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 06bfc82dd00b..e0bc290396dc 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -995,37 +995,31 @@ xfs_file_iomap_begin( offset_fsb = XFS_B_TO_FSBT(mp, offset); end_fsb = XFS_B_TO_FSB(mp, offset + length); - if (xfs_is_reflink_inode(ip) && - (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) { - shared = xfs_reflink_find_cow_mapping(ip, offset, &imap); - if (shared) { - xfs_iunlock(ip, lockmode); - goto alloc_done; - } - ASSERT(!isnullstartblock(imap.br_startblock)); - } - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); if (error) goto out_unlock; - if ((flags & IOMAP_REPORT) || - (xfs_is_reflink_inode(ip) && - (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) { + if (flags & IOMAP_REPORT) { /* Trim the mapping to the nearest shared extent boundary. */ error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); if (error) goto out_unlock; - - ASSERT((flags & IOMAP_REPORT) || !shared); } if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { - error = xfs_reflink_reserve_cow(ip, &imap, &shared); - if (error) - goto out_unlock; + if (flags & IOMAP_DIRECT) { + /* may drop and re-acquire the ilock */ + error = xfs_reflink_allocate_cow(ip, &imap, &shared, + &lockmode); + if (error) + goto out_unlock; + } else { + error = xfs_reflink_reserve_cow(ip, &imap, &shared); + if (error) + goto out_unlock; + } end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; @@ -1054,7 +1048,6 @@ xfs_file_iomap_begin( if (error) return error; -alloc_done: iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); } else { diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 9bba084c1436..1f61877b065b 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -382,74 +382,75 @@ xfs_reflink_convert_cow( } /* Allocate all CoW reservations covering a range of blocks in a file. */ -static int -__xfs_reflink_allocate_cow( +int +xfs_reflink_allocate_cow( struct xfs_inode *ip, - xfs_fileoff_t *offset_fsb, - xfs_fileoff_t end_fsb) + struct xfs_bmbt_irec *imap, + bool *shared, + uint *lockmode) { struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec imap, got; + xfs_fileoff_t offset_fsb = imap->br_startoff; + xfs_filblks_t count_fsb = imap->br_blockcount; + struct xfs_bmbt_irec got; struct xfs_defer_ops dfops; - struct xfs_trans *tp; + struct xfs_trans *tp = NULL; xfs_fsblock_t first_block; - int nimaps, error, lockmode; - bool shared, trimmed; + int nimaps, error = 0; + bool trimmed; xfs_filblks_t resaligned; - xfs_extlen_t resblks; + xfs_extlen_t resblks = 0; xfs_extnum_t idx; - resaligned = xfs_aligned_fsb_count(*offset_fsb, end_fsb - *offset_fsb, - xfs_get_cowextsz_hint(ip)); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); - if (error) - return error; - - lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockmode); +retry: + ASSERT(xfs_is_reflink_inode(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); /* * Even if the extent is not shared we might have a preallocation for * it in the COW fork. If so use it. */ - if (xfs_iext_lookup_extent(ip, ip->i_cowfp, *offset_fsb, &idx, &got) && - got.br_startoff <= *offset_fsb) { + if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &idx, &got) && + got.br_startoff <= offset_fsb) { + *shared = true; + /* If we have a real allocation in the COW fork we're done. */ if (!isnullstartblock(got.br_startblock)) { - xfs_trim_extent(&got, *offset_fsb, - end_fsb - *offset_fsb); - *offset_fsb = got.br_startoff + got.br_blockcount; - goto out_trans_cancel; + xfs_trim_extent(&got, offset_fsb, count_fsb); + *imap = got; + goto convert; } + + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); } else { - nimaps = 1; - error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, - &imap, &nimaps, 0); - if (error) - goto out_trans_cancel; - ASSERT(nimaps == 1); + error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); + if (error || !*shared) + goto out; + } - /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, &imap, &shared, - &trimmed); - if (error) - goto out_trans_cancel; + if (!tp) { + resaligned = xfs_aligned_fsb_count(imap->br_startoff, + imap->br_blockcount, xfs_get_cowextsz_hint(ip)); + resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - if (!shared) { - *offset_fsb = imap.br_startoff + imap.br_blockcount; - goto out_trans_cancel; - } + xfs_iunlock(ip, *lockmode); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + *lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, *lockmode); - *offset_fsb = imap.br_startoff; - end_fsb = imap.br_startoff + imap.br_blockcount; + if (error) + return error; + + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + goto out; + goto retry; } error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, XFS_QMOPT_RES_REGBLKS); if (error) - goto out_trans_cancel; + goto out; xfs_trans_ijoin(tp, ip, 0); @@ -457,9 +458,9 @@ __xfs_reflink_allocate_cow( nimaps = 1; /* Allocate the entire reservation as unwritten blocks. */ - error = xfs_bmapi_write(tp, ip, *offset_fsb, end_fsb - *offset_fsb, + error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block, - resblks, &imap, &nimaps, &dfops); + resblks, imap, &nimaps, &dfops); if (error) goto out_bmap_cancel; @@ -470,57 +471,18 @@ __xfs_reflink_allocate_cow( error = xfs_trans_commit(tp); if (error) - goto out_unlock; - - *offset_fsb = imap.br_startoff + imap.br_blockcount; - -out_unlock: - xfs_iunlock(ip, lockmode); - return error; - + return error; +convert: + return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb, + &dfops); out_bmap_cancel: xfs_defer_cancel(&dfops); xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, XFS_QMOPT_RES_REGBLKS); -out_trans_cancel: - xfs_trans_cancel(tp); - goto out_unlock; -} - -/* Allocate all CoW reservations covering a part of a file. */ -int -xfs_reflink_allocate_cow_range( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t count) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); - xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); - int error; - - ASSERT(xfs_is_reflink_inode(ip)); - - trace_xfs_reflink_allocate_cow_range(ip, offset, count); - - /* - * Make sure that the dquots are there. - */ - error = xfs_qm_dqattach(ip, 0); - if (error) - return error; - - while (offset_fsb < end_fsb) { - error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb); - if (error) { - trace_xfs_reflink_allocate_cow_range_error(ip, error, - _RET_IP_); - return error; - } - } - - /* Convert the CoW extents to regular. */ - return xfs_reflink_convert_cow(ip, offset, count); +out: + if (tp) + xfs_trans_cancel(tp); + return error; } /* diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 1583c4727cb1..33ac9b8db683 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -28,8 +28,8 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared); -extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, - xfs_off_t offset, xfs_off_t count); +extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index b654893130e2..fb7555e73a62 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3247,7 +3247,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); DEFINE_RW_EVENT(xfs_reflink_reserve_cow); -DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping); @@ -3257,7 +3256,6 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); -- cgit v1.2.3 From c5ecb42342852892f978572ddc6dca703460f25a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Feb 2017 17:45:51 -0800 Subject: xfs: update ctime and mtime on clone destinatation inodes We're changing both metadata and data, so we need to update the timestamps for clone operations. Dedupe on the other hand does not change file data, and only changes invisible metadata so the timestamps should not be updated. This follows existing btrfs behavior. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong [darrick: remove redundant is_dedupe test] Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_reflink.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 1f61877b065b..2f3d2d5c2934 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -954,13 +954,14 @@ STATIC int xfs_reflink_update_dest( struct xfs_inode *dest, xfs_off_t newlen, - xfs_extlen_t cowextsize) + xfs_extlen_t cowextsize, + bool is_dedupe) { struct xfs_mount *mp = dest->i_mount; struct xfs_trans *tp; int error; - if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) return 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); @@ -981,6 +982,10 @@ xfs_reflink_update_dest( dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; } + if (!is_dedupe) { + xfs_trans_ichgtime(tp, dest, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + } xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); error = xfs_trans_commit(tp); @@ -1294,7 +1299,8 @@ xfs_reflink_remap_range( !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) cowextsize = src->i_d.di_cowextsize; - ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize); + ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, + is_dedupe); out_unlock: xfs_iunlock(src, XFS_MMAPLOCK_EXCL); -- cgit v1.2.3 From b20fe4730ea5c037c16631fb0df659c7b6d4b3b1 Mon Sep 17 00:00:00 2001 From: Bill O'Donnell Date: Tue, 7 Feb 2017 12:59:33 -0800 Subject: xfs: correct null checks and error processing in xfs_initialize_perag If pag cannot be allocated, the current error exit path will trip a null pointer deference error when calling xfs_buf_hash_destroy with a null pag. Fix this by adding a new error exit labels and jumping to those accordingly, avoiding the hash destroy and unnecessary kmem_free on pag. Up to three things need to be properly unwound: 1) pag memory allocation 2) xfs_buf_hash_init 3) radix_tree_insert For any given iteration through the loop, any of the above which succeed must be unwound for /this/ pag, and then all prior initialized pags must be unwound. Addresses-Coverity-Id: 1397628 ("Dereference after null check") Reported-by: Colin Ian King Signed-off-by: Bill O'Donnell Reviewed-by: Eric Sandeen Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_mount.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 9b9540db17a6..1f1e4ae44150 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -187,7 +187,7 @@ xfs_initialize_perag( xfs_agnumber_t *maxagi) { xfs_agnumber_t index; - xfs_agnumber_t first_initialised = 0; + xfs_agnumber_t first_initialised = NULLAGNUMBER; xfs_perag_t *pag; int error = -ENOMEM; @@ -202,22 +202,20 @@ xfs_initialize_perag( xfs_perag_put(pag); continue; } - if (!first_initialised) - first_initialised = index; pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); if (!pag) - goto out_unwind; + goto out_unwind_new_pags; pag->pag_agno = index; pag->pag_mount = mp; spin_lock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); if (xfs_buf_hash_init(pag)) - goto out_unwind; + goto out_free_pag; if (radix_tree_preload(GFP_NOFS)) - goto out_unwind; + goto out_hash_destroy; spin_lock(&mp->m_perag_lock); if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { @@ -225,10 +223,13 @@ xfs_initialize_perag( spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); error = -EEXIST; - goto out_unwind; + goto out_hash_destroy; } spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); + /* first new pag is fully initialized */ + if (first_initialised == NULLAGNUMBER) + first_initialised = index; } index = xfs_set_inode_alloc(mp, agcount); @@ -239,11 +240,16 @@ xfs_initialize_perag( mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; -out_unwind: +out_hash_destroy: xfs_buf_hash_destroy(pag); +out_free_pag: kmem_free(pag); - for (; index > first_initialised; index--) { +out_unwind_new_pags: + /* unwind any prior newly initialized pags */ + for (index = first_initialised; index < agcount; index++) { pag = radix_tree_delete(&mp->m_perag_tree, index); + if (!pag) + break; xfs_buf_hash_destroy(pag); kmem_free(pag); } -- cgit v1.2.3 From 5e30c23d13919a718b22d4921dc5c0accc59da27 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Feb 2017 14:06:46 -0800 Subject: xfs: don't fail xfs_extent_busy allocation We don't just need the structure to track busy extents which can be avoided with a synchronous transaction, but also to keep track of pending discard. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_extent_busy.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 162dc186cf04..29c2f997aedf 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -45,18 +45,7 @@ xfs_extent_busy_insert( struct rb_node **rbp; struct rb_node *parent = NULL; - new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); - if (!new) { - /* - * No Memory! Since it is now not possible to track the free - * block, make this a synchronous transaction to insure that - * the block is not reused before this transaction commits. - */ - trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len); - xfs_trans_set_sync(tp); - return; - } - + new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); new->agno = agno; new->bno = bno; new->length = len; -- cgit v1.2.3 From ebf55872616c7d4754db5a318591a72a8d5e6896 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Feb 2017 14:06:57 -0800 Subject: xfs: improve handling of busy extents in the low-level allocator Currently we force the log and simply try again if we hit a busy extent, but especially with online discard enabled it might take a while after the log force for the busy extents to disappear, and we might have already completed our second pass. So instead we add a new waitqueue and a generation counter to the pag structure so that we can do wakeups once we've removed busy extents, and we replace the single retry with an unconditional one - after all we hold the AGF buffer lock, so no other allocations or frees can be racing with us in this AG. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 93 ++++++++++++++++++---------------- fs/xfs/xfs_extent_busy.c | 125 ++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_extent_busy.h | 11 +++- fs/xfs/xfs_mount.c | 8 +++ fs/xfs/xfs_mount.h | 2 + 5 files changed, 166 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 9f06a211e157..fe98fbc4adf1 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -221,20 +221,22 @@ xfs_alloc_get_rec( * Compute aligned version of the found extent. * Takes alignment and min length into account. */ -STATIC void +STATIC bool xfs_alloc_compute_aligned( xfs_alloc_arg_t *args, /* allocation argument structure */ xfs_agblock_t foundbno, /* starting block in found extent */ xfs_extlen_t foundlen, /* length in found extent */ xfs_agblock_t *resbno, /* result block number */ - xfs_extlen_t *reslen) /* result length */ + xfs_extlen_t *reslen, /* result length */ + unsigned *busy_gen) { - xfs_agblock_t bno; - xfs_extlen_t len; + xfs_agblock_t bno = foundbno; + xfs_extlen_t len = foundlen; xfs_extlen_t diff; + bool busy; /* Trim busy sections out of found extent */ - xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); + busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen); /* * If we have a largish extent that happens to start before min_agbno, @@ -259,6 +261,8 @@ xfs_alloc_compute_aligned( *resbno = bno; *reslen = len; } + + return busy; } /* @@ -737,10 +741,11 @@ xfs_alloc_ag_vextent_exact( int error; xfs_agblock_t fbno; /* start block of found extent */ xfs_extlen_t flen; /* length of found extent */ - xfs_agblock_t tbno; /* start block of trimmed extent */ - xfs_extlen_t tlen; /* length of trimmed extent */ - xfs_agblock_t tend; /* end block of trimmed extent */ + xfs_agblock_t tbno; /* start block of busy extent */ + xfs_extlen_t tlen; /* length of busy extent */ + xfs_agblock_t tend; /* end block of busy extent */ int i; /* success/failure of operation */ + unsigned busy_gen; ASSERT(args->alignment == 1); @@ -773,7 +778,9 @@ xfs_alloc_ag_vextent_exact( /* * Check for overlapping busy extents. */ - xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen); + tbno = fbno; + tlen = flen; + xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen); /* * Give up if the start of the extent is busy, or the freespace isn't @@ -853,6 +860,7 @@ xfs_alloc_find_best_extent( xfs_agblock_t sdiff; int error; int i; + unsigned busy_gen; /* The good extent is perfect, no need to search. */ if (!gdiff) @@ -866,7 +874,8 @@ xfs_alloc_find_best_extent( if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); + xfs_alloc_compute_aligned(args, *sbno, *slen, + sbnoa, slena, &busy_gen); /* * The good extent is closer than this one. @@ -955,7 +964,8 @@ xfs_alloc_ag_vextent_near( xfs_extlen_t ltlena; /* aligned ... */ xfs_agblock_t ltnew; /* useful start bno of left side */ xfs_extlen_t rlen; /* length of returned extent */ - int forced = 0; + bool busy; + unsigned busy_gen; #ifdef DEBUG /* * Randomly don't execute the first algorithm. @@ -982,6 +992,7 @@ restart: ltlen = 0; gtlena = 0; ltlena = 0; + busy = false; /* * Get a cursor for the by-size btree. @@ -1064,8 +1075,8 @@ restart: if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, ltbno, ltlen, - <bnoa, <lena); + busy = xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena, &busy_gen); if (ltlena < args->minlen) continue; if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno) @@ -1183,8 +1194,8 @@ restart: if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, ltbno, ltlen, - <bnoa, <lena); + busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena, &busy_gen); if (ltlena >= args->minlen && ltbnoa >= args->min_agbno) break; if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) @@ -1199,8 +1210,8 @@ restart: if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, gtbno, gtlen, - >bnoa, >lena); + busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen, + >bnoa, >lena, &busy_gen); if (gtlena >= args->minlen && gtbnoa <= args->max_agbno) break; if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) @@ -1261,9 +1272,9 @@ restart: if (bno_cur_lt == NULL && bno_cur_gt == NULL) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - if (!forced++) { + if (busy) { trace_xfs_alloc_near_busy(args); - xfs_log_force(args->mp, XFS_LOG_SYNC); + xfs_extent_busy_flush(args->mp, args->pag, busy_gen); goto restart; } trace_xfs_alloc_size_neither(args); @@ -1344,7 +1355,8 @@ xfs_alloc_ag_vextent_size( int i; /* temp status variable */ xfs_agblock_t rbno; /* returned block number */ xfs_extlen_t rlen; /* length of returned extent */ - int forced = 0; + bool busy; + unsigned busy_gen; restart: /* @@ -1353,6 +1365,7 @@ restart: cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, args->agno, XFS_BTNUM_CNT); bno_cur = NULL; + busy = false; /* * Look for an entry >= maxlen+alignment-1 blocks. @@ -1362,14 +1375,13 @@ restart: goto error0; /* - * If none or we have busy extents that we cannot allocate from, then - * we have to settle for a smaller extent. In the case that there are - * no large extents, this will return the last entry in the tree unless - * the tree is empty. In the case that there are only busy large - * extents, this will return the largest small extent unless there + * If none then we have to settle for a smaller extent. In the case that + * there are no large extents, this will return the last entry in the + * tree unless the tree is empty. In the case that there are only busy + * large extents, this will return the largest small extent unless there * are no smaller extents available. */ - if (!i || forced > 1) { + if (!i) { error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, &flen, &i); if (error) @@ -1380,13 +1392,11 @@ restart: return 0; } ASSERT(i == 1); - xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); + busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, + &rlen, &busy_gen); } else { /* * Search for a non-busy extent that is large enough. - * If we are at low space, don't check, or if we fall of - * the end of the btree, turn off the busy check and - * restart. */ for (;;) { error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); @@ -1394,8 +1404,8 @@ restart: goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, fbno, flen, - &rbno, &rlen); + busy = xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen, &busy_gen); if (rlen >= args->maxlen) break; @@ -1407,18 +1417,13 @@ restart: /* * Our only valid extents must have been busy. * Make it unbusy by forcing the log out and - * retrying. If we've been here before, forcing - * the log isn't making the extents available, - * which means they have probably been freed in - * this transaction. In that case, we have to - * give up on them and we'll attempt a minlen - * allocation the next time around. + * retrying. */ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); trace_xfs_alloc_size_busy(args); - if (!forced++) - xfs_log_force(args->mp, XFS_LOG_SYNC); + xfs_extent_busy_flush(args->mp, + args->pag, busy_gen); goto restart; } } @@ -1454,8 +1459,8 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); if (flen < bestrlen) break; - xfs_alloc_compute_aligned(args, fbno, flen, - &rbno, &rlen); + busy = xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen, &busy_gen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), @@ -1484,10 +1489,10 @@ restart: */ args->len = rlen; if (rlen < args->minlen) { - if (!forced++) { + if (busy) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); trace_xfs_alloc_size_busy(args); - xfs_log_force(args->mp, XFS_LOG_SYNC); + xfs_extent_busy_flush(args->mp, args->pag, busy_gen); goto restart; } goto out_nominleft; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 29c2f997aedf..4d850e27095e 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -334,25 +334,31 @@ restart: * subset of the extent that is not busy. If *rlen is smaller than * args->minlen no suitable extent could be found, and the higher level * code needs to force out the log and retry the allocation. + * + * Return the current busy generation for the AG if the extent is busy. This + * value can be used to wait for at least one of the currently busy extents + * to be cleared. Note that the busy list is not guaranteed to be empty after + * the gen is woken. The state of a specific extent must always be confirmed + * with another call to xfs_extent_busy_trim() before it can be used. */ -void +bool xfs_extent_busy_trim( struct xfs_alloc_arg *args, - xfs_agblock_t bno, - xfs_extlen_t len, - xfs_agblock_t *rbno, - xfs_extlen_t *rlen) + xfs_agblock_t *bno, + xfs_extlen_t *len, + unsigned *busy_gen) { xfs_agblock_t fbno; xfs_extlen_t flen; struct rb_node *rbp; + bool ret = false; ASSERT(len > 0); spin_lock(&args->pag->pagb_lock); restart: - fbno = bno; - flen = len; + fbno = *bno; + flen = *len; rbp = args->pag->pagb_tree.rb_node; while (rbp && flen >= args->minlen) { struct xfs_extent_busy *busyp = @@ -504,24 +510,25 @@ restart: flen = fend - fbno; } - spin_unlock(&args->pag->pagb_lock); +out: - if (fbno != bno || flen != len) { - trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, + if (fbno != *bno || flen != *len) { + trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len, fbno, flen); + *bno = fbno; + *len = flen; + *busy_gen = args->pag->pagb_gen; + ret = true; } - *rbno = fbno; - *rlen = flen; - return; + spin_unlock(&args->pag->pagb_lock); + return ret; fail: /* * Return a zero extent length as failure indications. All callers * re-check if the trimmed extent satisfies the minlen requirement. */ - spin_unlock(&args->pag->pagb_lock); - trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0); - *rbno = fbno; - *rlen = 0; + flen = 0; + goto out; } STATIC void @@ -540,6 +547,21 @@ xfs_extent_busy_clear_one( kmem_free(busyp); } +static void +xfs_extent_busy_put_pag( + struct xfs_perag *pag, + bool wakeup) + __releases(pag->pagb_lock) +{ + if (wakeup) { + pag->pagb_gen++; + wake_up_all(&pag->pagb_wait); + } + + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + /* * Remove all extents on the passed in list from the busy extents tree. * If do_discard is set skip extents that need to be discarded, and mark @@ -554,27 +576,76 @@ xfs_extent_busy_clear( struct xfs_extent_busy *busyp, *n; struct xfs_perag *pag = NULL; xfs_agnumber_t agno = NULLAGNUMBER; + bool wakeup = false; list_for_each_entry_safe(busyp, n, list, list) { if (busyp->agno != agno) { - if (pag) { - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - } - pag = xfs_perag_get(mp, busyp->agno); - spin_lock(&pag->pagb_lock); + if (pag) + xfs_extent_busy_put_pag(pag, wakeup); agno = busyp->agno; + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + wakeup = false; } if (do_discard && busyp->length && - !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) + !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) { busyp->flags = XFS_EXTENT_BUSY_DISCARDED; - else + } else { xfs_extent_busy_clear_one(mp, pag, busyp); + wakeup = true; + } } - if (pag) { - spin_unlock(&pag->pagb_lock); + if (pag) + xfs_extent_busy_put_pag(pag, wakeup); +} + +/* + * Flush out all busy extents for this AG. + */ +void +xfs_extent_busy_flush( + struct xfs_mount *mp, + struct xfs_perag *pag, + unsigned busy_gen) +{ + DEFINE_WAIT (wait); + int log_flushed = 0, error; + + trace_xfs_log_force(mp, 0, _THIS_IP_); + error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed); + if (error) + return; + + do { + prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); + if (busy_gen != READ_ONCE(pag->pagb_gen)) + break; + schedule(); + } while (1); + + finish_wait(&pag->pagb_wait, &wait); +} + +void +xfs_extent_busy_wait_all( + struct xfs_mount *mp) +{ + DEFINE_WAIT (wait); + xfs_agnumber_t agno; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + struct xfs_perag *pag = xfs_perag_get(mp, agno); + + do { + prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); + if (RB_EMPTY_ROOT(&pag->pagb_tree)) + break; + schedule(); + } while (1); + finish_wait(&pag->pagb_wait, &wait); + xfs_perag_put(pag); } } diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index bfff284d2dcc..60195ea1b84a 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -58,9 +58,16 @@ void xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); +bool +xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno, + xfs_extlen_t *len, unsigned *busy_gen); + +void +xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag, + unsigned busy_gen); + void -xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno, - xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen); +xfs_extent_busy_wait_all(struct xfs_mount *mp); int xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 1f1e4ae44150..566eb79666ad 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -45,6 +45,7 @@ #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_reflink.h" +#include "xfs_extent_busy.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -213,6 +214,7 @@ xfs_initialize_perag( INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); if (xfs_buf_hash_init(pag)) goto out_free_pag; + init_waitqueue_head(&pag->pagb_wait); if (radix_tree_preload(GFP_NOFS)) goto out_hash_destroy; @@ -1078,6 +1080,12 @@ xfs_unmountfs( */ xfs_log_force(mp, XFS_LOG_SYNC); + /* + * Wait for all busy extents to be freed, including completion of + * any discard operation. + */ + xfs_extent_busy_wait_all(mp); + /* * We now need to tell the world we are unmounting. This will allow * us to detect that the filesystem is going away and we should error diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7f351f706b7a..20e2981a4aec 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -384,6 +384,8 @@ typedef struct xfs_perag { xfs_agino_t pagl_rightrec; spinlock_t pagb_lock; /* lock for pagb_tree */ struct rb_root pagb_tree; /* ordered tree of busy extents */ + unsigned int pagb_gen; /* generation count for pagb_tree */ + wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */ atomic_t pagf_fstrms; /* # of filestreams active in this AG */ -- cgit v1.2.3 From 46694129816851cb323e4bcf2dc41482d8d6c19b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Feb 2017 14:07:57 -0800 Subject: xfs: improve busy extent sorting Sort busy extents by the full block number instead of just the AGNO so that we can issue consecutive discard requests that the block layer could merge (although we'll need additional block layer fixes for fast devices). Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_extent_busy.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 4d850e27095e..ab062610234e 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -656,9 +656,17 @@ xfs_extent_busy_wait_all( int xfs_extent_busy_ag_cmp( void *priv, - struct list_head *a, - struct list_head *b) + struct list_head *l1, + struct list_head *l2) { - return container_of(a, struct xfs_extent_busy, list)->agno - - container_of(b, struct xfs_extent_busy, list)->agno; + struct xfs_extent_busy *b1 = + container_of(l1, struct xfs_extent_busy, list); + struct xfs_extent_busy *b2 = + container_of(l2, struct xfs_extent_busy, list); + s32 diff; + + diff = b1->agno - b2->agno; + if (!diff) + diff = b1->bno - b2->bno; + return diff; } -- cgit v1.2.3 From 4560e78f40cb55bd2ea8f1ef4001c5baa88531c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Feb 2017 14:07:58 -0800 Subject: xfs: don't block the log commit handler for discards Instead we submit the discard requests and use another workqueue to release the extents from the extent busy list. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_discard.c | 29 ------------------ fs/xfs/xfs_discard.h | 1 - fs/xfs/xfs_log_cil.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++----- fs/xfs/xfs_log_priv.h | 1 + fs/xfs/xfs_mount.c | 1 + fs/xfs/xfs_super.c | 8 +++++ fs/xfs/xfs_super.h | 2 ++ 7 files changed, 88 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 4ff499aa7338..d796ffac7296 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -208,32 +208,3 @@ xfs_ioc_trim( return -EFAULT; return 0; } - -int -xfs_discard_extents( - struct xfs_mount *mp, - struct list_head *list) -{ - struct xfs_extent_busy *busyp; - int error = 0; - - list_for_each_entry(busyp, list, list) { - trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, - busyp->length); - - error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, - XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), - XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, 0); - if (error && error != -EOPNOTSUPP) { - xfs_info(mp, - "discard failed for extent [0x%llx,%u], error %d", - (unsigned long long)busyp->bno, - busyp->length, - error); - return error; - } - } - - return 0; -} diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h index 344879aea646..0f070f9e44e1 100644 --- a/fs/xfs/xfs_discard.h +++ b/fs/xfs/xfs_discard.h @@ -5,6 +5,5 @@ struct fstrim_range; struct list_head; extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); -extern int xfs_discard_extents(struct xfs_mount *, struct list_head *); #endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index a4ab192e1792..82f1cbcc4de1 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -30,6 +30,9 @@ #include "xfs_trans_priv.h" #include "xfs_log.h" #include "xfs_log_priv.h" +#include "xfs_trace.h" + +struct workqueue_struct *xfs_discard_wq; /* * Allocate a new ticket. Failing to get a new ticket makes it really hard to @@ -491,6 +494,75 @@ xlog_cil_free_logvec( } } +static void +xlog_discard_endio_work( + struct work_struct *work) +{ + struct xfs_cil_ctx *ctx = + container_of(work, struct xfs_cil_ctx, discard_endio_work); + struct xfs_mount *mp = ctx->cil->xc_log->l_mp; + + xfs_extent_busy_clear(mp, &ctx->busy_extents, false); + kmem_free(ctx); +} + +/* + * Queue up the actual completion to a thread to avoid IRQ-safe locking for + * pagb_lock. Note that we need a unbounded workqueue, otherwise we might + * get the execution delayed up to 30 seconds for weird reasons. + */ +static void +xlog_discard_endio( + struct bio *bio) +{ + struct xfs_cil_ctx *ctx = bio->bi_private; + + INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work); + queue_work(xfs_discard_wq, &ctx->discard_endio_work); +} + +static void +xlog_discard_busy_extents( + struct xfs_mount *mp, + struct xfs_cil_ctx *ctx) +{ + struct list_head *list = &ctx->busy_extents; + struct xfs_extent_busy *busyp; + struct bio *bio = NULL; + struct blk_plug plug; + int error = 0; + + ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); + + blk_start_plug(&plug); + list_for_each_entry(busyp, list, list) { + trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, + busyp->length); + + error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, 0, &bio); + if (error && error != -EOPNOTSUPP) { + xfs_info(mp, + "discard failed for extent [0x%llx,%u], error %d", + (unsigned long long)busyp->bno, + busyp->length, + error); + break; + } + } + + if (bio) { + bio->bi_private = ctx; + bio->bi_end_io = xlog_discard_endio; + submit_bio(bio); + } else { + xlog_discard_endio_work(&ctx->discard_endio_work); + } + blk_finish_plug(&plug); +} + /* * Mark all items committed and clear busy extents. We free the log vector * chains in a separate pass so that we unpin the log items as quickly as @@ -525,14 +597,10 @@ xlog_cil_committed( xlog_cil_free_logvec(ctx->lv_chain); - if (!list_empty(&ctx->busy_extents)) { - ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); - - xfs_discard_extents(mp, &ctx->busy_extents); - xfs_extent_busy_clear(mp, &ctx->busy_extents, false); - } - - kmem_free(ctx); + if (!list_empty(&ctx->busy_extents)) + xlog_discard_busy_extents(mp, ctx); + else + kmem_free(ctx); } /* diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2b6eec52178e..c2604a5366f2 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -257,6 +257,7 @@ struct xfs_cil_ctx { struct xfs_log_vec *lv_chain; /* logvecs being pushed */ struct xfs_log_callback log_cb; /* completion callback hook. */ struct list_head committing; /* ctx committing list */ + struct work_struct discard_endio_work; }; /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 566eb79666ad..450bde68bb75 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1085,6 +1085,7 @@ xfs_unmountfs( * any discard operation. */ xfs_extent_busy_wait_all(mp); + flush_workqueue(xfs_discard_wq); /* * We now need to tell the world we are unmounting. This will allow diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index eecbaac08eba..890862f2447c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1956,12 +1956,20 @@ xfs_init_workqueues(void) if (!xfs_alloc_wq) return -ENOMEM; + xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0); + if (!xfs_discard_wq) + goto out_free_alloc_wq; + return 0; +out_free_alloc_wq: + destroy_workqueue(xfs_alloc_wq); + return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { + destroy_workqueue(xfs_discard_wq); destroy_workqueue(xfs_alloc_wq); } diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index b6418abd85ad..5f2f32408011 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -73,6 +73,8 @@ extern const struct quotactl_ops xfs_quotactl_operations; extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); +extern struct workqueue_struct *xfs_discard_wq; + #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) #endif /* __XFS_SUPER_H__ */ -- cgit v1.2.3 From fa7f138ac4c70dc00519c124cf7cd4862a0a5b0e Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 16 Feb 2017 17:19:12 -0800 Subject: xfs: clear delalloc and cache on buffered write failure The buffered write failure handling code in xfs_file_iomap_end_delalloc() has a couple minor problems. First, if written == 0, start_fsb is not rounded down and it fails to kill off a delalloc block if the start offset is block unaligned. This results in a lingering delalloc block and broken delalloc block accounting detected at unmount time. Fix this by rounding down start_fsb in the unlikely event that written == 0. Second, it is possible for a failed overwrite of a delalloc extent to leave dirty pagecache around over a hole in the file. This is because is possible to hit ->iomap_end() on write failure before the iomap code has attempted to allocate pagecache, and thus has no need to clean it up. If the targeted delalloc extent was successfully written by a previous write, however, then it does still have dirty pages when ->iomap_end() punches out the underlying blocks. This ultimately results in writeback over a hole. To fix this problem, unconditionally punch out the pagecache from XFS before the associated delalloc range. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index e0bc290396dc..4009e7c8b52b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1078,7 +1078,15 @@ xfs_file_iomap_end_delalloc( xfs_fileoff_t end_fsb; int error = 0; - start_fsb = XFS_B_TO_FSB(mp, offset + written); + /* + * start_fsb refers to the first unused block after a short write. If + * nothing was written, round offset down to point at the first block in + * the range. + */ + if (unlikely(!written)) + start_fsb = XFS_B_TO_FSBT(mp, offset); + else + start_fsb = XFS_B_TO_FSB(mp, offset + written); end_fsb = XFS_B_TO_FSB(mp, offset + length); /* @@ -1090,6 +1098,9 @@ xfs_file_iomap_end_delalloc( * blocks in the range, they are ours. */ if (start_fsb < end_fsb) { + truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), + XFS_FSB_TO_B(mp, end_fsb) - 1); + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, end_fsb - start_fsb); -- cgit v1.2.3 From 9dbddd7b0c649bd6aa9442c717932325ec590303 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 13 Feb 2017 22:48:17 -0800 Subject: xfs: resurrect debug mode drop buffered writes mechanism A debug mode write failure mechanism was introduced to XFS in commit 801cc4e17a ("xfs: debug mode forced buffered write failure") to facilitate targeted testing of delalloc indirect reservation management from userspace. This code was subsequently rendered ineffective by the move to iomap based buffered writes in commit 68a9f5e700 ("xfs: implement iomap based buffered write path"). This likely went unnoticed because the associated userspace code had not made it into xfstests. Resurrect this mechanism to facilitate effective indlen reservation testing from xfstests. The move to iomap based buffered writes relocated the hook this mechanism needs to return write failure from XFS to generic code. The failure trigger must remain in XFS. Given that limitation, convert this from a write failure mechanism to one that simply drops writes without returning failure to userspace. Rename all "fail_writes" references to "drop_writes" to illustrate the point. This is more hacky than preferred, but still triggers the XFS error handling behavior required to drive the indlen tests. This is only available in DEBUG mode and for testing purposes only. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 4 ++++ fs/xfs/xfs_mount.h | 15 ++++++++------- fs/xfs/xfs_sysfs.c | 14 +++++++------- 3 files changed, 19 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 4009e7c8b52b..41662fb14e87 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1078,6 +1078,10 @@ xfs_file_iomap_end_delalloc( xfs_fileoff_t end_fsb; int error = 0; + /* behave as if the write failed if drop writes is enabled */ + if (xfs_mp_drop_writes(mp)) + written = 0; + /* * start_fsb refers to the first unused block after a short write. If * nothing was written, round offset down to point at the first block in diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 20e2981a4aec..6db6fd6b82b0 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -200,11 +200,12 @@ typedef struct xfs_mount { /* * DEBUG mode instrumentation to test and/or trigger delayed allocation * block killing in the event of failed writes. When enabled, all - * buffered writes are forced to fail. All delalloc blocks in the range - * of the write (including pre-existing delalloc blocks!) are tossed as - * part of the write failure error handling sequence. + * buffered writes are silenty dropped and handled as if they failed. + * All delalloc blocks in the range of the write (including pre-existing + * delalloc blocks!) are tossed as part of the write failure error + * handling sequence. */ - bool m_fail_writes; + bool m_drop_writes; #endif } xfs_mount_t; @@ -325,13 +326,13 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) #ifdef DEBUG static inline bool -xfs_mp_fail_writes(struct xfs_mount *mp) +xfs_mp_drop_writes(struct xfs_mount *mp) { - return mp->m_fail_writes; + return mp->m_drop_writes; } #else static inline bool -xfs_mp_fail_writes(struct xfs_mount *mp) +xfs_mp_drop_writes(struct xfs_mount *mp) { return 0; } diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index de6195e38910..80ac15fb9638 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -93,7 +93,7 @@ to_mp(struct kobject *kobject) #ifdef DEBUG STATIC ssize_t -fail_writes_store( +drop_writes_store( struct kobject *kobject, const char *buf, size_t count) @@ -107,9 +107,9 @@ fail_writes_store( return ret; if (val == 1) - mp->m_fail_writes = true; + mp->m_drop_writes = true; else if (val == 0) - mp->m_fail_writes = false; + mp->m_drop_writes = false; else return -EINVAL; @@ -117,21 +117,21 @@ fail_writes_store( } STATIC ssize_t -fail_writes_show( +drop_writes_show( struct kobject *kobject, char *buf) { struct xfs_mount *mp = to_mp(kobject); - return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_writes ? 1 : 0); + return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_drop_writes ? 1 : 0); } -XFS_SYSFS_ATTR_RW(fail_writes); +XFS_SYSFS_ATTR_RW(drop_writes); #endif /* DEBUG */ static struct attribute *xfs_mp_attrs[] = { #ifdef DEBUG - ATTR_LIST(fail_writes), + ATTR_LIST(drop_writes), #endif NULL, }; -- cgit v1.2.3 From 0e339ef8556d9e567aa7925f8892c263d79430d9 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 13 Feb 2017 22:48:18 -0800 Subject: xfs: handle indlen shortage on delalloc extent merge When a delalloc extent is created, it can be merged with pre-existing, contiguous, delalloc extents. When this occurs, xfs_bmap_add_extent_hole_delay() merges the extents along with the associated indirect block reservations. The expectation here is that the combined worst case indlen reservation is always less than or equal to the indlen reservation for the individual extents. This is not always the case, however, as existing extents can less than the expected indlen reservation if the extent was previously split due to a hole punch. If a new extent merges with such an extent, the total indlen requirement may be larger than the sum of the indlen reservations held by both extents. xfs_bmap_add_extent_hole_delay() assumes that the worst case indlen reservation is always available and assigns it to the merged extent without consideration for the indlen held by the pre-existing extent. As a result, the subsequent xfs_mod_fdblocks() call can attempt an unintentional allocation rather than a free (indicated by an ASSERT() failure). Further, if the allocation happens to fail in this context, the failure goes unhandled and creates a filesystem wide block accounting inconsistency. Fix xfs_bmap_add_extent_hole_delay() to function as designed. Cap the indlen reservation assigned to the merged extent to the sum of the indlen reservations held by each of the individual extents. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 1cee514de19e..73c95466c225 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -2805,7 +2805,8 @@ xfs_bmap_add_extent_hole_delay( oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock) + startblockval(right.br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), nullstartblock((int)newlen)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); @@ -2826,7 +2827,8 @@ xfs_bmap_add_extent_hole_delay( xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), nullstartblock((int)newlen)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); @@ -2842,7 +2844,8 @@ xfs_bmap_add_extent_hole_delay( temp = new->br_blockcount + right.br_blockcount; oldlen = startblockval(new->br_startblock) + startblockval(right.br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), new->br_startoff, nullstartblock((int)newlen), temp, right.br_state); -- cgit v1.2.3 From 75d65361cf3c0dae2af970c305e19c727b28a510 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 13 Feb 2017 22:48:30 -0800 Subject: xfs: split indlen reservations fairly when under reserved Certain workoads that punch holes into speculative preallocation can cause delalloc indirect reservation splits when the delalloc extent is split in two. If further splits occur, an already short-handed extent can be split into two in a manner that leaves zero indirect blocks for one of the two new extents. This occurs because the shortage is large enough that the xfs_bmap_split_indlen() algorithm completely drains the requested indlen of one of the extents before it honors the existing reservation. This ultimately results in a warning from xfs_bmap_del_extent(). This has been observed during file copies of large, sparse files using 'cp --sparse=always.' To avoid this problem, update xfs_bmap_split_indlen() to explicitly apply the reservation shortage fairly between both extents. This smooths out the overall indlen shortage and defers the situation where we end up with a delalloc extent with zero indlen reservation to extreme circumstances. Reported-by: Patrick Dung Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 61 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 73c95466c225..2ae55db8c977 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4795,34 +4795,59 @@ xfs_bmap_split_indlen( xfs_filblks_t len2 = *indlen2; xfs_filblks_t nres = len1 + len2; /* new total res. */ xfs_filblks_t stolen = 0; + xfs_filblks_t resfactor; /* * Steal as many blocks as we can to try and satisfy the worst case * indlen for both new extents. */ - while (nres > ores && avail) { - nres--; - avail--; - stolen++; - } + if (ores < nres && avail) + stolen = XFS_FILBLKS_MIN(nres - ores, avail); + ores += stolen; + + /* nothing else to do if we've satisfied the new reservation */ + if (ores >= nres) + return stolen; + + /* + * We can't meet the total required reservation for the two extents. + * Calculate the percent of the overall shortage between both extents + * and apply this percentage to each of the requested indlen values. + * This distributes the shortage fairly and reduces the chances that one + * of the two extents is left with nothing when extents are repeatedly + * split. + */ + resfactor = (ores * 100); + do_div(resfactor, nres); + len1 *= resfactor; + do_div(len1, 100); + len2 *= resfactor; + do_div(len2, 100); + ASSERT(len1 + len2 <= ores); + ASSERT(len1 < *indlen1 && len2 < *indlen2); /* - * The only blocks available are those reserved for the original - * extent and what we can steal from the extent being removed. - * If this still isn't enough to satisfy the combined - * requirements for the two new extents, skim blocks off of each - * of the new reservations until they match what is available. + * Hand out the remainder to each extent. If one of the two reservations + * is zero, we want to make sure that one gets a block first. The loop + * below starts with len1, so hand len2 a block right off the bat if it + * is zero. */ - while (nres > ores) { - if (len1) { - len1--; - nres--; + ores -= (len1 + len2); + ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores); + if (ores && !len2 && *indlen2) { + len2++; + ores--; + } + while (ores) { + if (len1 < *indlen1) { + len1++; + ores--; } - if (nres == ores) + if (!ores) break; - if (len2) { - len2--; - nres--; + if (len2 < *indlen2) { + len2++; + ores--; } } -- cgit v1.2.3 From 93aaead52a9eebdc20dc8fa673c350e592a06949 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 13 Feb 2017 22:52:27 -0800 Subject: xfs: fix uninitialized variable in _reflink_convert_cow Fix an uninitialize variable. Reported-by: Dan Carpenter Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_reflink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 2f3d2d5c2934..da6d08fb359c 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -362,7 +362,7 @@ xfs_reflink_convert_cow( xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_extnum_t idx; bool found; - int error; + int error = 0; xfs_ilock(ip, XFS_ILOCK_EXCL); -- cgit v1.2.3 From 353fe445f5efd58dfec6e06bf4747a9c28374adc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Feb 2017 18:16:50 -0800 Subject: xfs: fix len comparison in xfs_extent_busy_trim The length is now passed by reference, so the assertion has to be updated to match the other changes, as pointed out by this W=1 warning: fs/xfs/xfs_extent_busy.c: In function 'xfs_extent_busy_trim': fs/xfs/xfs_extent_busy.c:356:13: error: ordered comparison of pointer with integer zero [-Werror=extra] Fixes: ebf55872616c ("xfs: improve handling of busy extents in the low-level allocator") Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_extent_busy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index ab062610234e..77760dbf0242 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -353,7 +353,7 @@ xfs_extent_busy_trim( struct rb_node *rbp; bool ret = false; - ASSERT(len > 0); + ASSERT(*len > 0); spin_lock(&args->pag->pagb_lock); restart: -- cgit v1.2.3 From 48af96ab92bc68fb645068b978ce36df2379e076 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 15 Feb 2017 10:18:10 -0800 Subject: xfs: don't reserve blocks for right shift transactions The block reservation for the transaction allocated in xfs_shift_file_space() is an artifact of the original collapse range support. It exists to handle the case where a collapse range occurs, the initial extent is left shifted into a location that forms a contiguous boundary with the previous extent and thus the extents are merged. This code was subsequently refactored and reused for insert range (right shift) support. If an insert range occurs under low free space conditions, the extent at the starting offset is split before the first shift transaction is allocated. If the block reservation fails, this leaves separate, but contiguous extents around in the inode. While not a fatal problem, this is unexpected and will flag a warning on subsequent insert range operations on the inode. This problem has been reproduce intermittently by generic/270 running against a ramdisk device. Since right shift does not create new extent boundaries in the inode, a block reservation for extent merge is unnecessary. Update xfs_shift_file_space() to conditionally reserve fs blocks for left shift transactions only. This avoids the warning reproduced by generic/270. Reported-by: Ross Zwisler Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 7c3bfafffba8..6be5f26783a1 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1385,10 +1385,16 @@ xfs_shift_file_space( xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; + uint resblks; ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); if (direction == SHIFT_LEFT) { + /* + * Reserve blocks to cover potential extent merges after left + * shift operations. + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); next_fsb = XFS_B_TO_FSB(mp, offset + len); stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); } else { @@ -1396,6 +1402,7 @@ xfs_shift_file_space( * If right shift, delegate the work of initialization of * next_fsb to xfs_bmap_shift_extent as it has ilock held. */ + resblks = 0; next_fsb = NULLFSBLOCK; stop_fsb = XFS_B_TO_FSB(mp, offset); } @@ -1437,21 +1444,14 @@ xfs_shift_file_space( } while (!error && !done) { - /* - * We would need to reserve permanent block for transaction. - * This will come into picture when after shifting extent into - * hole we found that adjacent extents can be merged which - * may lead to freeing of a block during record update. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, + &tp); if (error) break; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, - ip->i_gdquot, ip->i_pdquot, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, + ip->i_gdquot, ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS); if (error) goto out_trans_cancel; -- cgit v1.2.3 From 8ee9fdbebc84b39f1d1c201c5e32277c61d034aa Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Thu, 16 Feb 2017 17:12:16 -0800 Subject: xfs: Use xfs_icluster_size_fsb() to calculate inode chunk alignment On a ppc64 system, executing generic/256 test with 32k block size gives the following call trace, XFS: Assertion failed: args->maxlen > 0, file: /root/repos/linux/fs/xfs/libxfs/xfs_alloc.c, line: 2026 kernel BUG at /root/repos/linux/fs/xfs/xfs_message.c:113! Oops: Exception in kernel mode, sig: 5 [#1] SMP NR_CPUS=2048 DEBUG_PAGEALLOC NUMA pSeries Modules linked in: CPU: 2 PID: 19361 Comm: mkdir Not tainted 4.10.0-rc5 #58 task: c000000102606d80 task.stack: c0000001026b8000 NIP: c0000000004ef798 LR: c0000000004ef798 CTR: c00000000082b290 REGS: c0000001026bb090 TRAP: 0700 Not tainted (4.10.0-rc5) MSR: 8000000000029032 CR: 28004428 XER: 00000000 CFAR: c0000000004ef180 SOFTE: 1 GPR00: c0000000004ef798 c0000001026bb310 c000000001157300 ffffffffffffffea GPR04: 000000000000000a c0000001026bb130 0000000000000000 ffffffffffffffc0 GPR08: 00000000000000d1 0000000000000021 00000000ffffffd1 c000000000dd4990 GPR12: 0000000022004444 c00000000fe00800 0000000020000000 0000000000000000 GPR16: 0000000000000000 0000000043a606fc 0000000043a76c08 0000000043a1b3d0 GPR20: 000001002a35cd60 c0000001026bbb80 0000000000000000 0000000000000001 GPR24: 0000000000000240 0000000000000004 c00000062dc55000 0000000000000000 GPR28: 0000000000000004 c00000062ecd9200 0000000000000000 c0000001026bb6c0 NIP [c0000000004ef798] .assfail+0x28/0x30 LR [c0000000004ef798] .assfail+0x28/0x30 Call Trace: [c0000001026bb310] [c0000000004ef798] .assfail+0x28/0x30 (unreliable) [c0000001026bb380] [c000000000455d74] .xfs_alloc_space_available+0x194/0x1b0 [c0000001026bb410] [c00000000045b914] .xfs_alloc_fix_freelist+0x144/0x480 [c0000001026bb580] [c00000000045c368] .xfs_alloc_vextent+0x698/0xa90 [c0000001026bb650] [c0000000004a6200] .xfs_ialloc_ag_alloc+0x170/0x820 [c0000001026bb7c0] [c0000000004a9098] .xfs_dialloc+0x158/0x320 [c0000001026bb8a0] [c0000000004e628c] .xfs_ialloc+0x7c/0x610 [c0000001026bb990] [c0000000004e8138] .xfs_dir_ialloc+0xa8/0x2f0 [c0000001026bbaa0] [c0000000004e8814] .xfs_create+0x494/0x790 [c0000001026bbbf0] [c0000000004e5ebc] .xfs_generic_create+0x2bc/0x410 [c0000001026bbce0] [c0000000002b4a34] .vfs_mkdir+0x154/0x230 [c0000001026bbd70] [c0000000002bc444] .SyS_mkdirat+0x94/0x120 [c0000001026bbe30] [c00000000000b760] system_call+0x38/0xfc Instruction dump: 4e800020 60000000 7c0802a6 7c862378 3c82ffca 7ca72b78 38841c18 7c651b78 38600000 f8010010 f821ff91 4bfff94d <0fe00000> 60000000 7c0802a6 7c892378 When block size is larger than inode cluster size, the call to XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size) returns 0. Also, mkfs.xfs would have set xfs_sb->sb_inoalignmt to 0. This causes xfs_ialloc_cluster_alignment() to return 0. Due to this args.minalignslop (in xfs_ialloc_ag_alloc()) gets the unsigned equivalent of -1 assigned to it. This later causes alloc_len in xfs_alloc_space_available() to have a value of 0. In such a scenario when args.total is also 0, the assert statement "ASSERT(args->maxlen > 0);" fails. This commit fixes the bug by replacing the call to XFS_B_TO_FSBT() in xfs_ialloc_cluster_alignment() with a call to xfs_icluster_size_fsb(). Suggested-by: Darrick J. Wong Signed-off-by: Chandan Rajendra Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index f272abff11e1..d41ade5d293e 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -51,8 +51,7 @@ xfs_ialloc_cluster_alignment( struct xfs_mount *mp) { if (xfs_sb_version_hasalign(&mp->m_sb) && - mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp)) return mp->m_sb.sb_inoalignmt; return 1; } -- cgit v1.2.3 From 410d17f67e583559be3a922f8b6cc336331893f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Feb 2017 17:12:51 -0800 Subject: xfs: tune down agno asserts in the bmap code In various places we currently assert that xfs_bmap_btalloc allocates from the same as the firstblock value passed in, unless it's either NULLAGNO or the dop_low flag is set. But the reflink code does not fully follow this convention as it passes in firstblock purely as a hint for the allocator without actually having previous allocations in the transaction, and without having a minleft check on the current AG, leading to the assert firing on a very full and heavily used file system. As even the reflink code only allocates from equal or higher AGs for now we can simply the check to always allow for equal or higher AGs. Note that we need to eventually split the two meanings of the firstblock value. At that point we can also allow the reflink code to allocate from any AG instead of limiting it in any way. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2ae55db8c977..a9c66d47757a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -798,9 +798,7 @@ try_another_ag: */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(*firstblock == NULLFSBLOCK || - args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || - (dfops->dop_low && - args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); + args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock)); *firstblock = cur->bc_private.b.firstblock = args.fsbno; cur->bc_private.b.allocated++; ip->i_d.di_nblocks++; @@ -3822,17 +3820,13 @@ xfs_bmap_btalloc( * the first block that was allocated. */ ASSERT(*ap->firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *ap->firstblock) == - XFS_FSB_TO_AGNO(mp, args.fsbno) || - (ap->dfops->dop_low && - XFS_FSB_TO_AGNO(mp, *ap->firstblock) < - XFS_FSB_TO_AGNO(mp, args.fsbno))); + XFS_FSB_TO_AGNO(mp, *ap->firstblock) <= + XFS_FSB_TO_AGNO(mp, args.fsbno)); ap->blkno = args.fsbno; if (*ap->firstblock == NULLFSBLOCK) *ap->firstblock = args.fsbno; - ASSERT(nullfb || fb_agno == args.agno || - (ap->dfops->dop_low && fb_agno < args.agno)); + ASSERT(nullfb || fb_agno <= args.agno); ap->length = args.len; if (!(ap->flags & XFS_BMAPI_COWFORK)) ap->ip->i_d.di_nblocks += args.len; @@ -4754,13 +4748,9 @@ error0: if (bma.cur) { if (!error) { ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, *firstblock) <= XFS_FSB_TO_AGNO(mp, - bma.cur->bc_private.b.firstblock) || - (dfops->dop_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, - bma.cur->bc_private.b.firstblock))); + bma.cur->bc_private.b.firstblock)); *firstblock = bma.cur->bc_private.b.firstblock; } xfs_btree_del_cursor(bma.cur, -- cgit v1.2.3 From 089ec2f87578b9740f0c27bcea9cc6be59c1ddb0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Feb 2017 08:21:06 -0800 Subject: xfs: simplify xfs_rtallocate_extent We can deduce the allocation type from the bno argument, and do the return without prod much simpler internally. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong [darrick: fix the macro for the non-rt build] Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 13 ++++--------- fs/xfs/xfs_rtalloc.c | 24 ++++++++---------------- fs/xfs/xfs_rtalloc.h | 3 +-- 3 files changed, 13 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 6be5f26783a1..8b75dcea5966 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -88,7 +88,6 @@ int xfs_bmap_rtalloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { - xfs_alloctype_t atype = 0; /* type for allocation routines */ int error; /* error return value */ xfs_mount_t *mp; /* mount point structure */ xfs_extlen_t prod = 0; /* product factor for allocators */ @@ -155,18 +154,14 @@ xfs_bmap_rtalloc( /* * Realtime allocation, done through xfs_rtallocate_extent. */ - atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; do_div(ap->blkno, mp->m_sb.sb_rextsize); rtb = ap->blkno; ap->length = ralen; - if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, - &ralen, atype, ap->wasdel, prod, &rtb))) - return error; - if (rtb == NULLFSBLOCK && prod > 1 && - (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, - ap->length, &ralen, atype, - ap->wasdel, 1, &rtb))) + error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, + &ralen, ap->wasdel, prod, &rtb); + if (error) return error; + ap->blkno = rtb; if (ap->blkno != NULLFSBLOCK) { ap->blkno *= mp->m_sb.sb_rextsize; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 802bcc326d9f..c57aa7f18087 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1093,7 +1093,6 @@ xfs_rtallocate_extent( xfs_extlen_t minlen, /* minimum length to allocate */ xfs_extlen_t maxlen, /* maximum length to allocate */ xfs_extlen_t *len, /* out: actual length allocated */ - xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */ int wasdel, /* was a delayed allocation extent */ xfs_extlen_t prod, /* extent product factor */ xfs_rtblock_t *rtblock) /* out: start block allocated */ @@ -1123,27 +1122,16 @@ xfs_rtallocate_extent( } } +retry: sumbp = NULL; - /* - * Allocate by size, or near another block, or exactly at some block. - */ - switch (type) { - case XFS_ALLOCTYPE_ANY_AG: + if (bno == 0) { error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len, &sumbp, &sb, prod, &r); - break; - case XFS_ALLOCTYPE_NEAR_BNO: + } else { error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen, len, &sumbp, &sb, prod, &r); - break; - case XFS_ALLOCTYPE_THIS_BNO: - error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, - len, &sumbp, &sb, prod, &r); - break; - default: - error = -EIO; - ASSERT(0); } + if (error) return error; @@ -1158,7 +1146,11 @@ xfs_rtallocate_extent( xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen); else xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen); + } else if (prod > 1) { + prod = 1; + goto retry; } + *rtblock = r; return 0; } diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 355dd9e1cb64..51dd3c726608 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -40,7 +40,6 @@ xfs_rtallocate_extent( xfs_extlen_t minlen, /* minimum length to allocate */ xfs_extlen_t maxlen, /* maximum length to allocate */ xfs_extlen_t *len, /* out: actual length allocated */ - xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */ int wasdel, /* was a delayed allocation extent */ xfs_extlen_t prod, /* extent product factor */ xfs_rtblock_t *rtblock); /* out: start block allocated */ @@ -122,7 +121,7 @@ int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, #else -# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS) +# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) -- cgit v1.2.3 From 8d242e932fb7660c24b3a534197e69c241067e0d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Feb 2017 08:21:15 -0800 Subject: xfs: remove XFS_ALLOCTYPE_ANY_AG and XFS_ALLOCTYPE_START_AG XFS_ALLOCTYPE_ANY_AG was only used for the RT allocator and is unused now, and XFS_ALLOCTYPE_START_AG has been unused for a while. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 16 ++-------------- fs/xfs/libxfs/xfs_alloc.h | 4 ---- 2 files changed, 2 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index fe98fbc4adf1..369adcc18c02 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2664,21 +2664,11 @@ xfs_alloc_vextent( args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); args->type = XFS_ALLOCTYPE_NEAR_BNO; /* FALLTHROUGH */ - case XFS_ALLOCTYPE_ANY_AG: - case XFS_ALLOCTYPE_START_AG: case XFS_ALLOCTYPE_FIRST_AG: /* * Rotate through the allocation groups looking for a winner. */ - if (type == XFS_ALLOCTYPE_ANY_AG) { - /* - * Start with the last place we left off. - */ - args->agno = sagno = (mp->m_agfrotor / rotorstep) % - mp->m_sb.sb_agcount; - args->type = XFS_ALLOCTYPE_THIS_AG; - flags = XFS_ALLOC_FLAG_TRYLOCK; - } else if (type == XFS_ALLOCTYPE_FIRST_AG) { + if (type == XFS_ALLOCTYPE_FIRST_AG) { /* * Start with allocation group given by bno. */ @@ -2687,8 +2677,6 @@ xfs_alloc_vextent( sagno = 0; flags = 0; } else { - if (type == XFS_ALLOCTYPE_START_AG) - args->type = XFS_ALLOCTYPE_THIS_AG; /* * Start with the given allocation group. */ @@ -2756,7 +2744,7 @@ xfs_alloc_vextent( } xfs_perag_put(args->pag); } - if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { + if (bump_rotor) { if (args->agno == sagno) mp->m_agfrotor = (mp->m_agfrotor + 1) % (mp->m_sb.sb_agcount * rotorstep); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 1d0f48a501a3..2a8d0fa6fbbe 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -29,9 +29,7 @@ extern struct workqueue_struct *xfs_alloc_wq; /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. */ -#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */ #define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */ -#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */ #define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */ #define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */ #define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */ @@ -41,9 +39,7 @@ extern struct workqueue_struct *xfs_alloc_wq; typedef unsigned int xfs_alloctype_t; #define XFS_ALLOC_TYPES \ - { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \ { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \ - { XFS_ALLOCTYPE_START_AG, "START_AG" }, \ { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \ { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \ { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \ -- cgit v1.2.3